In [2]:
import json, re
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qsl, urlencode, urlunparse
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import os





In [3]:
# Load the crawl seed list: a header-less, tab-separated text file whose first
# column is iterated later as the team-page URLs.
hrefs = pd.read_csv(
    '../../data_files/scraped_raw/hrefs.txt',
    sep='\t',
    header=None,
)

print(hrefs)

                                              0
0  https://sofifa.com/team/131439/san-diego-fc/


In [4]:
# Discover the roster-snapshot paths for every team page listed in `hrefs`.
# For each team URL: fetch the page through the scraping API, read the season
# options from the `select-version` dropdown, fetch each season page, and collect
# the query-stripped option values of its `select-roster` dropdown.
base_url = 'https://sofifa.com'

session = requests.Session()

# The scraping-API key is a credential, so it is read from the environment
# instead of being committed in the notebook. Any key that was previously
# committed here should be considered leaked and rotated.
API = os.environ.get('SCRAPING_API_KEY')
if not API:
    raise RuntimeError("Set the SCRAPING_API_KEY environment variable before running this cell.")

all_roster_urls = []


def _fetch_soup(target_url, click_selector):
    """Fetch ``target_url`` through the scraping API and parse the returned HTML.

    The request carries a ``js_scenario`` that clicks ``click_selector`` and then
    waits 1000 (presumably milliseconds) before the page is returned.

    Returns the parsed ``BeautifulSoup`` document, or ``None`` (after printing the
    reason) when the request raises or the API answers with a non-200 status.
    """
    scenario = json.dumps({'steps': [{'click': click_selector}, {'wait': 1000}]})
    try:
        resp = session.get(
            "https://scraping.narf.ai/api/v1/",
            params={'api_key': API, 'url': target_url, 'js_scenario': scenario},
            # Without a timeout a stalled connection would hang the cell forever.
            timeout=(10, 120),
        )
    except requests.RequestException as exc:
        print(f"Request failed for {target_url}: {exc}")
        return None
    if resp.status_code != 200:
        print(f"Bad response: {resp.status_code} for {target_url}")
        return None
    return BeautifulSoup(resp.text, 'html.parser')


for url in hrefs.iloc[:, 0]:
    print(f"Processing URL: {url}")

    soup = _fetch_soup(url, "select[id='select-version']")
    select_season = soup.find('select', {'id': 'select-version'}) if soup is not None else None
    if select_season is None:
        print(f"No season dropdown found for url {url}")
        continue

    # Remember the list length so the summary below reports this team's count,
    # not the running total across all teams.
    found_before = len(all_roster_urls)

    for season_option in select_season.find_all('option'):
        season_href = season_option.get('value')
        if not season_href:
            continue

        season_url = urljoin(base_url, season_href)
        roster_soup = _fetch_soup(season_url, "select[name='roster']")
        if roster_soup is None:
            continue

        select_roster = roster_soup.find('select', {'id': 'select-roster'})
        if select_roster is None:
            continue

        for option in select_roster.find_all('option'):
            value = option.get("value")
            if not value:
                # An <option> without a value has no roster path to record.
                continue
            # Keep only the path part; any query string is dropped.
            all_roster_urls.append(value.split('?')[0])

    print(f"Found {len(all_roster_urls) - found_before} roster URLs for {url}")

Processing URL: https://sofifa.com/team/131439/san-diego-fc/
Found 10 roster URLs for https://sofifa.com/team/131439/san-diego-fc/


In [5]:
# Wrap the collected roster paths in a one-column DataFrame named "url".
# Note: this rebinds `all_roster_urls` from the list built above to a DataFrame.
all_roster_urls = pd.DataFrame(all_roster_urls, columns=["url"])


In [None]:
# Persist the roster paths to CSV without the index column and without a header row.
all_roster_urls.to_csv('../../data_files/scraped_raw/all_roster_urls.csv', index=False, header=False)

In [8]:
# Print the roster-path table for a quick visual check.
print(all_roster_urls)

                    url
0  /team/131439/260010/
1  /team/131439/260009/
2  /team/131439/260008/
3  /team/131439/260007/
4  /team/131439/260006/
5  /team/131439/260005/
6  /team/131439/260004/
7  /team/131439/260003/
8  /team/131439/260002/
9  /team/131439/260001/


In [None]:
# Reload the roster paths from the header-less CSV, naming the single column "url".
# This rebinds `all_roster_urls` — presumably so this cell can run without
# repeating the discovery step; confirm against how the notebook is used.
all_roster_urls = pd.read_csv('../../data_files/scraped_raw/all_roster_urls.csv', header=None, names=["url"])

# Column codes requested from each roster page. Every entry is sent as its own
# repeated `showCol[]` query parameter by add_columns_to_url().
COLS = [
    "pi", "ae", "hi", "wi", "pf", "oa", "bo", "bp", "vl", "wg",
    "ta", "cr", "fi", "he", "sh", "vo", "ts", "dr", "cu", "fr",
    "lo", "bl", "to", "ac", "sp", "ag", "re", "ba", "tp", "so",
    "ju", "st", "ln", "te", "ar", "in", "po", "vi", "pe", "cm",
    "td", "ma", "sa", "sl", "tg", "gd", "gh", "gc", "gp", "gr",
]
    
def safe_name(s: str) -> str:
    """Turn arbitrary text into a dash-separated token usable in a file name.

    Each run of the characters backslash, slash, colon, asterisk, question
    mark, double quote, angle brackets or pipe becomes a single ``-``; each run
    of whitespace then becomes a single ``-``; finally leading and trailing
    dashes are removed. Dashes produced by adjacent runs are not merged.
    """
    without_reserved = re.sub(r'[\\/:*?"<>|]+', '-', s)
    dashed = re.sub(r'\s+', '-', without_reserved)
    return dashed.strip('-')

def add_columns_to_url(u: str, cols) -> str:
    """Return ``u`` with one ``showCol[]=<code>`` query parameter per entry of ``cols``.

    Query parameters already present on ``u`` (blank values included) are kept
    and placed before the appended ones; the whole query string is re-encoded.
    """
    parts = urlparse(u)
    query_pairs = parse_qsl(parts.query, keep_blank_values=True)
    # The same key is repeated once per requested column.
    query_pairs.extend(("showCol[]", code) for code in cols)
    new_query = urlencode(query_pairs, doseq=True)
    return urlunparse(parts._replace(query=new_query))

# Scrape every roster snapshot listed in `all_roster_urls` and write one CSV per
# (team, roster date) into OUTPUT_DIR.

# Session that retries failed GETs (including 429/5xx responses) with backoff.
session = requests.Session()
retry = Retry(total=4, backoff_factor=0.7, status_forcelist=[429, 500, 502, 503, 504], allowed_methods=["GET"])
session.mount("https://", HTTPAdapter(max_retries=retry))

# The scraping-API key is a credential, so it is read from the environment
# instead of being committed in the notebook. Any key that was previously
# committed here should be considered leaked and rotated.
API = os.environ.get('SCRAPING_API_KEY')
if not API:
    raise RuntimeError("Set the SCRAPING_API_KEY environment variable before running this cell.")
base_url = 'https://sofifa.com'

OUTPUT_DIR = "../../data_files/scraped_raw/players"
# Create the output directory on a first run so the listing below cannot fail.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# File names already on disk; matching rosters are skipped instead of rewritten.
existing_files = set(os.listdir(OUTPUT_DIR))

for rel in all_roster_urls['url']:
    base = urljoin(base_url, rel)
    scrape_url = add_columns_to_url(base, COLS)

    print(f"Scraping URL: {scrape_url}")

    try:
        final_response = session.get(
            "https://scraping.narf.ai/api/v1/",
            params={
                'api_key': API,
                'url': scrape_url
            },
            timeout=(10, 60)
        )
    except requests.RequestException as exc:
        # Timeouts and exhausted retries surface as exceptions; skip this roster
        # rather than aborting the whole run.
        print(f"Request failed for {scrape_url}: {exc}")
        continue
    if final_response.status_code != 200 or not final_response.content:
        print("Bad response:", final_response.status_code, "for", scrape_url)
        continue

    final_soup = BeautifulSoup(final_response.content, 'html.parser')

    # The roster date is the text of the selected option of the roster dropdown;
    # it stays empty when the dropdown or a selected option is missing.
    date = ""
    date_select = final_soup.find('select', id='select-roster')
    if date_select:
        sel = date_select.find('option', selected=True)
        if sel:
            date = sel.get_text(strip=True)

    date = safe_name(date)

    # The team name comes from the page's first <h1>; a page without one (for
    # example an error page) is skipped instead of raising AttributeError.
    heading = final_soup.find('h1')
    if heading is None:
        print(f"No team heading found for: {scrape_url}")
        continue
    team = heading.get_text()
    # "/" is replaced as well, otherwise it would add a directory component to
    # the output path.
    safe_team = team.replace(" ", "-").replace("/", "-").replace("\\", "-").replace(":", "-").strip()

    filename = f"{safe_team}-{date}.csv"

    if filename in existing_files:
        print(f"[skip] Already scraped: {filename}")
        continue

    file_path = os.path.join(OUTPUT_DIR, filename)

    table = final_soup.find('table')
    if table is None:
        print(f'No table found for: {scrape_url}')
        continue

    rows = table.find_all('tr')
    if not rows or not rows[0].find_all('th'):
        print(f'no headers found: {scrape_url}')
        continue

    headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]
    data = []

    # Keep only body rows whose cell count matches the header row.
    for table_row in rows[1:]:
        cells = [td.get_text(strip=True) for td in table_row.find_all('td')]
        if cells and len(cells) == len(headers):
            data.append(cells)

    if not data:
        print(f"No data rows found for: {scrape_url}")
        continue

    df = pd.DataFrame(data, columns=headers)
    df['date'] = date
    df['team'] = team

    df.to_csv(file_path, index=False)
    # Record the new file so a later roster with the same name in this run is skipped.
    existing_files.add(filename)

Scraping URL: https://sofifa.com/team/131439/260010/?showCol%5B%5D=pi&showCol%5B%5D=ae&showCol%5B%5D=hi&showCol%5B%5D=wi&showCol%5B%5D=pf&showCol%5B%5D=oa&showCol%5B%5D=bo&showCol%5B%5D=bp&showCol%5B%5D=vl&showCol%5B%5D=wg&showCol%5B%5D=ta&showCol%5B%5D=cr&showCol%5B%5D=fi&showCol%5B%5D=he&showCol%5B%5D=sh&showCol%5B%5D=vo&showCol%5B%5D=ts&showCol%5B%5D=dr&showCol%5B%5D=cu&showCol%5B%5D=fr&showCol%5B%5D=lo&showCol%5B%5D=bl&showCol%5B%5D=to&showCol%5B%5D=ac&showCol%5B%5D=sp&showCol%5B%5D=ag&showCol%5B%5D=re&showCol%5B%5D=ba&showCol%5B%5D=tp&showCol%5B%5D=so&showCol%5B%5D=ju&showCol%5B%5D=st&showCol%5B%5D=ln&showCol%5B%5D=te&showCol%5B%5D=ar&showCol%5B%5D=in&showCol%5B%5D=po&showCol%5B%5D=vi&showCol%5B%5D=pe&showCol%5B%5D=cm&showCol%5B%5D=td&showCol%5B%5D=ma&showCol%5B%5D=sa&showCol%5B%5D=sl&showCol%5B%5D=tg&showCol%5B%5D=gd&showCol%5B%5D=gh&showCol%5B%5D=gc&showCol%5B%5D=gp&showCol%5B%5D=gr
Scraping URL: https://sofifa.com/team/131439/260009/?showCol%5B%5D=pi&showCol%5B%5D=ae&showCol%5B

In [None]:
# NOTE: a disabled earlier draft of the scraper used to live in this cell, wrapped
# in a bare triple-quoted string. It was removed because:
#   * it was dead code (a string expression, never executed as a scraper),
#   * it embedded a copy of the scraping-API key, and
#   * a cell ending in a bare string echoes that whole string as cell output.
# What the draft did, for reference: for every season and roster option it loaded
# the roster page with a `js_scenario` that opened the column picker
# (`div.choices[data-type='select-multiple']`) and clicked each column code one by
# one, then parsed the first <table> and wrote
# `../data/scraping/players/{team}_{date}.csv`.
# The cell above replaces those clicks with repeated `showCol[]` query parameters.
# Recover the draft from version control if it is ever needed again.