In [8]:
import pandas as pd
import json

with open('games_ncaa/all_games_2021-2023.json', 'r') as file:
    games = json.load(file)

df_games_ncaa = pd.DataFrame(games)

In [9]:
df_games_ncaa

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,2023-08-24,UAB,Northern Ky.,1,1
1,2023-08-24,Lindenwood,DePaul,0,1
2,2023-08-24,Wright St.,Xavier,1,1
3,2023-08-24,Mercer,Cal St. Fullerton,1,6
4,2023-08-24,Canisius,Saint Francis,1,2
...,...,...,...,...,...
5707,2021-12-04,Saint Louis,Washington,0,2
5708,2021-12-04,Clemson,Oregon St.,1,1
5709,2021-12-10,Clemson,Notre Dame,1,1
5710,2021-12-10,Georgetown,Washington,1,2


In [10]:
with open('games_wyscout/all_games.json', 'r') as file:
    games = json.load(file)

df_games_wyscout = pd.DataFrame(games)

In [11]:
df_games_wyscout['match_date'] = pd.to_datetime(df_games_wyscout['match_date']).dt.strftime('%Y-%m-%d')
df_games_wyscout = df_games_wyscout.sort_values(by='match_date')
df_games_wyscout

Unnamed: 0,wyId,match_date,home_team,away_team,home_score,away_score
0,5193390,2021-01-23,Jacksonville Dolphins,Southeastern Fire,0,0
24,5193770,2021-01-23,Oral Roberts Golden Eagles,Oklahoma City Stars,0,0
1,5193391,2021-01-27,Jacksonville Dolphins,Flagler Saints,3,1
615,5200724,2021-01-28,Grand Canyon Lopes,Benedictine University At Mesa Redhawks,9,0
25,5193772,2021-01-29,Oral Roberts Golden Eagles,Northeastern State River Hawks,2,1
...,...,...,...,...,...,...
6412,5510850,2023-11-12,LMU Lions,St. Mary's College of CA Gaels,0,1
5994,5501578,2023-11-12,Stanford Cardinal,Berkeley Golden Bears,2,1
6414,5510852,2023-11-12,Santa Clara Broncos,Pacific Tigers,2,1
6884,5542066,2023-11-12,VCU Rams,Dayton Flyers,1,2


In [12]:
teams_ncaa = pd.concat([df_games_ncaa['home_team'], df_games_ncaa['away_team']]).unique().tolist()
print(teams_ncaa)

['UAB', 'Lindenwood', 'Wright St.', 'Mercer', 'Canisius', 'UC Davis', 'UCLA', 'VMI', 'Rhode Island', 'La Salle', 'Merrimack', 'Northeastern', 'Binghamton', 'St. Thomas (MN)', 'Iona', 'Manhattan', 'Southern Ind.', 'UChicago', 'Navy', 'Creighton', 'Western Mich.', 'LMU (CA)', 'LIU', 'Rider', 'San Diego', 'Kansas City', 'Rutgers', 'Penn St.', 'CSUN', 'Loyola Chicago', 'High Point', 'Niagara', 'Winthrop', 'Bucknell', 'Temple', 'American', 'Grand Canyon', 'UCF', 'Marist', 'FDU', 'Cleveland St.', 'UMBC', "Saint Peter's", 'George Washington', 'Sacred Heart', 'Central Conn. St.', 'Southern Wesleyan', 'Air Force', 'Ga. Southern', 'Providence', 'North Florida', 'Drexel', 'California', 'South Carolina', 'UAlbany', "St. John's (NY)", 'UConn', 'Oakland', 'FGCU', 'Tulsa', 'Georgia St.', 'Duquesne', 'Wofford', 'Loyola Maryland', 'Col. of Charleston', 'Siena', 'Bellarmine', 'Indiana', 'NJIT', 'Omaha', 'California Baptist', 'Memphis', 'Purdue Fort Wayne', 'Detroit Mercy', 'FIU', 'New Hampshire', 'St. B

In [13]:
teams_wyscout = pd.concat([df_games_wyscout['home_team'], df_games_wyscout['away_team']]).unique().tolist()
print(teams_wyscout)

['Jacksonville Dolphins', 'Oral Roberts Golden Eagles', 'Grand Canyon Lopes', 'Davidson Wildcats', 'Charlotte 49ers', 'Portland Pilots', 'FAU Owls', 'Central Arkansas Bears', 'Marshall Thundering Herd', 'Pacific Tigers Hawaii', 'Oregon State Beavers', 'Stetson Hatters', 'Bradley Braves', 'Seattle Redhawks', 'Saint Louis Billikens', 'Marquette Golden Eagles', 'Wright State Raiders', 'Oakland Golden Grizzlies', 'NKU Norse', 'UCLA Bruins', 'Cleveland State Vikings', 'IPFW Mastodons', 'Kentucky Wildcats', 'Butler Bulldogs', 'ETSU Buccaneers', 'Washington Huskies', 'Lipscomb Bisons', 'UNCW Seahawks', 'UCF Knights University', 'USF Bulls', 'Memphis Tigers University', 'HBU Huskies', 'Liberty Flames', 'Dixie State', 'Northeastern State River Hawks', 'VCU Rams', 'UCONN Huskies University', 'UNLV Rebels', 'South Carolina Gamecocks', 'FIU Panthers', "The Master's Mustangs", 'Robert Morris Colonials', 'Detroit Mercy Titans', 'UMASS Lowell River Hawks', 'George Washington Colonials', 'IUPUI Jaguar

In [None]:
'''
Use Leveinstein dist. to find team matchings from NCAA->Wyscout
'''

from fuzzywuzzy import process, fuzz

ncaa_rename_dict = {}

years = ['2021', '2022', '2023']

for year in years:
    # Filter dataframes by year
    df_ncaa_year = df_games_ncaa[df_games_ncaa['date'].str.startswith(year)]
    df_wyscout_year = df_games_wyscout[df_games_wyscout['match_date'].str.startswith(year)]
    
    # Get unique teams for the current year
    teams_ncaa_year = pd.concat([df_ncaa_year['home_team'].explode(), df_ncaa_year['away_team'].explode()]).unique().tolist()
    teams_wyscout_year = pd.concat([df_wyscout_year['home_team'], df_wyscout_year['away_team']]).unique().tolist()
    
    for team in teams_ncaa_year:
        matches = process.extract(team, teams_wyscout_year, limit=3, scorer=fuzz.partial_ratio)
        if team not in ncaa_rename_dict:
            ncaa_rename_dict[team] = []
        ncaa_rename_dict[team].extend([match[0] for match in matches if match[1] > 80])
        if not ncaa_rename_dict[team] and matches:
            ncaa_rename_dict[team].append(matches[0][0])


In [15]:
for key, value in ncaa_rename_dict.items():
    print(f"{key}: {value}")

Northern Ky.: ['Northern Illinois Huskies', 'Northern Illinois Huskies', 'Northern Illinois Huskies']
VMI: ['VMI Keydets', 'VMI Keydets', 'VMI Keydets']
Northeastern: ['Northeastern State River Hawks', 'Northeastern Huskies', 'Northwestern Wildcats', 'Northeastern Huskies', 'Northwestern Wildcats', 'Northeastern Huskies', 'Northwestern Wildcats']
Vermont: ['Vermont Catamouts', 'Vermont Catamouts', 'Vermont Catamouts']
Lafayette: ['Lafayette Leopards', 'Lafayette Leopards', 'Lafayette Leopards']
La Salle: ['La Salle Explorers', 'La Salle Explorers', 'La Salle Explorers']
Purdue Fort Wayne: ['Syracuse Orange']
UMass Lowell: ['UMASS Lowell River Hawks', 'UMASS Lowell River Hawks', 'UMASS Lowell River Hawks']
West Virginia: ['West Virginia Mountaineers', 'West Virginia Tech', 'West Virginia Mountaineers', 'West Virginia Mountaineers']
Furman: ['Furman Paladins', 'Furman Paladins', 'Furman Paladins']
Loyola Chicago: ['Loyola Chicago Ramblers', 'Loyola Chicago Ramblers', 'Loyola Chicago Ramb

In [None]:
# Manual Team matchings

ncaa_rename_dict['UChicago'] = ['Chicago Maroons']
ncaa_rename_dict['LMU (CA)'] = ['Loyola Marymount LMU Lions']
ncaa_rename_dict['Kansas City'] = ['UMKC Kangaroos']
ncaa_rename_dict['Central Conn. St.'] = ['CCSU Blue Devils']
ncaa_rename_dict['Col. of Charleston'] = ['Charleston Cougars', 'Charleston Golden Eagles']
ncaa_rename_dict['Purdue Fort Wayne'] = ['Purdue Fort Wayne Mastodons', 'IPFW Mastodons']
ncaa_rename_dict['Western Ill.'] = ['Western Illinois Leathernecks']
ncaa_rename_dict['Fla. Atlantic'] = ['Florida Atlantic Owls', 'FAU Owls']
ncaa_rename_dict['South Fla'] = ['South Florida Bulls', 'USF Bulls']
ncaa_rename_dict['Houston Christian'] = ['HCU Huskies']
ncaa_rename_dict['James Madison'] = ['JMU Dukes']
ncaa_rename_dict['NIU'] = ['Northern Illinois Huskies']
ncaa_rename_dict['Massachusetts'] = ['UMass Minutemen', 'UMASS Lowell River Hawks']
ncaa_rename_dict['SUNY Maritime'] = ['Maritime Privateers']
ncaa_rename_dict['Northern Ky.'] = ['Northern Kentucky Norse', 'NKU Norse']
ncaa_rename_dict['Duke'] = ['Duke Blue Devils', 'Duke Blue Devils University', 'JMU Dukes', 'Duquesne Dukes']



In [None]:
# GPT Generated Pairings from NCAA to Wyscout (2023)

gpt_generated_pairings = {
    "Adrian": "Adrian Bulldogs",
    "Air Force": "Air Force Falcons",
    "Akron": "Akron Zips",
    "American": "American Eagles",
    "Army West Point": "Army West Point Black Knights",
    "Assumption": "Assumption Greyhounds",
    "Aurora": "Aurora Spartans",
    "Averett": "Averett Cougars",
    "Bellarmine": "Bellarmine Knights",
    "Belmont": "Belmont Bruins",
    "Belmont Abbey": "Belmont Abbey Crusaders",
    "Binghamton": "Binghamton BearCats",
    "Blackburn": "(no pairing found)",
    "Boston College": "Boston College Eagles",
    "Boston U.": "Boston Terriers",
    "Bowling Green": "Bowling Green Falcons",
    "Bradley": "Bradley Braves",
    "Brandeis": "(no pairing found)",
    "Brown": "Brown Bears",
    "Bryant": "Bryant Bulldogs",
    "Bucknell": "Bucknell Bison",
    "Butler": "Butler Bulldogs",
    "CSU Bakersfield": "CSU Bakersfield Roadrunners",
    "CSUN": "CSUN Matadors",
    "Cal Poly": "Cal Poly Mustangs",
    "Cal St. Fullerton": "Cal State Fullerton Titans",
    "California": "(no pairing found)",
    "California Baptist": "California Baptist Lancers",
    "Campbell": "Campbell Camels",
    "Canisius": "Canisius Golden Griffins",
    "Central Ark.": "Central Arkansas Bears",
    "Central Conn. St.": "(no pairing found)",
    "Charlotte": "Charlotte 49ers",
    "Chicago St.": "Chicago State Cougars",
    "Clemson": "Clemson Tigers",
    "Cleveland St.": "Cleveland State Vikings",
    "Coastal Carolina": "Coastal Carolina Chanticleers",
    "Col. of Charleston": "Charleston Cougars",
    "Colgate": "Colgate Raiders",
    "Columbia": "Columbia Lions",
    "Cornell": "Cornell Big Red",
    "Creighton": "Creighton Bluejays",
    "Dartmouth": "Dartmouth Big Green",
    "Davidson": "Davidson Wildcats",
    "Dayton": "Dayton Flyers",
    "DePaul": "DePaul Blue Demons",
    "Delaware": "Delaware Blue Hens",
    "Denver": "Denver Pioneers",
    "Detroit Mercy": "Detroit Mercy Titans",
    "Drake": "Drake Bulldogs",
    "Drexel": "Drexel Dragons",
    "Duke": "Duke Blue Devils",
    "Duquesne": "Duquesne Duke",
    "ETSU": "ETSU Buccaneers",
    "Eastern Ill.": "Eastern Illinois Panthers",
    "Elon": "Elon Phoenix",
    "Embry-Riddle (FL)": "Embry Riddle Eagles",
    "Evansville": "Evansville Purple Aces",
    "FDU": "Fairleigh Dickinson Knights",
    "FGCU": "FGCU Eagles",
    "FIU": "FIU Panthers",
    "Fairfield": "Fairfield Stags",
    "Fla. Atlantic": "FAU Owls",
    "Fordham": "Fordham Rams",
    "Fresno Pacific": "Fresno Pacific Sunbird",
    "Furman": "Furman Paladins",
    "Ga. Southern": "Georgia Southern Eagles",
    "Gardner-Webb": "Gardner-Webb Bulldogs",
    "George Mason": "George Mason Patriots",
    "George Washington": "George Washington Colonials",
    "Georgetown": "Georgetown Hoyas",
    "Georgia St.": "Georgia State Panthers",
    "Gonzaga": "Gonzaga Bulldogs",
    "Grand Canyon": "Grand Canyon Lopes",
    "Green Bay": "Green Bay Phoenix",
    "Greensboro": "Greensboro Pride",
    "Hampden-Sydney": "Hampden-Sydney Tigers",
    "Harvard": "Harvard Crimson",
    "High Point": "High Point Panthers",
    "Hofstra": "Hofstra Pride",
    "Holy Cross": "Holy Cross Crusaders",
    "Houston Christian": "(no pairing found)",
    "Howard": "Howard Bison",
    "IU Indy": "(no pairing found)",
    "Indiana": "Indiana Hoosiers",
    "Iona": "Iona Gaels",
    "Jacksonville": "Jacksonville Dolphins",
    "James Madison": "JMU Dukes",
    "Kansas City": "(no pairing found)",
    "Kentucky": "Kentucky Wildcats",
    "LIU": "LIU Sharks",
    "LMU (CA)": "LMU Lions",
    "La Salle": "La Salle Explorers",
    "Lafayette": "Lafayette Leopards",
    "Le Moyne": "Le Moyne Dolphins",
    "Lehigh": "Lehigh Mountain Hawks",
    "Liberty": "Liberty Flames",
    "Lindenwood": "Lindenwood Lions",
    "Lipscomb": "Lipscomb Bisons",
    "Longwood": "Longwood Lancers",
    "Louisville": "Louisville Cardinals",
    "Loyola Chicago": "Loyola Chicago Ramblers",
    "Loyola Maryland": "Loyola Greyhounds",
    "Manhattan": "Manhattan Jaspers",
    "Marist": "Marist Red Foxes",
    "Marquette": "Marquette Golden Eagles",
    "Marshall": "Marshall Thundering Herd",
    "Mary Baldwin": "Mary Baldwin",
    "Maryland": "Maryland College Park Terrapins",
    "Massachusetts": "UMass Minutemen",
    "Memphis": "Memphis Tigers",
    "Mercer": "Mercer Bears",
    "Merrimack": "Merrimack Warriors",
    "Michigan": "Michigan Wolverines",
    "Michigan St.": "Michigan State Spartans",
    "Milwaukee": "Milwaukee Panthers",
    "Missouri St.": "Missouri State Bears",
    "Monmouth": "Monmouth Hawks",
    "Mount St. Mary's": "Mount St. Mary's Mountaineers",
    "NC State": "NC State Wolfpack",
    "NIU": "Northern Illinois Huskies",
    "NJIT": "NJIT Highlanders",
    "Navy": "Navy Midshipmen",
    "New Hampshire": "New Hampshire Wildcats",
    "Niagara": "Niagara Purple Eagles",
    "North Carolina": "North Carolina Heels",
    "North Florida": "North Florida Ospreyes",
    "Northeastern": "Northeastern Huskies",
    "Northern Ky.": "NKU Norse",
    "Northwestern": "Northwestern Wildcats",
    "Notre Dame": "Notre Dame Fighting Irish",
    "Oakland": "Oakland Golden Grizzlies",
    "Oberlin": "Oberlin Yeomen",
    "Ohio St.": "Ohio State Buckeyes",
    "Old Dominion": "Old Dominion Monarchs",
    "Omaha": "Omaha Mavericks",
    "Oral Roberts": "Oral Roberts Golden Eagles",
    "Oregon St.": "Oregon State Beavers",
    "Pacific": "Pacific Tigers",
    "Penn": "Penn Quakers",
    "Penn St.": "Penn State Nittany Lion",
    "Pittsburgh": "Pittsburgh Panthers",
    "Portland": "Portland Pilots",
    "Presbyterian": "Presbyterian BlueHose",
    "Princeton": "Princeton Tigers",
    "Providence": "Providence Friars",
    "Purdue Fort Wayne": "IPFW Mastodons",
    "Queens (NC)": "Queens Royals",
    "Quinnipiac": "Quinnipiac Bobcats",
    "Radford": "Radford Highlanders",
    "Rhode Island": "Rhode Island Rhody",
    "Rider": "Rider Broncs",
    "Robert Morris": "Robert Morris Colonials",
    "Rutgers": "Rutgers Scarlet Knights",
    "SIUE": "SIUE Cougars",
    "SMU": "SMU Mustangs",
    "SUNY Maritime": "(no pairing found)",
    "Sacramento St.": "Sacramento State Hornets",
    "Sacred Heart": "(no pairing found)",
    "Saint Francis": "(no pairing found)",
    "Saint Joseph's": "Saint Joseph's Hawks",
    "Saint Louis": "Saint Louis Billikens",
    "Saint Mary's (CA)": "St. Mary's College of CA Gaels",
    "Saint Peter's": "Saint Peter's Peacocks",
    "San Diego": "San Diego Toreros",
    "San Diego St.": "San Diego State Aztecs",
    "San Francisco": "San Francisco Dons",
    "San Jose St.": "San Jose State Spartans",
    "Santa Clara": "Santa Clara Broncos",
    "Seattle U": "Seattle Redhawks",
    "Seton Hall": "Seton Hall Pirates",
    "Siena": "Siena Saints",
    "South Carolina": "South Carolina Gamecocks",
    "South Fla.": "USF Bulls",
    "Southern Ind.": "Southern Indiana Screaming Eagles",
    "Southern Wesleyan": "Southern Wesleyan Warriors",
    "Southwest Baptist": "Southwest Baptist Bearcats",
    "St. Bonaventure": "St. Bonaventure Bonnies",
    "St. John's (NY)": "St John's Red Storm",
    "St. Joseph's (L.I.)": "Saint Joseph's Golden Eagles",
    "St. Thomas (MN)": "St. Thomas Tommies",
    "Stanford": "Stanford Cardinal",
    "Stetson": "Stetson Hatters",
    "Stonehill": "Stonehill College Skyhawks",
    "Stony Brook": "Stony Brook Seawolves",
    "Suffolk": "Suffolk Rams",
    "Syracuse": "Syracuse Orange",
    "Temple": "Temple Owls",
    "Trine": "Trine Thunder",
    "Trinity (TX)": "Trinity Tigers",
    "Tulsa": "Tulsa Hurricane",
    "UAB": "UAB Blazers",
    "UAlbany": "Albany Great Danes",
    "UC Davis": "UC Davis Aggies",
    "UC Irvine": "UC Irvine Anteaters",
    "UC Riverside": "UC Riverside Highlanders",
    "UCLA": "UCLA Bruins",
    "UMass Lowell": "UMass Lowell River Hawks",
    "UMKC": "UMKC Kangaroos",
    "UNC Asheville": "UNC Asheville Bulldogs",
    "UNC Greensboro": "UNC Greensboro Spartans",
    "UNC Wilmington": "UNC Wilmington Seahawks",
    "USC": "USC Trojans",
    "UTEP": "UTEP Miners",
    "Utah": "Utah Utes",
    "Utah Valley": "Utah Valley Wolverines",
    "Valparaiso": "Valparaiso Crusaders",
    "Vanderbilt": "Vanderbilt Commodores",
    "Vermont": "Vermont Catamounts",
    "Villanova": "Villanova Wildcats",
    "Virginia": "Virginia Cavaliers",
    "Virginia Tech": "Virginia Tech Hokies",
    "Wagner": "Wagner Seahawks",
    "Wake Forest": "Wake Forest Demon Deacons",
    "Washington": "Washington Huskies",
    "Washington St.": "Washington State Cougars",
    "West Virginia": "West Virginia Mountaineers",
    "Wichita St.": "Wichita State Shockers",
    "Wisconsin": "Wisconsin Badgers",
    "Wright St.": "Wright State Raiders",
    "Yale": "Yale Bulldogs"
}

for key, value in gpt_generated_pairings.items():
    if key in ncaa_rename_dict:
        ncaa_rename_dict[key].append(value)

In [18]:
for key, value in ncaa_rename_dict.items():
    print(f"{key}: {value}")

Northern Ky.: ['Northern Kentucky Norse', 'NKU Norse', 'NKU Norse']
VMI: ['VMI Keydets', 'VMI Keydets', 'VMI Keydets']
Northeastern: ['Northeastern State River Hawks', 'Northeastern Huskies', 'Northwestern Wildcats', 'Northeastern Huskies', 'Northwestern Wildcats', 'Northeastern Huskies', 'Northwestern Wildcats', 'Northeastern Huskies']
Vermont: ['Vermont Catamouts', 'Vermont Catamouts', 'Vermont Catamouts', 'Vermont Catamounts']
Lafayette: ['Lafayette Leopards', 'Lafayette Leopards', 'Lafayette Leopards', 'Lafayette Leopards']
La Salle: ['La Salle Explorers', 'La Salle Explorers', 'La Salle Explorers', 'La Salle Explorers']
Purdue Fort Wayne: ['Purdue Fort Wayne Mastodons', 'IPFW Mastodons', 'IPFW Mastodons']
UMass Lowell: ['UMASS Lowell River Hawks', 'UMASS Lowell River Hawks', 'UMASS Lowell River Hawks', 'UMass Lowell River Hawks']
West Virginia: ['West Virginia Mountaineers', 'West Virginia Tech', 'West Virginia Mountaineers', 'West Virginia Mountaineers', 'West Virginia Mountainee

In [None]:
# Rename teams in the NCAA dataframe to lists of all possible matches in wyscout data

df_games_ncaa['home_team'] = df_games_ncaa['home_team'].apply(lambda x: ncaa_rename_dict.get(x, [x]))
df_games_ncaa['away_team'] = df_games_ncaa['away_team'].apply(lambda x: ncaa_rename_dict.get(x, [x]))

In [20]:
df_games_ncaa

Unnamed: 0,date,home_team,away_team,home_score,away_score
0,2023-08-24,"[UAB Blazers, UAB Blazers, UAB Blazers, UAB Bl...","[Northern Kentucky Norse, NKU Norse, NKU Norse]",1,1
1,2023-08-24,"[Lindenwood Lions, Lindenwood Lions, Lindenwoo...","[DePaul Blue Demons, DePaul Blue Demons, DePau...",0,1
2,2023-08-24,"[Wright State Raiders, Wright State Raiders, W...","[Xavier Musketeers, Xavier Musketeers, Xavier ...",1,1
3,2023-08-24,"[Mercer Bears, Mercer Bears, Mercer Bears, Fra...","[Cal State Fullerton Titans, Cal State Fullert...",1,6
4,2023-08-24,"[Canisius Golden Griffins, Canisius Golden Gri...","[Saint Francis Red Flash, Saint Francis Cougar...",1,2
...,...,...,...,...,...
5707,2021-12-04,"[Saint Louis Billikens, Saint Louis Billikens,...","[Washington Huskies, George Washington Colonia...",0,2
5708,2021-12-04,"[Clemson Tigers, Clemson Tigers, Clemson Tiger...","[Oregon State Beavers, Oregon State Beavers, O...",1,1
5709,2021-12-10,"[Clemson Tigers, Clemson Tigers, Clemson Tiger...","[Notre Dame Fighting Irish University, Notre D...",1,1
5710,2021-12-10,"[Georgetown Hoyas, Georgetown Hoyas, Georgetow...","[Washington Huskies, George Washington Colonia...",1,2


In [21]:
df_games_wyscout

Unnamed: 0,wyId,match_date,home_team,away_team,home_score,away_score
0,5193390,2021-01-23,Jacksonville Dolphins,Southeastern Fire,0,0
24,5193770,2021-01-23,Oral Roberts Golden Eagles,Oklahoma City Stars,0,0
1,5193391,2021-01-27,Jacksonville Dolphins,Flagler Saints,3,1
615,5200724,2021-01-28,Grand Canyon Lopes,Benedictine University At Mesa Redhawks,9,0
25,5193772,2021-01-29,Oral Roberts Golden Eagles,Northeastern State River Hawks,2,1
...,...,...,...,...,...,...
6412,5510850,2023-11-12,LMU Lions,St. Mary's College of CA Gaels,0,1
5994,5501578,2023-11-12,Stanford Cardinal,Berkeley Golden Bears,2,1
6414,5510852,2023-11-12,Santa Clara Broncos,Pacific Tigers,2,1
6884,5542066,2023-11-12,VCU Rams,Dayton Flyers,1,2


In [None]:
from datetime import datetime, timedelta

with open('unmatched_games.txt', 'w') as file:
    # Get unique years from the match_date column
    years = df_games_ncaa['date'].apply(lambda x: x.split('-')[0]).unique()
    
    for year in years:
        # Filter dataframes by year
        df_ncaa_year = df_games_ncaa[df_games_ncaa['date'].str.startswith(year)]
        df_wyscout_year = df_games_wyscout[df_games_wyscout['match_date'].str.startswith(year)]
        
        # Loop through each team in the filtered df_games_ncaa
        for team in df_ncaa_year['home_team'].explode().unique():
            # Create separate dataframes for games with the team present in df_ncaa_year and df_wyscout_year
            df_ncaa_team_games = df_ncaa_year[df_ncaa_year['home_team'].apply(lambda x: team in x) | df_ncaa_year['away_team'].apply(lambda x: team in x)]
            df_wyscout_team_games = df_wyscout_year[(df_wyscout_year['home_team'] == team) | (df_wyscout_year['away_team'] == team)]
            
            # Compare their lengths (simple check to see if they have the same amount of games per year)
            if len(df_ncaa_team_games) == len(df_wyscout_team_games):
                print(f"Same length for {team} in {year}")
            else: # If diff length, check each game manually per year
                print(f"Different length for {team} in {year}")
                for idx, wyscout_game in df_wyscout_team_games.iterrows():
                    match_found = False
                    for _, ncaa_game in df_ncaa_team_games.iterrows():
                        if (any(ncaa_home_team in wyscout_game['home_team'] for ncaa_home_team in ncaa_game['home_team']) and
                            any(ncaa_away_team in wyscout_game['away_team'] for ncaa_away_team in ncaa_game['away_team']) and
                            ncaa_game['home_score'] == wyscout_game['home_score'] and
                            ncaa_game['away_score'] == wyscout_game['away_score']) or \
                        (any(ncaa_home_team in wyscout_game['away_team'] for ncaa_home_team in ncaa_game['home_team']) and
                            any(ncaa_away_team in wyscout_game['home_team'] for ncaa_away_team in ncaa_game['away_team']) and
                            ncaa_game['home_score'] == wyscout_game['away_score'] and
                            ncaa_game['away_score'] == wyscout_game['home_score']):
                            match_found = True
                            break
                    if not match_found:
                        # If no match found, check for matches within a 3-day window
                        wyscout_date = datetime.strptime(wyscout_game['match_date'], '%Y-%m-%d')
                        for _, ncaa_game in df_ncaa_team_games.iterrows():
                            ncaa_date = datetime.strptime(ncaa_game['date'], '%Y-%m-%d')
                            if (any(ncaa_home_team in wyscout_game['home_team'] for ncaa_home_team in ncaa_game['home_team']) or
                                any(ncaa_away_team in wyscout_game['away_team'] for ncaa_away_team in ncaa_game['away_team'])) and \
                               abs((wyscout_date - ncaa_date).days) <= 3 and \
                               (ncaa_game['home_score'] == wyscout_game['home_score'] or
                                ncaa_game['away_score'] == wyscout_game['away_score']):
                                match_found = True
                                break
                    if not match_found:
                        print(f"Game not found in NCAA for {team} in {year}: {wyscout_game.to_dict()}")
                        file.write(f"{wyscout_game['wyId']}, {wyscout_game['match_date']}, {wyscout_game['home_team']} vs {wyscout_game['away_team']}\n")

Same length for UAB Blazers in 2023
Same length for Lindenwood Lions in 2023
Different length for Wright State Raiders in 2023
Game not found in NCAA for Wright State Raiders in 2023: {'wyId': 5512550, 'match_date': '2023-08-11', 'home_team': 'Indiana Hoosiers', 'away_team': 'Wright State Raiders', 'home_score': '1', 'away_score': '0'}
Game not found in NCAA for Wright State Raiders in 2023: {'wyId': 5491719, 'match_date': '2023-09-02', 'home_team': 'Wright State Raiders', 'away_team': 'SFU Red Flash', 'home_score': '3', 'away_score': '3'}
Game not found in NCAA for Wright State Raiders in 2023: {'wyId': 5491640, 'match_date': '2023-09-09', 'home_team': 'IUPUI Jaguars', 'away_team': 'Wright State Raiders', 'home_score': '2', 'away_score': '0'}
Game not found in NCAA for Wright State Raiders in 2023: {'wyId': 5540920, 'match_date': '2023-11-05', 'home_team': 'IUPUI Jaguars', 'away_team': 'Wright State Raiders', 'home_score': '1', 'away_score': '0'}
Same length for Mercer Bears in 2023
D