In [17]:
import pandas as pd
import glob
import difflib
import sys
sys.path.append('../../lib/')
import audlutils as au
import numpy as np

In [25]:
user_names = pd.read_csv('../../data/players/username_playername_relation.csv',encoding = "ISO-8859-1")
all_team_names = user_names.Teamname.unique()
user_names['InRel']=True
user_names.head(1)

Unnamed: 0,id,Teamname,Tournament,Username,PlayerName,Year,InRel
0,0,Atlanta Hustle,AUDL 2015,A Olsen,Anders Olsen,2015,True


In [26]:
raw_list=[]
for year in range(2014,2020):
    for raw_file in glob.glob(f'../../data/raw/{year}/*csv'):
        teamname = raw_file.split('_')[-1].split('.')[0]
        teamname = difflib.get_close_matches(teamname, all_team_names)[0]
        temp_df = au.CSV2DataFrame(raw_file)
        temp_df['Year']=year
        temp_df['Tournament']=f'AUDL {year}'
        temp_df['Teamname']=teamname
        raw_list.append(temp_df)

audl = pd.concat(raw_list)
audl.head(1)

Unnamed: 0,Date/Time,Tournamemnt,Opponent,Point Elapsed Seconds,Line,Our Score - End of Point,Their Score - End of Point,Event Type,Action,Passer,...,End Area,End X,End Y,Distance Unit of Measure,Absolute Distance,Lateral Distance,Toward Our Goal Distance,Year,Tournament,Teamname
0,2014-04-27 15:00,AUDL,Rochester Dragons,11,O,1,0,Offense,Catch,Sam T,...,,,,,,,,2014,AUDL 2014,DC Breeze


In [27]:
audl.columns

Index(['Date/Time', 'Tournamemnt', 'Opponent', 'Point Elapsed Seconds', 'Line',
       'Our Score - End of Point', 'Their Score - End of Point', 'Event Type',
       'Action', 'Passer', 'Receiver', 'Defender', 'Hang Time (secs)',
       'Player 0', 'Player 1', 'Player 2', 'Player 3', 'Player 4', 'Player 5',
       'Player 6', 'Player 7', 'Player 8', 'Player 9', 'Player 10',
       'Player 11', 'Player 12', 'Player 13', 'Player 14', 'Player 15',
       'Player 16', 'Player 17', 'Player 18', 'Player 19', 'Player 20',
       'Player 21', 'Player 22', 'Player 23', 'Player 24', 'Player 25',
       'Player 26', 'Player 27', 'Elapsed Time (secs)', 'Begin Area',
       'Begin X', 'Begin Y', 'End Area', 'End X', 'End Y',
       'Distance Unit of Measure', 'Absolute Distance', 'Lateral Distance',
       'Toward Our Goal Distance', 'Year', 'Tournament', 'Teamname'],
      dtype='object')

In [28]:
numbered_player_fields = [f'Player {i}' for i in range(0,28)]
player_fields = ['Passer', 'Receiver', 'Defender'] + numbered_player_fields

In [35]:
ugb = audl.groupby(['Teamname',
                  'Tournament',
                  'Year']).apply(lambda x : pd.DataFrame([{'Username':p} for p in np.unique([ plyr for field in player_fields 
                                                                                                   for plyr in x[field].unique() ])]) )

ugb = ugb.reset_index()
ugb['InRaw'] =True
merged = pd.merge(user_names,
                     ugb,
                     on = ['Teamname',
                              'Tournament',
                              'Year','Username'],
                     how='outer')

merged = merged[~merged.Username.isin(['','Anonymous'])]

In [50]:
actives = pd.read_csv('../../data/teams/active_rosters.csv')
actives.head(1)
all_actives = actives.groupby('Team').apply(lambda x : pd.Series({'AllActives':'; '.join(x.Actives).replace('**','')})).reset_index()
all_actives_dict = pd.Series(all_actives.AllActives.values,index=all_actives.Team.values).to_dict()
all_actives_dict['Atlanta Hustle']

'Sam Batson; Sun Choi; Karl Ekwurtzel; Brett Hulsmeyer; Elijah Jaime; Matthew Knowles; Paul Lally; Will Lindquist; Mac McClellan; Taylor Minch; Javier Ortiz; Player Pierce; Devon Rogers; Matt Smith; Trenton Spinks; Austin Taylor; Alex Trautman; Josh Turner; Kelvin Williams; Joel Wooten; Zach Avello; Josh Bush; Sun Choi; Jason Crowe; Karl Ekwurtzel; Brett Hulsmeyer; Elijah Jaime; Matthew Knowles; Paul Lally; Mac McClellan; Christian Olsen; Player Pierce; Carter Rae; Devon Rogers; Matt Smith; Trenton Spinks; Austin Taylor; Alex Trautman; Josh Turner; Kelvin Williams'

In [53]:
difflib.get_close_matches('A Taylor',all_actives_dict.get('Atlanta Hustle','').split('; '))

['Austin Taylor', 'Austin Taylor', 'Taylor Minch']

In [75]:
'FioDan',\
difflib.SequenceMatcher(None, 'FioDan', 'Dan Fiorino').ratio(), \
ReverseIt('FioDan'), \
difflib.SequenceMatcher(None, ReverseIt('FioDan'), 'Dan Fiorino').ratio(),

('FioDan', 0.47058823529411764, 'Dan Fio', 0.7777777777777778)

In [93]:
merged['InRel'] = merged['InRel'].fillna(False)
merged['InRaw'] = merged['InRaw'].fillna(False)


# unmatched = merged[(merged.InRaw)&~merged.InRel].copy() # in username but not raw
unmatched = merged[(merged.Year==2019)].copy() # in username but not raw

# Guess on username
unmatched['Guesses1'] = unmatched.apply(lambda x : difflib.get_close_matches(x.Username,
                                                                             all_actives_dict.get(x.Teamname,'').split('; '),cutoff=.001  ) ,
                                        axis=1)
unmatched['Guess1'] = unmatched['Guesses1'].apply(lambda x : x[0] if len(x)>0 else '')
unmatched['GuessScore1'] = unmatched.apply(lambda x : difflib.SequenceMatcher(None, x.Username, x.Guess1).ratio(),axis=1)

# Guess on Username with capitalized segments reversed
import re
def ReverseIt(name):
    return ' '.join(re.findall('[A-Z][^A-Z]*',name)[::-1])
unmatched['Guesses2'] = unmatched.apply(lambda x : difflib.get_close_matches(ReverseIt(x.Username) ,
                                                                             all_actives_dict.get(x.Teamname,'').split('; '),cutoff=.001  ) ,
                                        axis=1)
unmatched['Guess2'] = unmatched['Guesses2'].apply(lambda x : x[0] if len(x)>0 else '')
unmatched['GuessScore2'] = unmatched.apply(lambda x : difflib.SequenceMatcher(None, ReverseIt(x.Username), x.Guess2).ratio(),
                                           axis=1)

# Guess on previous username matches
unmatched['Guesses3'] = unmatched.apply(lambda x : difflib.get_close_matches(x.Username ,
                                                                             user_names[(user_names.Teamname==x.Teamname)&(user_names.Username==x.Username)].PlayerName.dropna().values,cutoff=.001  ) ,
                                        axis=1)
unmatched['Guess3'] = unmatched['Guesses3'].apply(lambda x : x[0] if len(x)>0 else '')
unmatched['GuessScore3'] = unmatched.apply(lambda x : difflib.SequenceMatcher(None, x.Username, x.Guess3).ratio(),
                                           axis=1)
def GetBestGuess(row,thresh1=.51,thresh2=.51):
    """Get Best Guess"""
    if row['GuessScore1'] > thresh1:
        return row['Guess1']
    elif row['GuessScore2'] > thresh2:
        return row['Guess2']
    else:
        return row['Guess3']


unmatched['BestGuess'] = unmatched.apply(GetBestGuess,axis=1)
# unmatched.sort_values('GuessScore1')


In [88]:
merged['InRel'] = merged['InRel'].fillna(False)
merged['InRaw'] = merged['InRaw'].fillna(False)

# Guess on username
merged['Guesses1'] = merged.apply(lambda x : difflib.get_close_matches(x.Username,
                                                                             all_actives_dict.get(x.Teamname,'').split('; '),cutoff=.001  ) ,
                                        axis=1)
merged['Guess1'] = merged['Guesses1'].apply(lambda x : x[0] if len(x)>0 else '')
merged['GuessScore1'] = merged.apply(lambda x : difflib.SequenceMatcher(None, x.Username, x.Guess1).ratio(),axis=1)

# Guess on Username with capitalized segments reversed
import re
def ReverseIt(name):
    return ' '.join(re.findall('[A-Z][^A-Z]*',name)[::-1])
merged['Guesses2'] = merged.apply(lambda x : difflib.get_close_matches(ReverseIt(x.Username) ,
                                                                             all_actives_dict.get(x.Teamname,'').split('; '),cutoff=.001  ) ,
                                        axis=1)
merged['Guess2'] = merged['Guesses2'].apply(lambda x : x[0] if len(x)>0 else '')
merged['GuessScore2'] = merged.apply(lambda x : difflib.SequenceMatcher(None, ReverseIt(x.Username), x.Guess2).ratio(),
                                           axis=1)

# Guess on previous username matches
merged['Guesses3'] = merged.apply(lambda x : difflib.get_close_matches(x.Username ,
                                                                             user_names[(user_names.Teamname==x.Teamname)&(user_names.Username==x.Username)].PlayerName.dropna().values,cutoff=.001  ) ,
                                        axis=1)
merged['Guess3'] = merged['Guesses3'].apply(lambda x : x[0] if len(x)>0 else '')
merged['GuessScore3'] = merged.apply(lambda x : difflib.SequenceMatcher(None, x.Username, x.Guess3).ratio(),
                                           axis=1)




In [97]:
def GetBestGuess(row,thresh1=.51,thresh2=.51):
    """Get Best Guess"""
    if row['GuessScore1'] > thresh1:
        return row['Guess1']
    elif row['GuessScore2'] > thresh2:
        return row['Guess2']
    else:
        return row['Guess3']


merged['BestGuess'] = merged.apply(GetBestGuess,axis=1)

merged[pd.isnull(merged.PlayerName)&(merged.BestGuess=='')]

cut = pd.isnull(merged.PlayerName)&(merged.Year==2019)
merged.loc[cut,'PlayerName']= merged[cut].BestGuess
merged[cut]


Unnamed: 0,id,Teamname,Tournament,Username,PlayerName,Year,InRel,level_3,InRaw,Guesses1,Guess1,GuessScore1,Guesses2,Guess2,GuessScore2,Guesses3,Guess3,GuessScore3,BestGuess
3888,,Atlanta Hustle,AUDL 2019,A Taylor,Austin Taylor,2019,False,1.0,True,"[Austin Taylor, Austin Taylor, Taylor Minch]",Austin Taylor,0.761905,"[Taylor Minch, Austin Taylor, Austin Taylor]",Taylor Minch,0.666667,"[Austin Taylor, Austin Taylor, Austin Taylor]",Austin Taylor,0.761905,Austin Taylor
3890,,Atlanta Hustle,AUDL 2019,Avello,Zach Avello,2019,False,3.0,True,"[Zach Avello, Joel Wooten, Mac McClellan]",Zach Avello,0.705882,"[Zach Avello, Joel Wooten, Mac McClellan]",Zach Avello,0.705882,[],,0.000000,Zach Avello
3891,,Atlanta Hustle,AUDL 2019,Batson,Sam Batson,2019,False,4.0,True,"[Sam Batson, Jason Crowe, Taylor Minch]",Sam Batson,0.750000,"[Sam Batson, Jason Crowe, Taylor Minch]",Sam Batson,0.750000,[Sam Batson],Sam Batson,0.750000,Sam Batson
3892,,Atlanta Hustle,AUDL 2019,Brett H,Brett Hulsmeyer,2019,False,5.0,True,"[Brett Hulsmeyer, Brett Hulsmeyer, Trenton Spi...",Brett Hulsmeyer,0.636364,"[Brett Hulsmeyer, Brett Hulsmeyer, Trenton Spi...",Brett Hulsmeyer,0.521739,[],,0.000000,Brett Hulsmeyer
3893,,Atlanta Hustle,AUDL 2019,Bush,Josh Bush,2019,False,6.0,True,"[Josh Bush, Sun Choi, Sun Choi]",Josh Bush,0.615385,"[Josh Bush, Sun Choi, Sun Choi]",Josh Bush,0.615385,"[Josh Bush, Josh Bush]",Josh Bush,0.615385,Josh Bush
3894,,Atlanta Hustle,AUDL 2019,C Olsen,Christian Olsen,2019,False,7.0,True,"[Christian Olsen, Mac McClellan, Mac McClellan]",Christian Olsen,0.636364,"[Christian Olsen, Jason Crowe, Sun Choi]",Christian Olsen,0.434783,"[Christian Olsen, Christian Olsen, Christian O...",Christian Olsen,0.636364,Christian Olsen
3895,,Atlanta Hustle,AUDL 2019,Crowe,Jason Crowe,2019,False,8.0,True,"[Jason Crowe, Carter Rae, Karl Ekwurtzel]",Jason Crowe,0.625000,"[Jason Crowe, Carter Rae, Karl Ekwurtzel]",Jason Crowe,0.625000,[Jason Crowe],Jason Crowe,0.625000,Jason Crowe
3896,,Atlanta Hustle,AUDL 2019,E Jaime,Elijah Jaime,2019,False,9.0,True,"[Elijah Jaime, Elijah Jaime, Javier Ortiz]",Elijah Jaime,0.736842,"[Javier Ortiz, Elijah Jaime, Elijah Jaime]",Javier Ortiz,0.500000,[],,0.000000,Elijah Jaime
3897,,Atlanta Hustle,AUDL 2019,J Wooten,Joel Wooten,2019,False,10.0,True,"[Joel Wooten, Sam Batson, Josh Turner]",Joel Wooten,0.842105,"[Joel Wooten, Carter Rae, Jason Crowe]",Joel Wooten,0.600000,[],,0.000000,Joel Wooten
3898,,Atlanta Hustle,AUDL 2019,Karl E,Karl Ekwurtzel,2019,False,11.0,True,"[Karl Ekwurtzel, Karl Ekwurtzel, Paul Lally]",Karl Ekwurtzel,0.600000,"[Karl Ekwurtzel, Karl Ekwurtzel, Paul Lally]",Karl Ekwurtzel,0.476190,[Karl Ekwurtzel],Karl Ekwurtzel,0.600000,Karl Ekwurtzel


In [109]:
merged[['Teamname', 'Tournament', 
        'Username', 'PlayerName', 
        'Year']].reset_index().to_csv('../../data/supplemental/username_playername_relation_updated.csv')

In [102]:
len(merged),len(user_names),merged.columns,len(merged[['id', 'Teamname', 'Tournament', 'Username', 'PlayerName', 'Year']].drop_duplicates())

(4148,
 3879,
 Index(['id', 'Teamname', 'Tournament', 'Username', 'PlayerName', 'Year',
        'InRel', 'level_3', 'InRaw', 'Guesses1', 'Guess1', 'GuessScore1',
        'Guesses2', 'Guess2', 'GuessScore2', 'Guesses3', 'Guess3',
        'GuessScore3', 'BestGuess'],
       dtype='object'),
 4148)

In [103]:
merged[merged.PlayerName==''][['Teamname', 'Tournament', 'Username', 'PlayerName', 'Year']]

Unnamed: 0,id,Teamname,Tournament,Username,PlayerName,Year
3903,,Atlanta Hustle,AUDL 2019,Mac,,2019
3936,,Austin Sol,AUDL 2019,Mika C,,2019
3986,,DC Breeze,AUDL 2019,Dennis M,,2019
3990,,DC Breeze,AUDL 2019,Kenta K,,2019
3991,,DC Breeze,AUDL 2019,Kris H,,2019
4015,,Dallas Roughnecks,AUDL 2019,BRogers,,2019
4016,,Dallas Roughnecks,AUDL 2019,BStreet,,2019
4017,,Dallas Roughnecks,AUDL 2019,CHogg,,2019
4334,,Seattle Cascades,AUDL 2019,Edie,,2019
4335,,Seattle Cascades,AUDL 2019,Geertz,,2019


In [104]:
merged[pd.isnull(merged.PlayerName)][['Teamname', 'Tournament', 'Username', 'PlayerName', 'Year']]

Unnamed: 0,Teamname,Tournament,Username,PlayerName,Year
126,Atlanta Hustle,AUDL 2016,The Law,,2016
136,Atlanta Hustle,AUDL 2016,Walsh,,2016
243,Austin Sol,AUDL 2018,Unknown,,2018
324,Chicago Wildfire,AUDL 2018,Bruno,,2018
329,Chicago Wildfire,AUDL 2018,Chowder,,2018
333,Chicago Wildfire,AUDL 2015,DW,,2015
377,Chicago Wildfire,AUDL 2015,Kennedy,,2015
395,Chicago Wildfire,AUDL 2018,Nick,,2018
500,Cincinnati Revolution,AUDL 2015,Greg,,2015
727,Dallas Roughnecks,AUDL 2018,Blake,,2018


In [None]:
merged[['Teamname', 'Tournament', 'Username', 'PlayerName', 'Year']].to_csv('../../data/players/username_playername_relation_2.csv')