# Tab 1: Network

In [51]:
import pandas as pd
import itertools

In [52]:
# Open data
df = pd.read_csv("data/tennis_data_cleaned.csv", index_col=0)
df.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,pl1_flag,pl1_year_pro,pl1_weight,pl1_height,pl1_hand,pl2_flag,pl2_year_pro,pl2_weight,pl2_height,pl2_hand
0,1,Adelaide,Adelaide International 1,2022-01-03,ATP250,Outdoor,Hard,1st Round,3,Kwon S.W.,...,KOR,2015.0,72.0,180.0,Right-Handed,JPN,2014.0,64.0,170.0,Left-Handed
1,1,Adelaide,Adelaide International 1,2022-01-03,ATP250,Outdoor,Hard,1st Round,3,Monteiro T.,...,BRA,2011.0,78.0,183.0,Left-Handed,GER,2014.0,80.0,188.0,Right-Handed
2,1,Adelaide,Adelaide International 1,2022-01-03,ATP250,Outdoor,Hard,1st Round,3,Djere L.,...,SRB,2013.0,80.0,185.0,Right-Handed,ESP,2011.0,76.0,180.0,Right-Handed
3,1,Adelaide,Adelaide International 1,2022-01-03,ATP250,Outdoor,Hard,1st Round,3,Johnson S.,...,USA,2012.0,86.0,188.0,Right-Handed,AUS,2018.0,85.0,188.0,Right-Handed
4,1,Adelaide,Adelaide International 1,2022-01-04,ATP250,Outdoor,Hard,1st Round,3,Moutet C.,...,FRA,2016.0,71.0,175.0,Left-Handed,DEN,2020.0,77.0,188.0,Right-Handed


In [53]:
df.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'pl1_flag', 'pl1_year_pro',
       'pl1_weight', 'pl1_height', 'pl1_hand', 'pl2_flag', 'pl2_year_pro',
       'pl2_weight', 'pl2_height', 'pl2_hand'],
      dtype='object')

In [54]:
# Are the player characteristics unique for each player?

print("Are the player characteristics unique for each player?")

l_winner = ['pl1_flag', 'pl1_year_pro', 'pl1_weight', 'pl1_height', 'pl1_hand']
l_loser = ['pl2_flag', 'pl2_year_pro', 'pl2_weight', 'pl2_height', 'pl2_hand']

print("Winner:")
for charac in l_winner:
    result = pd.DataFrame(df.groupby(['Winner'])[charac].apply(list))[charac].apply(lambda x: all(element == x[0] for element in x)).unique()
    print(charac + ": " + str(result))

print("Loser:")
for charac in l_loser:
    result = pd.DataFrame(df.groupby(['Loser'])[charac].apply(list))[charac].apply(lambda x: all(element == x[0] for element in x)).unique()
    print(charac + ": " + str(result))

Are the player characteristics unique for each player?
Winner:
pl1_flag: [ True]
pl1_year_pro: [ True]
pl1_weight: [ True]
pl1_height: [ True]
pl1_hand: [ True]
Loser:
pl2_flag: [ True]
pl2_year_pro: [ True]
pl2_weight: [ True]
pl2_height: [ True]
pl2_hand: [ True]


In [55]:
# Construction of the dictionnary for NODES

## New column with Year of the match
df.Date = pd.to_datetime(df.Date)
df['Year'] = pd.DatetimeIndex(df['Date']).year

## Get the number of matches per player per year or for all years combined
df_nodes_year = pd.merge(pd.DataFrame(df.groupby(['Winner', 'Year']).size()).reset_index(),
                         pd.DataFrame(df.groupby(['Loser', 'Year']).size()).reset_index(), 
                         left_on=['Winner', 'Year'], right_on=['Loser', 'Year'], how='outer')

df_nodes_allyears = pd.merge(pd.DataFrame(df.groupby(['Winner']).size()).reset_index(),
                             pd.DataFrame(df.groupby(['Loser']).size()).reset_index(), 
                             left_on=['Winner'], right_on=['Loser'], how='outer')
df_nodes_allyears['Year'] = 'All'

df_nodes = pd.concat([df_nodes_year, df_nodes_allyears])
df_nodes['Winner'].update(df_nodes.pop('Loser'))
df_nodes.rename(columns={"0_x": "nb_won", "0_y": "nb_lost", "Winner": "Player"}, inplace=True)

## Get the proportions
df_nodes.nb_won.fillna(value=0, inplace=True)
df_nodes.nb_lost.fillna(value=0, inplace=True)
df_nodes['nb_matches'] = df_nodes.nb_won + df_nodes.nb_lost
df_nodes = pd.merge(df_nodes,
                    df_nodes.groupby(["Year"]).agg({"nb_matches" : "sum"}).rename(columns={"nb_matches": "nbtot_matches"}).reset_index(), 
                    left_on=['Year'], right_on=['Year'], how='outer')
df_nodes['fraction_matches'] = df_nodes.nb_matches / df_nodes.nbtot_matches

## Add player characteristics
df_nodes = pd.merge(df_nodes,
                    pd.concat([df[['Winner', 'pl1_flag', 'pl1_year_pro', 'pl1_weight', 'pl1_height', 'pl1_hand']]\
                               .rename(columns={'Winner':'Player', 'pl1_flag':'flag', 'pl1_year_pro':'year_pro', 'pl1_weight':'weight', 'pl1_height':'height', 'pl1_hand':'hand'}), 
                               df[['Loser', 'pl2_flag', 'pl2_year_pro', 'pl2_weight', 'pl2_height', 'pl2_hand']]\
                               .rename(columns={'Loser':'Player', 'pl2_flag':'flag', 'pl2_year_pro':'year_pro', 'pl2_weight':'weight', 'pl2_height':'height', 'pl2_hand':'hand'})])\
                    .drop_duplicates(subset=['Player']).reset_index(drop=True), 
                    left_on=['Player'], right_on=['Player'], how='outer')

## Convert nb_won, nb_lost, nb_matches, nbtot_matches and year_pro to int
df_nodes = df_nodes.astype({'nb_won': 'int', 'nb_lost': 'int', 'nb_matches': 'int', 'nbtot_matches': 'int', 'year_pro': 'int'})

## Add dictionnary of Year:fraction_matches


df_nodes = pd.merge(df_nodes,
                    pd.DataFrame(df_nodes.groupby('Player')[['Year', 'fraction_matches']].apply(lambda x: pd.Series(x['fraction_matches'].values,index=x['Year']).to_dict()))\
                    .rename(columns={0:'dict_year_fract'}), 
                    left_on=['Player'], right_on=['Player'], how='outer')

## Put into dict format for network construction
dict_nodes = df_nodes[['Player', 'flag', 'year_pro', 'weight', 'height', 'hand', 'dict_year_fract']].drop_duplicates(subset=['Player']).to_dict('records')

## Visualize
df_nodes

Unnamed: 0,Player,Year,nb_won,nb_lost,nb_matches,nbtot_matches,fraction_matches,flag,year_pro,weight,height,hand,dict_year_fract
0,Acasuso J.,2008,26,22,48,5084,0.009441,ARG,1999,86.0,191.0,Right-Handed,"{2008: 0.00944138473642801, 2009: 0.0078333970..."
1,Acasuso J.,2009,20,21,41,5234,0.007833,ARG,1999,86.0,191.0,Right-Handed,"{2008: 0.00944138473642801, 2009: 0.0078333970..."
2,Acasuso J.,2011,2,1,3,5270,0.000569,ARG,1999,86.0,191.0,Right-Handed,"{2008: 0.00944138473642801, 2009: 0.0078333970..."
3,Acasuso J.,2010,0,3,3,5260,0.000570,ARG,1999,86.0,191.0,Right-Handed,"{2008: 0.00944138473642801, 2009: 0.0078333970..."
4,Acasuso J.,All,48,47,95,70754,0.001343,ARG,1999,86.0,191.0,Right-Handed,"{2008: 0.00944138473642801, 2009: 0.0078333970..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,Langer N.,All,0,2,2,70754,0.000028,GER,2005,77.0,193.0,Right-Handed,"{2013: 0.000392156862745098, 'All': 2.82669531..."
4596,Salamanca C.,2013,0,1,1,5100,0.000196,COL,2001,83.0,196.0,Left-Handed,"{2013: 0.000196078431372549, 'All': 1.41334765..."
4597,Salamanca C.,All,0,1,1,70754,0.000014,COL,2001,83.0,196.0,Left-Handed,"{2013: 0.000196078431372549, 'All': 1.41334765..."
4598,Sekulic M.,2013,0,1,1,5100,0.000196,SWE,2008,73.0,178.0,Right-Handed,"{2013: 0.000196078431372549, 'All': 1.41334765..."


In [56]:
# Construction of the dictionnary for LINKS

## Dataframe with player 1 vs. player 2 and characteristics of the matches
df_links = pd.DataFrame(df.groupby(['Winner', 'Loser'])[['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface', 'Round', 'Best of', 'WRank', 'LRank', 'WPts', 'LPts', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets', 'Lsets', 'Comment', 'B365W', 'B365L']]\
                        .agg(list)).reset_index()
df_links.rename(columns={'Winner':'pl1', 'Loser':'pl2'}, inplace=True)
df_links['Winner'] = df_links.apply(lambda x: [x['pl1'] for i in x['ATP']], axis=1)

## Group in unique combinations of players
# Cf. cellules suivantes où il y a un problème 
    
df_links

Unnamed: 0,pl1,pl2,ATP,Location,Tournament,Date,Series,Court,Surface,Round,...,W4,L4,W5,L5,Wsets,Lsets,Comment,B365W,B365L,Winner
0,Acasuso J.,Andreev I.,[11],[Buenos Aires],[Copa Telmex],[2008-02-22 00:00:00],[International],[Outdoor],[Clay],[Quarterfinals],...,[nan],[nan],[nan],[nan],[2.0],[1.0],[Completed],[1.61],[2.2],[Acasuso J.]
1,Acasuso J.,Bachinger M.,[58],[Stockholm],[Stockholm Open],[2008-10-07 00:00:00],[International],[Indoor],[Hard],[1st Round],...,[nan],[nan],[nan],[nan],[2.0],[1.0],[Completed],[1.36],[3.0],[Acasuso J.]
2,Acasuso J.,Bellucci T.,[8],[Vina del Mar],[Movistar Open],[2009-02-06 00:00:00],[ATP250],[Outdoor],[Clay],[2nd Round],...,[nan],[nan],[nan],[nan],[2.0],[0.0],[Completed],[2.37],[1.53],[Acasuso J.]
3,Acasuso J.,Berrer M.,[51],[New York],[US Open],[2008-08-25 00:00:00],[Grand Slam],[Outdoor],[Hard],[1st Round],...,[nan],[nan],[nan],[nan],[3.0],[0.0],[Completed],[1.61],[2.2],[Acasuso J.]
4,Acasuso J.,Canas G.,"[14, 55]","[Acapulco, Metz]","[Abierto Mexicano, Open de Moselle]","[2008-02-27 00:00:00, 2008-09-29 00:00:00]","[International Gold, International]","[Outdoor, Indoor]","[Clay, Hard]","[2nd Round, 1st Round]",...,"[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[2.0, 2.0]","[1.0, 1.0]","[Completed, Completed]","[1.8, 2.0]","[1.9, 1.72]","[Acasuso J., Acasuso J.]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23323,Zverev M.,Youzhny M.,"[35, 44, 63]","[Stuttgart, Atlanta, St. Petersburg]","[Mercedes Cup, BB&T Atlanta Open, St. Petersbu...","[2018-06-11 00:00:00, 2018-07-25 00:00:00, 200...","[ATP250, ATP250, International]","[Outdoor, Outdoor, Indoor]","[Grass, Hard, Carpet]","[1st Round, 2nd Round, 2nd Round]",...,"[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[2.0, 2.0, 2.0]","[0.0, 0.0, 0.0]","[Completed, Completed, Completed]","[1.53, 1.44, 2.75]","[2.37, 2.62, 1.4]","[Zverev M., Zverev M., Zverev M.]"
23324,Zverev M.,Zeballos H.,"[55, 51]","[Kuala Lumpur, Metz]","[Malaysian Open, Open de Moselle]","[2013-09-24 00:00:00, 2010-09-21 00:00:00]","[ATP250, ATP250]","[Indoor, Indoor]","[Hard, Hard]","[1st Round, 1st Round]",...,"[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[2.0, 2.0]","[0.0, 0.0]","[Completed, Completed]","[1.57, 1.72]","[2.25, 2.0]","[Zverev M., Zverev M.]"
23325,Zverev M.,Zhang Ze,[59],[Shanghai],[Shanghai Masters],[2016-10-10 00:00:00],[Masters 1000],[Outdoor],[Hard],[1st Round],...,[nan],[nan],[nan],[nan],[2.0],[0.0],[Completed],[1.28],[3.5],[Zverev M.]
23326,de Voest R.,Berrer M.,[19],[Indian Wells],[Pacific Life Open],[2008-03-14 00:00:00],[Masters],[Outdoor],[Hard],[1st Round],...,[nan],[nan],[nan],[nan],[2.0],[0.0],[Completed],[2.1],[1.66],[de Voest R.]


In [35]:
pairs = [tuple(item) for item in df_links[['pl1', 'pl2']].values.tolist()]
unique_pairs = set(tuple(sorted(l)) for l in pairs)
nonunique_pairs = list(set(pairs)^set(unique_pairs))

for nup in nonunique_pairs:
    for c in df_links.columns.drop(labels=['pl1','pl2']):
        ind = df_links[(df_links.pl1 == nup[0]) & (df_links.pl2 == nup[1])].index.values[0]
        df_links.at[ind, c] = list(list(itertools.chain(*list(df_links.loc[(df_links.pl1 == nup[0]) & (df_links.pl2 == nup[1]), c].append(df_links.loc[(df_links.pl1 == nup[1]) & (df_links.pl2 == nup[0]), c])))))
        ind_drop = df_links[(df_links.pl1 == nup[1]) & (df_links.pl2 == nup[0])].index.values[0]
        df_links = df_links.drop(ind_drop)


IndexError: index 0 is out of bounds for axis 0 with size 0

############ Tentative de comprendre le problème du unique_pairs ###################

In [None]:
pairs = [tuple(item) for item in df_links[['pl1', 'pl2']].values.tolist()]
unique_pairs = list(set(tuple(sorted(l)) for l in pairs))
nonunique_pairs = list(set(pairs) - set(unique_pairs))

for t in unique_pairs :
    if not t in pairs:
        print(t)

In [50]:
pairs = [('Santillan A.', 'Kravchuk K.'), ('Kravchuk K.', 'Santillan A.'), ('Cecchinato M.', 'Chardy J.'), ('Navarro-Pastor I.', 'Youzhny M.'), ('Karlovic I.', 'Llodra M.'), ('Matosevic M.', 'Monfils G.')]
[tuple(sorted(l)) for l in pairs]
set(tuple(sorted(l)) for l in pairs)

{('Cecchinato M.', 'Chardy J.'),
 ('Karlovic I.', 'Llodra M.'),
 ('Kravchuk K.', 'Santillan A.'),
 ('Matosevic M.', 'Monfils G.'),
 ('Navarro-Pastor I.', 'Youzhny M.')}

In [30]:
pairs = [('a M.','b D.'), ('c C.', 'd D.'), ('d D.','c C.')]
unique_pairs = list(set(tuple(sorted(l)) for l in pairs))
nonunique_pairs = list(set(pairs) - set(unique_pairs))

nonunique_pairs

[('d D.', 'c C.')]

In [44]:
pairs = [('Kravchuk K.', 'Santillan A.'), ('Cecchinato M.', 'Chardy J.'), ('Navarro-Pastor I.', 'Youzhny M.'), ('Karlovic I.', 'Llodra M.'), ('Matosevic M.', 'Monfils G.')]
unique_pairs = list(set(tuple(sorted(l)) for l in pairs))
nonunique_pairs = list(set(pairs) - set(unique_pairs))

for t in unique_pairs :
    if t not in pairs:
        print(t)

In [43]:
if ('Kravchuk K.', 'Santillan A.') in pairs :
    print(('Kravchuk K.', 'Santillan A.'))