# Tab 1: Network

In [82]:
import pandas as pd
import itertools
from collections import Counter
import ast
from datetime import date, datetime
import json

## Data

In [83]:
# Open data
# Load the cleaned match table; the first CSV column is used as the row index.
df = pd.read_csv("data/tennis_data_cleaned.csv", index_col=0)
df.head()

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,pl1_flag,pl1_year_pro,pl1_weight,pl1_height,pl1_hand,pl2_flag,pl2_year_pro,pl2_weight,pl2_height,pl2_hand
0,1,Adelaide,Adelaide International 1,2022-01-03,ATP250,Outdoor,Hard,1st Round,3,Kwon S.W.,...,KOR,2015.0,72.0,180.0,Right-Handed,JPN,2014.0,64.0,170.0,Left-Handed
1,1,Adelaide,Adelaide International 1,2022-01-03,ATP250,Outdoor,Hard,1st Round,3,Monteiro T.,...,BRA,2011.0,78.0,183.0,Left-Handed,GER,2014.0,80.0,188.0,Right-Handed
2,1,Adelaide,Adelaide International 1,2022-01-03,ATP250,Outdoor,Hard,1st Round,3,Djere L.,...,SRB,2013.0,80.0,185.0,Right-Handed,ESP,2011.0,76.0,180.0,Right-Handed
3,1,Adelaide,Adelaide International 1,2022-01-03,ATP250,Outdoor,Hard,1st Round,3,Johnson S.,...,USA,2012.0,86.0,188.0,Right-Handed,AUS,2018.0,85.0,188.0,Right-Handed
4,1,Adelaide,Adelaide International 1,2022-01-04,ATP250,Outdoor,Hard,1st Round,3,Moutet C.,...,FRA,2016.0,71.0,175.0,Left-Handed,DEN,2020.0,77.0,188.0,Right-Handed


In [84]:
# List the columns available in the match table
df.columns

Index(['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface',
       'Round', 'Best of', 'Winner', 'Loser', 'WRank', 'LRank', 'WPts', 'LPts',
       'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets',
       'Lsets', 'Comment', 'B365W', 'B365L', 'pl1_flag', 'pl1_year_pro',
       'pl1_weight', 'pl1_height', 'pl1_hand', 'pl2_flag', 'pl2_year_pro',
       'pl2_weight', 'pl2_height', 'pl2_hand'],
      dtype='object')

In [85]:
# Sanity check: are the player characteristics constant across all matches of a player?
# Winners (pl1_* columns) and losers (pl2_* columns) are checked separately.

print("Are the player characteristics unique for each player?")

l_winner = ['pl1_flag', 'pl1_year_pro', 'pl1_weight', 'pl1_height', 'pl1_hand']
l_loser = ['pl2_flag', 'pl2_year_pro', 'pl2_weight', 'pl2_height', 'pl2_hand']


def _all_equal_to_first(values):
    """Return True when every entry of ``values`` equals its first entry.

    A NaN never compares equal to itself, so a group holding NaN yields False.
    """
    first = values[0]
    return all(element == first for element in values)


for role, characteristics in (('Winner', l_winner), ('Loser', l_loser)):
    print(role + ":")
    for charac in characteristics:
        # One list of observed values per player, then one boolean per player.
        values_per_player = df.groupby([role])[charac].apply(list)
        result = values_per_player.apply(_all_equal_to_first).unique()
        print(charac + ": " + str(result))

Are the player characteristics unique for each player?
Winner:
pl1_flag: [ True]
pl1_year_pro: [ True]
pl1_weight: [ True]
pl1_height: [ True]
pl1_hand: [ True]
Loser:
pl2_flag: [ True]
pl2_year_pro: [ True]
pl2_weight: [ True]
pl2_height: [ True]
pl2_hand: [ True]


## Construction of the dictionary for NODES

In [86]:
# Parse the match dates, then store the calendar year of each match in a new column
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year

In [87]:
# Get the number of matches won / lost per player, per year and for all years combined.
# After each outer merge, a player who only won (or only lost) has NaN on the other side.

# groupby(...).size() yields an unnamed Series, hence the count columns "0_x" / "0_y" after merging.
df_nodes_year = pd.merge(pd.DataFrame(df.groupby(['Winner', 'Year']).size()).reset_index(),
                         pd.DataFrame(df.groupby(['Loser', 'Year']).size()).reset_index(),
                         left_on=['Winner', 'Year'], right_on=['Loser', 'Year'], how='outer')

df_nodes_allyears = pd.merge(pd.DataFrame(df.groupby(['Winner']).size()).reset_index(),
                             pd.DataFrame(df.groupby(['Loser']).size()).reset_index(),
                             left_on=['Winner'], right_on=['Loser'], how='outer')
# /!\ 2007 is used instead of 'All' to avoid mixing strings and ints in the Year column.
# NOTE(review): if df holds matches whose Date falls in calendar year 2007, their per-year
# rows share Year == 2007 with these all-years rows -- confirm this cannot happen.
df_nodes_allyears['Year'] = 2007

# ignore_index gives every row a unique label, so the column alignment below is unambiguous.
df_nodes = pd.concat([df_nodes_year, df_nodes_allyears], ignore_index=True)
# Take the player name from whichever side of the outer merge is present. When both are
# present they are equal, because they were the merge keys.
# Assigning the result back (instead of a chained in-place update) also works under
# pandas Copy-on-Write.
df_nodes['Winner'] = df_nodes['Winner'].fillna(df_nodes.pop('Loser'))
df_nodes = df_nodes.rename(columns={"0_x": "nb_won", "0_y": "nb_lost", "Winner": "Player"})

In [88]:
# Get the proportions
# Missing counts come from the outer merges: no win (or no loss) recorded for that player/year.
# Assign the filled columns back; a chained in-place fillna does nothing under pandas Copy-on-Write.
df_nodes['nb_won'] = df_nodes['nb_won'].fillna(value=0)
df_nodes['nb_lost'] = df_nodes['nb_lost'].fillna(value=0)
df_nodes['nb_matches'] = df_nodes.nb_won + df_nodes.nb_lost

# nbtot_matches is the sum of nb_matches over all players of a Year, so every match is
# counted twice (once for its winner and once for its loser).
nbtot_per_year = df_nodes.groupby(["Year"]).agg({"nb_matches" : "sum"}).rename(columns={"nb_matches": "nbtot_matches"}).reset_index()
df_nodes = pd.merge(df_nodes, nbtot_per_year, left_on=['Year'], right_on=['Year'], how='outer')
df_nodes['fraction_matches'] = df_nodes.nb_matches / df_nodes.nbtot_matches *10 # increase the size by a factor of 10

In [89]:
# Add player characteristics
# Winner rows carry them in pl1_* columns and loser rows in pl2_* columns; both are
# renamed to a common schema before being stacked.
charac_names = ['flag', 'year_pro', 'weight', 'height', 'hand']
winner_renames = {'Winner': 'Player', **{'pl1_' + name: name for name in charac_names}}
loser_renames = {'Loser': 'Player', **{'pl2_' + name: name for name in charac_names}}

winner_charac = df[list(winner_renames)].rename(columns=winner_renames)
loser_charac = df[list(loser_renames)].rename(columns=loser_renames)

# Keep the first occurrence of every player (winner rows come first in the stack).
player_charac = pd.concat([winner_charac, loser_charac])\
                  .drop_duplicates(subset=['Player']).reset_index(drop=True)

df_nodes = pd.merge(df_nodes, player_charac, left_on=['Player'], right_on=['Player'], how='outer')

In [90]:
# Convert nb_won, nb_lost, nb_matches, nbtot_matches and year_pro to int, and Year to string
# NOTE(review): the int casts presumably require these columns to be free of NaN -- confirm for year_pro.
df_nodes = df_nodes.astype({'nb_won': 'int', 'nb_lost': 'int', 'nb_matches': 'int', 'nbtot_matches': 'int', 'year_pro': 'int', 'Year': 'str'})

In [91]:
# Add a column with a dictionary of Year:fraction_matches to df_nodes
def _year_fraction_dict(player_rows):
    """Map each Year of one player's rows to the fraction_matches of that row."""
    return pd.Series(player_rows['fraction_matches'].values, index=player_rows['Year']).to_dict()

# One dictionary per player, indexed by Player; the unnamed result becomes column 0.
year_fract = df_nodes.groupby('Player')[['Year', 'fraction_matches']].apply(_year_fraction_dict)
year_fract = pd.DataFrame(year_fract).rename(columns={0:'dict_year_fract'})

# Every row of a player receives that player's dictionary.
df_nodes = pd.merge(df_nodes, year_fract, left_on=['Player'], right_on=['Player'], how='outer')

In [92]:
# Complete Year:fraction_matches with the missed years (fraction_matches == 0)

def add_missed_years(d, years=None):
    """Add a 0 entry for every year that is missing from a Year:fraction dictionary.

    Parameters
    ----------
    d : dict
        Dictionary keyed by year as a string. It is modified in place.
    years : iterable, optional
        Years that must be present as keys; each one is converted with ``str``.
        Defaults to the distinct values of ``df.Year`` (the global match table).

    Returns
    -------
    dict
        The same dictionary object ``d``, completed with the missing years.
    """
    if years is None:
        years = df.Year.unique()
    for year in years:
        # Existing entries are left untouched; only absent years get a 0.
        d.setdefault(str(year), 0)
    return d

# Complete every player's dictionary with the years in which the player has no row
df_nodes['dict_year_fract'] = df_nodes['dict_year_fract'].apply(add_missed_years)

In [93]:
# Put into dict format for network construction
# drop_duplicates keeps a single row per Player; the per-year count columns are not selected,
# the yearly information is carried by dict_year_fract.
dict_nodes = df_nodes[['Player', 'flag', 'year_pro', 'weight', 'height', 'hand', 'dict_year_fract']].drop_duplicates(subset=['Player']).to_dict('records')

In [94]:
# Visualize the node table (one row per Player and Year value)
df_nodes

Unnamed: 0,Player,Year,nb_won,nb_lost,nb_matches,nbtot_matches,fraction_matches,flag,year_pro,weight,height,hand,dict_year_fract
0,Acasuso J.,2008,26,22,48,5084,0.094414,ARG,1999,86.0,191.0,Right-Handed,"{'2008': 0.0944138473642801, '2009': 0.0783339..."
1,Acasuso J.,2009,20,21,41,5234,0.078334,ARG,1999,86.0,191.0,Right-Handed,"{'2008': 0.0944138473642801, '2009': 0.0783339..."
2,Acasuso J.,2011,2,1,3,5270,0.005693,ARG,1999,86.0,191.0,Right-Handed,"{'2008': 0.0944138473642801, '2009': 0.0783339..."
3,Acasuso J.,2010,0,3,3,5260,0.005703,ARG,1999,86.0,191.0,Right-Handed,"{'2008': 0.0944138473642801, '2009': 0.0783339..."
4,Acasuso J.,2007,48,47,95,70786,0.013421,ARG,1999,86.0,191.0,Right-Handed,"{'2008': 0.0944138473642801, '2009': 0.0783339..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,Langer N.,2007,0,2,2,70786,0.000283,GER,2005,77.0,193.0,Right-Handed,"{'2013': 0.00392156862745098, '2007': 0.000282..."
4596,Salamanca C.,2013,0,1,1,5100,0.001961,COL,2001,83.0,196.0,Left-Handed,"{'2013': 0.00196078431372549, '2007': 0.000141..."
4597,Salamanca C.,2007,0,1,1,70786,0.000141,COL,2001,83.0,196.0,Left-Handed,"{'2013': 0.00196078431372549, '2007': 0.000141..."
4598,Sekulic M.,2013,0,1,1,5100,0.001961,SWE,2008,73.0,178.0,Right-Handed,"{'2013': 0.00196078431372549, '2007': 0.000141..."


In [95]:
# Visualize the first three node records
dict_nodes[0:3]

[{'Player': 'Acasuso J.',
  'flag': 'ARG',
  'year_pro': 1999,
  'weight': 86.0,
  'height': 191.0,
  'hand': 'Right-Handed',
  'dict_year_fract': {'2008': 0.0944138473642801,
   '2009': 0.07833397019487964,
   '2011': 0.0056925996204933585,
   '2010': 0.005703422053231939,
   '2007': 0.013420732913287938,
   '2022': 0,
   '2021': 0,
   '2020': 0,
   '2018': 0,
   '2019': 0,
   '2017': 0,
   '2016': 0,
   '2015': 0,
   '2013': 0,
   '2014': 0,
   '2012': 0}},
 {'Player': 'Almagro N.',
  'flag': 'ESP',
  'year_pro': 2003,
  'weight': 86.0,
  'height': 183.0,
  'hand': 'Right-Handed',
  'dict_year_fract': {'2008': 0.09638080251770259,
   '2009': 0.10126098586167366,
   '2011': 0.13092979127134724,
   '2017': 0.04230769230769231,
   '2014': 0.04944620253164557,
   '2015': 0.07318952234206472,
   '2016': 0.08301158301158301,
   '2018': 0.0019004180919802356,
   '2010': 0.12167300380228135,
   '2012': 0.14404852160727824,
   '2013': 0.11764705882352941,
   '2007': 0.07063543638572599,
   '2

## Construction of the dictionary for LINKS

In [96]:
# Dataframe with player 1 vs. player 2 and characteristics of the matches.
# One row per (winner, loser) pair; every match attribute is aggregated into a list
# holding one entry per match won by pl1 against pl2.
match_cols = ['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface', 'Round', 'Best of', 'WRank', 'LRank', 'WPts', 'LPts', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets', 'Lsets', 'Comment', 'B365W', 'B365L', 'Year']
df_links_all = df.groupby(['Winner', 'Loser'])[match_cols].agg(list).reset_index()
df_links_all = df_links_all.rename(columns={'Winner':'pl1', 'Loser':'pl2'})
# At this stage pl1 won every match of its row, so the per-match winner list repeats pl1.
df_links_all['Winner'] = pd.Series([[pl1] * len(atp) for pl1, atp in zip(df_links_all['pl1'], df_links_all['ATP'])],
                                   index=df_links_all.index, dtype=object)

# Group in unique combinations of players
pairs = [tuple(item) for item in df_links_all[['pl1', 'pl2']].values.tolist()]
unique_pairs = list(set(tuple(sorted(l)) for l in pairs))
# Pairs present in both directions (A beat B and B beat A), in sorted order.
nonunique_pairs = list((Counter([tuple(sorted(l)) for l in pairs])-Counter(unique_pairs)).elements())

# Look up each row label once instead of re-filtering the frame for every column.
row_of_pair = dict(zip(pairs, df_links_all.index))
list_cols = df_links_all.columns.drop(labels=['pl1','pl2'])
for nup in nonunique_pairs:
    ind = row_of_pair[(nup[0], nup[1])]
    ind_drop = row_of_pair[(nup[1], nup[0])]
    for c in list_cols:
        # Plain list concatenation: entries of the kept row first, then those of the
        # reversed row (Series.append, used previously, no longer exists in pandas 2).
        df_links_all.at[ind, c] = df_links_all.at[ind, c] + df_links_all.at[ind_drop, c]
# Remove all reversed rows in a single pass.
df_links_all = df_links_all.drop(index=[row_of_pair[(nup[1], nup[0])] for nup in nonunique_pairs])

# Add column year_selected
df_links_all['year_selected'] = 2007 # /!\ 2007 is used instead of 'All' to avoid to have string and int

In [97]:
# Same operations but for each year separately

def df_links_oneyear(year, matches=None):
    """Build the links table restricted to the matches of a single year.

    Each row describes one unordered pair of players. Every match attribute is a
    list with one entry per match between the two players in that year, and the
    ``Winner`` column lists the winner of each of those matches.

    Parameters
    ----------
    year : int
        Value of the ``Year`` column to keep; also stored in ``year_selected``.
    matches : pandas.DataFrame, optional
        Match table with ``Winner``, ``Loser``, ``Year`` and the match attribute
        columns. Defaults to the global ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``pl1``, ``pl2``, the list-valued match attributes, ``Winner``
        and ``year_selected``. When both A-beat-B and B-beat-A exist, the two rows
        are merged into the row whose ``pl1`` sorts first.
    """
    if matches is None:
        matches = df

    match_cols = ['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface', 'Round', 'Best of', 'WRank', 'LRank', 'WPts', 'LPts', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets', 'Lsets', 'Comment', 'B365W', 'B365L', 'Year']
    df_links_year = matches[matches.Year == year].groupby(['Winner', 'Loser'])[match_cols].agg(list).reset_index()
    df_links_year = df_links_year.rename(columns={'Winner':'pl1', 'Loser':'pl2'})
    # Before merging, pl1 won every match of its row.
    df_links_year['Winner'] = pd.Series([[pl1] * len(atp) for pl1, atp in zip(df_links_year['pl1'], df_links_year['ATP'])],
                                        index=df_links_year.index, dtype=object)

    pairs = list(zip(df_links_year['pl1'], df_links_year['pl2']))
    row_of_pair = dict(zip(pairs, df_links_year.index))
    list_cols = df_links_year.columns.drop(labels=['pl1','pl2'])
    rows_to_drop = []
    for pl1, pl2 in pairs:
        # Handle each two-directional pair once, from the row whose pl1 sorts first.
        if pl1 < pl2 and (pl2, pl1) in row_of_pair:
            ind = row_of_pair[(pl1, pl2)]
            ind_drop = row_of_pair[(pl2, pl1)]
            for c in list_cols:
                # List concatenation replaces Series.append, which no longer exists in pandas 2.
                df_links_year.at[ind, c] = df_links_year.at[ind, c] + df_links_year.at[ind_drop, c]
            rows_to_drop.append(ind_drop)
    df_links_year = df_links_year.drop(index=rows_to_drop)

    df_links_year['year_selected'] = year

    return df_links_year
    

In [98]:
# Combine df_links_all and all the per-year dataframes.
# A single concat over the whole list avoids re-copying the growing frame once per year.
df_links = pd.concat([df_links_all] + [df_links_oneyear(year) for year in df.Year.unique()])

In [99]:
# Add column with number of matches between the two players (length of the per-match lists)
df_links['nb_matches'] = df_links['ATP'].map(len)

In [100]:
# Add combinations of pl1/pl2/year that do not exist and put the values 0 or None
#
# NOTE(review): the subtraction below is "existing - theoretical". It therefore keeps only the
# (pair, year) keys that occur more often in df_links than in the theoretical set (for instance
# a key present twice in df_links), not the pairs that never met. The opposite difference would
# add one row for every pair of players and every year. Confirm which behaviour is intended.

exist_comb = [tuple(item) for item in df_links[['pl1', 'pl2', 'year_selected']].astype({"year_selected": str}).values.tolist()]

players_comb = list(itertools.combinations(df_nodes.Player.unique(), 2))
years_selected = [str(year) for year in df_links.year_selected.unique()]
theo_comb = [t + (year,) for t in players_comb for year in years_selected]

# Sorting each (pl1, pl2, year) tuple makes the key independent of the player order.
absent_comb = list((Counter(tuple(sorted(l)) for l in exist_comb)-Counter(tuple(sorted(l)) for l in theo_comb)).elements())

# In a sorted tuple the year string comes first (digits sort before the letters of the names),
# followed by the two player names. Naming the columns here also covers an empty result.
df_absent_comb = pd.DataFrame(absent_comb, columns=['year_selected', 'pl1', 'pl2'])

# The conversion must be assigned back; otherwise year_selected stays a string column.
df_absent_comb['year_selected'] = df_absent_comb['year_selected'].astype('int64')

# Every match attribute of a placeholder row is the one-element list ['None'].
for c in ['ATP', 'Location', 'Tournament', 'Date', 'Series', 'Court', 'Surface', 'Round', 'Best of', 'WRank', 'LRank', 'WPts', 'LPts', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets', 'Lsets', 'Comment', 'B365W', 'B365L', 'Year', 'Winner']:
    df_absent_comb[c] = pd.Series([['None'] for _ in range(len(df_absent_comb))],
                                  index=df_absent_comb.index, dtype=object)

df_absent_comb['nb_matches'] = 0

df_links = pd.concat([df_links, df_absent_comb])

In [101]:
# Save into a csv
# The list-valued cells are written as text and have to be parsed again after reloading.
df_links.to_csv('data/df_links.csv')

In [102]:
# Open df_links
# After this reload the list columns are plain strings; the next cells convert them back.
df_links = pd.read_csv("data/df_links.csv", index_col=0)

In [103]:
# Convert Date column cells to list of datetime
def Timestamp(timestr):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a ``datetime``.

    The function is looked up by name when the Date cells are evaluated, so it must
    stay named ``Timestamp`` (presumably the cells contain ``Timestamp('...')`` items
    -- verify against the saved CSV).

    Raises
    ------
    ValueError
        If ``timestr`` does not match the expected format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.strptime(timestr, timestamp_format)
    return parsed
# NOTE(security): eval executes the text stored in the CSV. Never point this cell at a file
# from an untrusted source. The evaluated text can only see the Timestamp helper; builtins
# and the rest of the notebook namespace are not exposed.
df_links['Date'] = df_links['Date'].apply(lambda cell: eval(cell, {"__builtins__": {}}, {"Timestamp": Timestamp}))

In [104]:
# Convert column cells to list
for col in ['ATP', 'Location', 'Tournament', 'Series', 'Court', 'Surface', 'Round', 'Best of', 'WRank', 'LRank', 'WPts', 'LPts', 'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 'Wsets', 'Lsets', 'Comment', 'B365W', 'B365L', 'Year', 'Winner']:
    # literal_eval cannot read the bare token nan, so it is turned into None first.
    # The word boundaries keep text that merely contains "nan" (e.g. inside a name) intact.
    cells_as_text = df_links[col].str.replace(r'\bnan\b', 'None', regex=True)
    df_links[col] = cells_as_text.apply(ast.literal_eval)

In [105]:
# Visualize the links table after the list columns have been parsed
df_links

Unnamed: 0,pl1,pl2,ATP,Location,Tournament,Date,Series,Court,Surface,Round,...,L5,Wsets,Lsets,Comment,B365W,B365L,Year,Winner,year_selected,nb_matches
0,Acasuso J.,Andreev I.,[11],[Buenos Aires],[Copa Telmex],[2008-02-22 00:00:00],[International],[Outdoor],[Clay],[Quarterfinals],...,[None],[2.0],[1.0],[Completed],[1.61],[2.2],[2008],[Acasuso J.],2007,1
1,Acasuso J.,Bachinger M.,[58],[Stockholm],[Stockholm Open],[2008-10-07 00:00:00],[International],[Indoor],[Hard],[1st Round],...,[None],[2.0],[1.0],[Completed],[1.36],[3.0],[2008],[Acasuso J.],2007,1
2,Acasuso J.,Bellucci T.,[8],[Vina del Mar],[Movistar Open],[2009-02-06 00:00:00],[ATP250],[Outdoor],[Clay],[2nd Round],...,[None],[2.0],[0.0],[Completed],[2.37],[1.53],[2009],[Acasuso J.],2007,1
3,Acasuso J.,Berrer M.,[51],[New York],[US Open],[2008-08-25 00:00:00],[Grand Slam],[Outdoor],[Hard],[1st Round],...,[None],[3.0],[0.0],[Completed],[1.61],[2.2],[2008],[Acasuso J.],2007,1
4,Acasuso J.,Canas G.,"[14, 55]","[Acapulco, Metz]","[Abierto Mexicano, Open de Moselle]","[2008-02-27 00:00:00, 2008-09-29 00:00:00]","[International Gold, International]","[Outdoor, Indoor]","[Clay, Hard]","[2nd Round, 1st Round]",...,"[None, None]","[2.0, 2.0]","[1.0, 1.0]","[Completed, Completed]","[1.8, 2.0]","[1.9, 1.72]","[2008, 2008]","[Acasuso J., Acasuso J.]",2007,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11,Santoro F.,Seppi A.,[None],[None],[None],[None],[None],[None],[None],[None],...,[None],[None],[None],[None],[None],[None],[None],[None],2007,0
12,Sirianni J.,Stepanek R.,[None],[None],[None],[None],[None],[None],[None],[None],...,[None],[None],[None],[None],[None],[None],[None],[None],2007,0
13,Hartfield D.,Volandri F.,[None],[None],[None],[None],[None],[None],[None],[None],...,[None],[None],[None],[None],[None],[None],[None],[None],2007,0
14,Vanek J.,Youzhny M.,[None],[None],[None],[None],[None],[None],[None],[None],...,[None],[None],[None],[None],[None],[None],[None],[None],2007,0


In [106]:
# Put into dict format for network construction (one record per row of df_links)
dict_links = df_links.to_dict('records')

In [107]:
# Visualize three link records
dict_links[10:13]

[{'pl1': 'Acasuso J.',
  'pl2': 'Decoud S.',
  'ATP': [8, 8],
  'Location': ['Vina del Mar', 'Costa Do Sauipe'],
  'Tournament': ['Movistar Open', 'Brasil Open'],
  'Date': [datetime.datetime(2009, 2, 6, 0, 0),
   datetime.datetime(2008, 2, 12, 0, 0)],
  'Series': ['ATP250', 'International'],
  'Court': ['Outdoor', 'Outdoor'],
  'Surface': ['Clay', 'Clay'],
  'Round': ['Quarterfinals', '1st Round'],
  'Best of': [3, 3],
  'WRank': [46.0, 283.0],
  'LRank': [173.0, 50.0],
  'WPts': [1446.0, 134.0],
  'LPts': [472.0, 621.0],
  'W1': [6.0, 6.0],
  'L1': [4.0, 3.0],
  'W2': [6.0, 1.0],
  'L2': [2.0, 6.0],
  'W3': [None, 7.0],
  'L3': [None, 6.0],
  'W4': [None, None],
  'L4': [None, None],
  'W5': [None, None],
  'L5': [None, None],
  'Wsets': [2.0, 2.0],
  'Lsets': [0.0, 1.0],
  'Comment': ['Completed', 'Completed'],
  'B365W': [1.28, 6.5],
  'B365L': [3.5, 1.1],
  'Year': [2009, 2008],
  'Winner': ['Acasuso J.', 'Decoud S.'],
  'year_selected': 2007,
  'nb_matches': 2},
 {'pl1': 'Acasuso

## Construction of the final JSON file with nodes and links dictionaries

In [108]:
# Create final dictionary with nodes and links
# 'nodes' holds the per-player records, 'links' the per-pair (and per-year) records.
dict_final = {'nodes': dict_nodes, 'links': dict_links}

In [109]:
# Save dict_final into json file

def json_serial(obj):
    """Fallback serializer for values the json module cannot encode itself.

    Dates and datetimes are returned as ISO-8601 strings; any other type is
    rejected with a TypeError.
    """
    if not isinstance(obj, (date, datetime)):
        raise TypeError("Type %s not serializable" % type(obj))
    return obj.isoformat()

# json_serial is passed as `default` so that date/datetime values are written as ISO strings.
# NOTE(review): any float NaN left in the records would be written as a bare NaN token -- confirm
# the consumer of this file accepts that.
with open('data/tab1-network_data.json', 'w') as fp:
    json.dump(dict_final, fp, default=json_serial)