In [1]:
import numpy as np
import pandas as pd

In [2]:
data_types = {
    'ATP': int,
    'Location': object,
    'Tournament': object,
    'Series': object,
    'Court': object,
    'Surface': object,
    'Round': object,
    'Best of': int,
    'Winner': object,
    'Loser': object,
    'WRank': float,
    'LRank': float,
    'WPts': float,
    'LPts': float,
    'W1': float,
    'L1': float,
    'W2': float,
    'L2': float,
    'W3': float,
    'L3': float,
    'W4': float,
    'L4': float,
    'W5': float,
    'L5': float,
    'Wsets': float,
    'Lsets': float,
    'Comment': object,
    'B365W': float,
    'B365L': float,
    'EXW': object,
    'EXL': float,
    'LBW': float,
    'LBL': float,
    'PSW': float,
    'PSL': float,
    'SJW': float,
    'SJL': float,
    'MaxW': float,
    'MaxL': float,
    'AvgW': float,
    'AvgL': float,
    'WElo': float,
    'WSurfElo': float,
    'WHand': object,
    'WBHand': float,
    'LElo': float,
    'LSurfElo': float,
    'LHand': object,
    'LBHand': float
}
torneo = pd.read_csv('data/prova.csv', 
                            encoding='utf-8-sig', 
                            dtype=data_types,
                            parse_dates=['Date', 'WBD', 'LBD'])
torneo

Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,WElo,WSurfElo,WBD,WHand,WBHand,LElo,LSurfElo,LBD,LHand,LBHand
0,2,Doha,Qatar Exxon Mobil Open,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Herbert P.H.,...,,,1991-03-18,R,2.0,2066.2,2003.9,1993-09-03,R,1.0
1,2,Doha,Qatar Exxon Mobil Open,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Fucsovics M.,...,1747.4,1712.9,1992-02-08,R,2.0,1668.8,1639.6,1990-10-17,R,1.0
2,2,Doha,Qatar Exxon Mobil Open,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Djokovic N.,...,2201.3,2142.4,1987-05-22,R,2.0,1723.9,1682.0,1992-05-20,R,2.0
3,2,Doha,Qatar Exxon Mobil Open,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Cecchinato M.,...,1626.7,1496.9,1992-09-30,R,1.0,1537.7,1481.7,1986-01-06,R,1.0
4,2,Doha,Qatar Exxon Mobil Open,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Lajovic D.,...,1716.2,1649.5,1990-06-30,R,1.0,1782.0,1727.6,1988-06-29,L,2.0
5,2,Doha,Qatar Exxon Mobil Open,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Garcia-Lopez G.,...,,,1983-06-04,R,1.0,,,1995-04-14,R,2.0
6,2,Doha,Qatar Exxon Mobil Open,2019-01-01,ATP250,Outdoor,Hard,1st Round,3,Wawrinka S.,...,1920.7,1884.8,1985-03-28,R,1.0,1872.7,1817.9,1996-05-21,R,2.0


In [3]:
def simulation(atp = 1, 
               min_date=2011,
               max_date=2019,
               features_to_drop=[], 
               missing_values="drop", 
               drop_first=False):
    
    tournament = pd.read_csv("data/" + str(max_date+1) + ".csv", encoding='utf-8-sig', dtype=data_types)
    tournament = tournament[tournament['ATP'] == atp]
    # TODO filtrare per atp e round(primo)
    tournament = unify_data(tournament, features_to_drop, 'custom', drop_first)
    
    X, Y = preprocess_data(min_date, max_date, features_to_drop, missing_values, drop_first)
    
    # TODO allenare e simulare su tournament
    return tournament, X, Y
    

In [4]:
def preprocess_data(min_date=2011,
                    max_date=2019,
                    features_to_drop=[], 
                    missing_values="drop", 
                    drop_first=False):
    """
    Processes raw data and returns a tuple (X, Y) where X is the cleaned dataset and Y is the array of labels.
    """
    # Loads data for the given years
    if max_date > 2019 or min_date < 2011:
        raise ValueError("Wrong date parameter")
        
    df = pd.read_csv("data/" + str(min_date) + ".csv", encoding='utf-8-sig', dtype=data_types)
    for year in range (min_date + 1, max_date + 1):
        filename = "data/" + str(year) + ".csv"
        df = pd.concat((df, pd.read_csv(filename, encoding='utf-8-sig', dtype=data_types)))
        
    X = unify_data(df, features_to_drop, missing_values, drop_first)
        
    # Generate labels
    Y = np.concatenate([np.ones(X.shape[0], dtype=int), np.zeros(X.shape[0], dtype=int)])
    
    # Duplicate data with swapped columns
    tmp = X.copy()
    cols_to_swap = ['P1Rank', 'P2Rank', 'MaxP1', 'MaxP2',  'AvgP1',  'AvgP2', 'P1Pts', 'P2Pts',
                    'P1Elo', 'P2Elo', 'P1SurfElo', 'P2SurfElo', 'P1Hand', 'P2Hand', 'P1BHand', 'P2BHand']
    cols_to_swap = [f for f in cols_to_swap if f not in features_to_drop]
    cols_swapped = ['P2Rank', 'P1Rank', 'MaxP2', 'MaxP1',  'AvgP2',  'AvgP1', 'P2Pts', 'P1Pts',
                    'P2Elo', 'P1Elo', 'P2SurfElo', 'P1SurfElo', 'P2Hand', 'P1Hand', 'P2BHand', 'P1BHand']
    cols_swapped = [f for f in cols_swapped if f not in features_to_drop]
    
    tmp[cols_to_swap] = tmp[cols_swapped]
    tmp.index = np.array(range(X.shape[0] + 1, X.shape[0] * 2 + 1))
    X = pd.concat((X, tmp))
    
    return X, Y

In [5]:
def unify_data(df,
               features_to_drop=[], 
               missing_values="drop", 
               drop_first=False):
    
    # Sort by date to calculate ELO
    X = df.sort_values(by='Date')
    
    # Drop unuseful columns
    features_to_drop += ['ATP', 'Location', 'Tournament', 'Date', 'Comment', 
                         'Winner', 'Loser', 'Wsets', 'Lsets', 
                         'W1', 'L1', 'W2', 'L2', 'W3', 'L3', 'W4', 'L4', 'W5', 'L5', 
                         'B365W', 'B365L', 'EXW', 'EXL', 'LBW', 'LBL', 'PSW', 'PSL', 'SJW', 'SJL',
                         'WBD', 'LBD']
    X = X.drop(columns=features_to_drop)
    
    # Deal with missing values
    X['WRank'] = X['WRank'].fillna(value=X['WRank'].max()+100).astype(int)
    X['LRank'] = X['LRank'].fillna(value=X['LRank'].max()+100).astype(int)

    if missing_values == 'drop':
        X = X.dropna()
    elif missing_values == 'custom':
        pass
    else:
        raise ValueError('Wrong parameter: missing_values')

    # Convert ordinal features to int (higher value means more important)
    series = ['ATP250', 'ATP500', 'Masters 1000', 'Masters Cup', 'Grand Slam']
    series2int = {s: i for i, s in enumerate(series)}
    rounds2int = {'1st Round': 0,
                  '2nd Round': 1,
                  '3rd Round': 2,
                  '4th Round': 3,
                  'Round Robin': 4,
                  'Quarterfinals': 5,
                  'Semifinals': 6,
                  'The Final': 7,
                 }
    X = X.replace({'Round': rounds2int, 'Series': series2int})
    
    # Convert categorical (binary) fields to int
    X = X.replace({'Court': {'Outdoor': 0, 'Indoor': 1}, 
                   'WHand': {'R': 0, 'L': 1}, 
                   'LHand': {'R': 0, 'L': 1}})
    X = X.astype({'Series' : int, 'Court' : int, 'Round' : int, 'WBHand': int, 'LBHand': int})
    
    # One hot encode categorical features into binary features
    X = pd.get_dummies(X, prefix=['Surface_'], columns=['Surface'], drop_first=drop_first)
    
    # Rename columns
    X = X.rename(columns={'WRank':'P1Rank', 'LRank':'P2Rank', 
                          'MaxW':'MaxP1', 'MaxL':'MaxP2', 
                          'AvgW':'AvgP1', 'AvgL':'AvgP2',
                          'WPts':'P1Pts', 'LPts':'P2Pts',
                          'WElo':'P1Elo', 'LElo':'P2Elo',
                          'WSurfElo':'P1SurfElo', 'LSurfElo':'P2SurfElo',
                          'WHand':'P1Hand', 'LHand':'P2Hand',
                          'WBHand':'P1BHand', 'LBHand':'P2BHand'})
    
    # Generate new columns
    X['GreaterRank'] = (X['P1Rank'] < X['P2Rank']).astype(int)
    
    return X

In [6]:
T, X, Y = simulation(atp = 2, min_date=2014, max_date=2018)

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
T