In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVC, SVR

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Read data

In [147]:
with open('data/consecutive_games_df.pkl', 'rb') as handle:
    full_df = pickle.load(handle)

In [148]:
MODEL = LinearRegression
CLF_MODEL = LogisticRegression

# Propensity and IPW
We'll use Logistic Regression

In [149]:
full_df['time_between_games'] = (full_df['next_date'] - full_df['prev_date']).apply(lambda x: x.days)

In [150]:
full_df['lost_prev_game'] = full_df['prev_goal_diff'] < 0

In [289]:
drop_cols = ['row_team', 'league_id', 'prev_match_api_id', 'prev_date', 'prev_rival_team_api_id', 'next_match_api_id', 
             'next_date', 'next_rival_team_api_id', 'next_rival_team_goal', 'prev_goal_diff',
             'next_row_team_goal', 'prev_row_team_goal', 'prev_rival_team_goal']
bet_cols = [col for col in full_df.columns if '_bet_' in col]
T = 'lost_prev_game'
Y = 'next_mean_bet_row_team'
# T = 'prev_home'
# Y = 'prev_goal_diff'
CATEGORICAL_COLS = ['season']
X_COLS = [col for col in full_df.columns if col not in drop_cols + bet_cols + [Y, T] + CATEGORICAL_COLS]

In [290]:
X_COLS

['row_team_wins',
 'row_team_draws',
 'row_team_league_points',
 'row_team_season_goals',
 'prev_home',
 'prev_row_team_min_squad_rating',
 'prev_rival_team_min_squad_rating',
 'prev_row_team_max_squad_rating',
 'prev_rival_team_max_squad_rating',
 'prev_row_team_mean_squad_rating',
 'prev_rival_team_mean_squad_rating',
 'prev_row_team_std_squad_rating',
 'prev_rival_team_std_squad_rating',
 'prev_row_team_median_squad_rating',
 'prev_rival_team_median_squad_rating',
 'prev_rival_wins',
 'prev_rival_draws',
 'prev_rival_league_points',
 'prev_rival_season_goals',
 'next_home',
 'next_goal_diff',
 'next_row_team_min_squad_rating',
 'next_rival_team_min_squad_rating',
 'next_row_team_max_squad_rating',
 'next_rival_team_max_squad_rating',
 'next_row_team_mean_squad_rating',
 'next_rival_team_mean_squad_rating',
 'next_row_team_std_squad_rating',
 'next_rival_team_std_squad_rating',
 'next_row_team_median_squad_rating',
 'next_rival_team_median_squad_rating',
 'next_rival_wins',
 'next_ri

In [291]:
df = full_df.drop(columns=drop_cols)

In [292]:
def prepare_data(data):
    df_categorical = data[CATEGORICAL_COLS]
    dummies = pd.get_dummies(df_categorical)
    data[dummies.columns] = dummies
    return data.drop(columns=df_categorical.columns)

In [293]:
def propensity(data, model=CLF_MODEL):
    # Preprocess
    # Use standard scaling
    # Apply Logistic Regression on X, predict t
    # Return model's probabilities
    data = data.copy()
    data = prepare_data(data)
    
    X = data[X_COLS]
    t = data[T]
    
    steps = [('scaler', StandardScaler()),
             ('clf', model(random_state=0))]
    pipe_lr = Pipeline(steps)
    clf = pipe_lr.fit(X, t)
    
    return clf.predict_proba(X)[:,1]

### Propensity

In [294]:
prop = propensity(df)
print(f'Data Propensity: \n{prop}')

Data Propensity: 
[0.17743464 0.56997592 0.24451984 ... 0.2134336  0.22166519 0.50090744]


In [80]:
# pd.DataFrame([prop1, prop2], index=['data1', 'data2']).to_csv('models_propensity.csv', header=False)

### IPW

In [295]:
def att(data, prop):
    # Calculate IPW formula (tutorial 8)
    left = ((data[T] * data[Y]).sum() / data[T].sum())
    up = (((1 - data[T]) * data[Y]) * (prop / (1 - prop))).sum()
    down = ((1 - data[T]) * (prop / (1 - prop))).sum()
    return left - (up / down)

In [296]:
def ipw(data, model=CLF_MODEL):
    prop = propensity(data, model)
    att_hat_ipw = att(data, prop)
    return att_hat_ipw

In [297]:
att_hat_ipw = ipw(df)
print(f'{att_hat_ipw = }')

att_hat_ipw = 0.11936874037251632


# S-Learner

In [298]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PolynomialFeatures

class InteractionsNoT(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.poly = PolynomialFeatures(interaction_only=True,include_bias = False)
        
    def fit(self, X, y):
        dummies_columns = CATEGORICAL_COLS
        self.poly.fit(X.drop(columns=[T] + dummies_columns), y)  # Perform interactions on X without T
        return self
        
    def transform(self, X):
        X = X.copy()
        dummies_columns = CATEGORICAL_COLS
        dummies_values = X[dummies_columns]
        t = X[T]
        X = self.poly.transform(X.drop(columns=[T] + dummies_columns))
        X = np.column_stack((X, dummies_values, t))
        return X

In [299]:
def s_learner_model(data, model=MODEL):
    # Preprocess
    # Use standard scaling
    # Apply Linear Regression on X with t, predict y
    # Return model
    data = data.copy()
    data = prepare_data(data)
    X = data.drop(columns=[Y])
    y = data[Y]

    steps = [('scaler', StandardScaler()),
             ('clf', model())]
    pipe_lr = Pipeline(steps)
    clf = pipe_lr.fit(X, y)
    
    return clf

In [300]:
def s_learner_model_with_interactions(data, model=MODEL):
    # Preprocess
    # Use standard scaling
    # Apply Linear Regression on X (+interactions) with t, predict y
    # Return model
    data = data.copy()
    data = prepare_data(data)
    X = data.drop(columns=[Y])
    y = data[Y]

    steps = [('interactions', InteractionsNoT()),
             ('scaler', StandardScaler()),
             ('clf', model())]
    pipe_lr = Pipeline(steps)
    clf = pipe_lr.fit(X, y)
    
    return clf

In [301]:
def calc_att_s_learner(data, clf):
    # Use S-learner (tutorial 8)
    data = data.copy()
    data = prepare_data(data).drop(columns=[Y])
    f1 = data[data[T]==1]
    f0 = data[data[T]==1].copy()
    f0[T] = 0
    
    return (clf.predict(f1) - clf.predict(f0)).mean()

In [302]:
def s_learner(data, use_interactions=False, model=MODEL):
    # Call S-learner with or without interactions
    clf = s_learner_model(data, model=model) if not use_interactions else s_learner_model_with_interactions(data, model=model)
    return calc_att_s_learner(data, clf)

### No interactions:

In [303]:
att_hat_s_learner = s_learner(df)
print(f'{att_hat_s_learner = }')

att_hat_s_learner = 0.025316439531624615


### With interactions:

In [48]:
att_hat_s_learner_inters = s_learner(df, True)
print(f'{att_hat_s_learner_inters = }')

KeyError: "['season'] not found in axis"

# T-Learner

In [304]:
def t_learner_model(data, t, model=MODEL):
    # Preprocess and take only data with treatment t
    # Use standard scaling
    # Apply Linear Regression on X with t, predict y  # TODO: with T or without T in moedel?
    # Return model
    data = data.copy()
    data = prepare_data(data)
    data = data[data[T] == t]
    X = data.drop(columns=[Y, T])
    y = data[Y]

    steps = [('scaler', StandardScaler()),
             ('clf', model())]
    pipe_lr = Pipeline(steps)
    clf = pipe_lr.fit(X, y)
    
    return clf

In [305]:
def calc_att_t_learner(data, clf0, clf1):
    # Use T-learner (tutorial 8)
    data = data.copy()
    data = prepare_data(data)
    d_treated = data[data[T] == 1].drop(columns=[Y, T])
    
    return (clf1.predict(d_treated) - clf0.predict(d_treated)).mean()

In [306]:
def t_learner(data, model=MODEL):
    # Call S-learner with or without interactions
    clf0 = t_learner_model(data, 0, model=model)
    clf1 = t_learner_model(data, 1, model=model)
    return calc_att_t_learner(data, clf0, clf1)

In [307]:
att_hat_t_learner = t_learner(df)
print(f'{att_hat_t_learner = }')

att_hat_t_learner = 0.03292420809889714


# Matching

In [119]:
from scipy.spatial.distance import cdist

In [308]:
def matching_1NN(data):
    # Preprocess
    # Prepare data pairs
    # Calculate ITEs (lesson 3)
    # Calculate Average
    data = prepare_data(data)
    
    data_T0 = data[data[T] == 0]
    data_T0 = data_T0.drop(columns=[T])
    data_T1 = data[data[T] == 1]
    data_T1 = data_T1.drop(columns=[T])
    
    ITEs = []
    dists = cdist(np.array(data_T1.drop(columns=[Y])).astype(float), np.array(data_T0.drop(columns=[Y])).astype(float))
    for i, j in enumerate(dists.argmin(axis=1)):
        ITE_i = data_T1[Y].iloc[i] - data_T0[Y].iloc[j]
        ITEs.append(ITE_i)
    
    return np.mean(ITEs)

In [70]:
att_hat_matching_1NN = matching_1NN(df)
print(f'{att_hat_matching_1NN = }')

TypeError: numpy boolean subtract, the `-` operator, is not supported, use the bitwise_xor, the `^` operator, or the logical_xor function instead.

In [309]:
def matching_general(data, k=10):
    # Preprocess
    # Prepare data near groups
    # Calculate ITEs (lesson 3)
    # Calculate Average
    data = prepare_data(data)
    
    data_T0 = data[data[T] == 0]
    data_T0 = data_T0.drop(columns=[T])
    data_T1 = data[data[T] == 1]
    data_T1 = data_T1.drop(columns=[T])
    
    ITEs = []
    i_dict = {}  # J(i)
    dists = cdist(np.array(data_T1.drop(columns=[Y])).astype(float), np.array(data_T0.drop(columns=[Y])).astype(float))
    k_nearest = np.argpartition(dists, k)[:,:k]
    for i, close_inds in enumerate(k_nearest):
        ITE_i = data_T1[Y].iloc[i] - data_T0[Y].iloc[close_inds].mean()
        ITEs.append(ITE_i)
    
    return np.mean(ITEs)

In [310]:
att_hat_matching_general = matching_general(df)
print(f'{att_hat_matching_general = }')

att_hat_matching_general = -0.03810090104517261


# Competition

In [100]:
scores1, scores2 = [], []

for clf_model, lin_model in zip([LogisticRegression, RandomForestClassifier, DecisionTreeClassifier], 
                                [LinearRegression, RandomForestRegressor, DecisionTreeRegressor]):
    print(f'Models: \n{clf_model, lin_model}')
    print('ipw')
    ipw1 = ipw(df, model=clf_model)
    
    print('s')
    s1 = s_learner(df, model=lin_model)
    
    print('t')
    t1 = t_learner(df, model=lin_model)
    
    scores1 += [ipw1, s1, t1]

scores1 += [att_hat_matching_1NN, att_hat_matching_general]

att1_hat_comp = np.nanmean(scores1)

Models: 
(<class 'sklearn.linear_model._logistic.LogisticRegression'>, <class 'sklearn.linear_model._base.LinearRegression'>)
ipw
s
t
Models: 
(<class 'sklearn.ensemble._forest.RandomForestClassifier'>, <class 'sklearn.ensemble._forest.RandomForestRegressor'>)
ipw


  up = (((1 - data[T]) * data[Y]) * (prop / (1 - prop))).sum()
  down = ((1 - data[T]) * (prop / (1 - prop))).sum()


s
t
Models: 
(<class 'sklearn.tree._classes.DecisionTreeClassifier'>, <class 'sklearn.tree._classes.DecisionTreeRegressor'>)
ipw
s
t


  up = (((1 - data[T]) * data[Y]) * (prop / (1 - prop))).sum()
  down = ((1 - data[T]) * (prop / (1 - prop))).sum()


In [101]:
att1_hat_comp

-0.021499890800913697

## Save all

In [28]:
final = pd.DataFrame([
    [att1_hat_ipw, att2_hat_ipw], 
    [att1_hat_s_learner , att2_hat_s_learner], 
    [att1_hat_t_learner , att2_hat_t_learner], 
    [att1_hat_matching_general, att2_hat_matching_general],
    [att1_hat_comp, att2_hat_comp]
], 
columns = ['data1', 'data2'])
final['Type'] = np.arange(1, len(final) + 1)
final[['Type', 'data1', 'data2']].to_csv('ATT_results.csv', index=None)

In [29]:
from zipfile import ZipFile

In [30]:
# create a ZipFile object
zipObj = ZipFile('205460686_316361641.zip', 'w')

# Add multiple files to the zip
zipObj.write('models_propensity.csv')
zipObj.write('ATT_results.csv')

# close the Zip File
zipObj.close()