In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
import numpy as np
from sklearn.decomposition import PCA

from tqdm.notebook import tqdm
from sklearn.metrics import r2_score

from os import listdir
from re import findall

In [2]:
# Load the train and test tables (pandas reads the CSV straight from each zip archive).
train, test = (pd.read_csv(archive) for archive in ("train.zip", "test.zip"))
# Snapshot of the test row ids taken before any processing; check_test() compares against it.
test_init = test.loc[:, ['row_id']].copy()

def check_test(test):
    """Sanity-check a frame against the `test_init` snapshot of test row ids.

    Returns True only when both hold:
      * merging the frame's `row_id` column with `test_init` on `row_id`
        yields as many rows as `test_init` has;
      * the frame itself has as many rows as `test_init`.
    """
    expected_rows = len(test_init)
    matched_rows = len(test[['row_id']].merge(test_init, on='row_id'))
    return matched_rows == expected_rows and len(test) == expected_rows

# Numeric value assigned to each label of the `winner` column (looked up in preprocess_df).
winner_dict = dict(winner=3, draw=1, loser=0)

def get_dict(df, team, ren_team):
    """Build a rename mapping for the columns of `df` whose name contains `team`.

    Each matching column name is mapped to the same name with every occurrence
    of `team` replaced by `ren_team`; non-matching columns are left out.
    """
    return {col: col.replace(team, ren_team) for col in df.columns if team in col}

def preprocess_df(df):
    """Add engineered columns to a match frame and fill numeric gaps.

    Steps, in order:
      * ``points``: the ``winner`` label looked up in ``winner_dict``.
      * ``team``: missing values replaced by ``"team1"``; ``is_home`` is 1 when
        the filled value equals ``"team1"``, else 0.
      * ``player_bmi``: weight divided by squared height (see note in the body).
      * ``changed_position``: 1 when ``player_position_1`` differs from
        ``player_position_2``, else 0.
      * remaining NaNs in numeric columns are replaced by the column median.
      * for every column whose name contains ``team1`` (except ``system_id``
        columns) a ``diff`` column holding ``team1 - team2`` is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``winner``, ``team``, ``player_weight``,
        ``player_position_1``, ``player_position_2`` and, for every selected
        ``team1`` column, the matching ``team2`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If a ``winner`` label is not a key of ``winner_dict`` or a required
        column is missing.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    df['points'] = df['winner'].apply(lambda x: winner_dict[x]).astype('uint8')
    df['team'] = df['team'].fillna("team1")
    df['is_home'] = (df['team'] == 'team1').astype('uint8')

    # BMI is weight / height**2. The previous code divided the weight by its own
    # square, which is just 1 / weight.
    # NOTE(review): assumes the height column is called `player_height` - confirm
    # against the data; units are whatever the raw columns use.
    if 'player_height' in df.columns:
        df['player_bmi'] = df['player_weight'] / (df['player_height'] ** 2)
    else:
        # No height column available: keep the legacy value so the column still exists.
        df['player_bmi'] = df['player_weight'] / (df['player_weight'] ** 2)

    df['changed_position'] = (df['player_position_1'] != df['player_position_2']).astype('uint8')

    # numeric_only=True: the frame holds string columns (e.g. `winner`, `team`);
    # without it pandas >= 2.0 raises instead of skipping them.
    df_fillna = df.median(numeric_only = True)
    df = df.fillna(df_fillna)

    # The column list is materialised before the loop, so the freshly added
    # `diff` columns are never revisited.
    for c in [x for x in df.columns if 'team1' in x and 'system_id' not in x]:
        df[c.replace("team1", "diff")] = df[c] - df[c.replace('team1', 'team2')]

    return df

# Stack train (minus the `rating_num` column) on top of test so both receive
# identical preprocessing. DataFrame.append was removed in pandas 2.0;
# pd.concat with ignore_index=True produces the same stacked frame.
full_df = pd.concat([train.drop(columns = ['rating_num']), test], ignore_index = True)

full_df = preprocess_df(full_df)
# Last expression of the cell: displays whether `test` still matches the original ids.
check_test(test)

True

In [3]:
# Columns that are missing in every row, plus one column excluded by hand.
bad_features = full_df.isna().mean() == 1
bad_features = bad_features[bad_features].index.tolist() + ['player_other_ratio_var_7']

# Numeric columns whose |std / median| is below 0.1 are treated as near-constant.
# numeric_only=True restricts both reductions to numeric columns; the frame still
# holds string columns here and pandas >= 2.0 raises on them otherwise.
permanent_features = (full_df.std(numeric_only = True) / full_df.median(numeric_only = True)).abs() < 0.1
permanent_features = permanent_features[permanent_features].index.tolist()

# Rebind instead of dropping in place so the step reads as an explicit transformation.
full_df = full_df.drop(columns = bad_features + permanent_features)

In [4]:
# Attach the engineered columns to the train ids/target and to the test ids.
# The join key is spelled out so the merge cannot silently pick up other shared columns.
train = train[['row_id', 'rating_num']].merge(full_df, on = 'row_id')
test = test[['row_id']].merge(full_df, on = 'row_id')

# Fail loudly if the test frame no longer lines up with the original ids
# (the bare call that stood here before discarded its result).
assert check_test(test), "test rows no longer match the original test ids"

X_train = train.drop(columns = ['rating_num', 'row_id', 'winner', 'team', 'player_position_2']).copy()
y_train = train['rating_num'].copy()

X_test = test.drop(columns = ['row_id', 'winner', 'team', 'player_position_2']).copy()

# Impute both matrices with medians computed on the training rows only.
# numeric_only=True keeps the reduction working on pandas >= 2.0 should any
# non-numeric column remain among the features.
X_fillna = X_train.median(numeric_only = True)
X_train = X_train.fillna(X_fillna)
X_test = X_test.fillna(X_fillna)

X_train.shape, X_test.shape

((20453, 918), (8774, 918))

In [None]:
# Columns handed to CatBoost as categorical features.
cat_features = [
    'scout_id',
    'competitionId',
    'player_position_1',
    'team1_system_id',
    'team2_system_id',
    'is_home',
]

# CatBoost regressor on the first feature set: 20000 estimators, fixed seed.
model = CatBoostRegressor(
    n_estimators=20000,
    random_state=42,
    cat_features=cat_features,
)
model.fit(X_train, y_train)

In [None]:
# .copy() gives an independent frame; adding a column to the bare
# `test[['row_id']]` selection can trigger pandas' SettingWithCopyWarning.
subm = test[['row_id']].copy()
subm['rating_num'] = model.predict(X_test)

# Cap predictions at 10, then min-max rescale them onto the 0..10 range.
subm['rating_num'] = subm['rating_num'].clip(upper = 10)
rating_min, rating_max = subm['rating_num'].min(), subm['rating_num'].max()
subm['rating_num'] = (10 * (subm['rating_num'] - rating_min) / (rating_max - rating_min)).round(3)

# Only write the file when the submission still covers the original test ids.
if check_test(subm):
    subm.to_csv("submission_test_36.csv", index = False)

In [5]:
# For each entity, add one `<name>_<attr>_raw_total` column: the row-wise sum of
# that entity's `raw` columns whose name contains the attribute token.
for name in ['player', 'team1', 'team2']:
    cur_feats = [x for x in full_df.columns if name in x and 'raw' in x]
    # sorted(): iterating a bare set of strings yields an order that can change
    # between interpreter runs (string hash randomization), which made the order
    # of the generated columns non-reproducible.
    for attr in sorted({x.split("_")[1] for x in cur_feats}):
        # NOTE(review): this is a substring match, so an attribute token that is
        # contained in another column's name pulls that column in too - confirm intended.
        cur_attr_feats = [x for x in cur_feats if attr in x]
        full_df[f"{name}_{attr}_raw_total"] = full_df[cur_attr_feats].sum(axis = 1)

In [6]:
def pca_features(feats, name):
    """Scale a group of columns of the global ``full_df`` and add PCA summary columns.

    Operates on (and rebinds) the module-level ``full_df``:
      1. every column in ``feats`` is overwritten with its StandardScaler output;
      2. columns of ``feats`` whose median is NaN are dropped from ``full_df``;
      3. NaNs in the remaining ``feats`` columns are filled with the column medians;
      4. a PCA is fitted on the surviving columns, and the leading components
         whose cumulative explained-variance ratio is <= 0.7 are stored as
         ``<name>00``, ``<name>01``, ...

    Parameters
    ----------
    feats : list of str
        Names of the ``full_df`` columns that form the group.
    name : str
        Prefix for the generated component columns.

    Returns
    -------
    None
    """
    global full_df

    # Scale one column at a time; each is handed to the scaler as an (n, 1) array.
    for col in feats:
        column_2d = full_df[col].values.reshape(-1, 1)
        full_df[col] = StandardScaler().fit_transform(column_2d)

    medians = full_df[feats].median()
    # A NaN median marks a column that cannot be imputed; remove it entirely.
    empty_cols = medians[medians.isna()].index.tolist()
    full_df.drop(columns=empty_cols, inplace=True)
    kept_feats = [col for col in feats if col not in empty_cols]
    full_df = full_df.fillna(medians)

    pca = PCA().fit(full_df[kept_feats])
    cumulative_ratio = pca.explained_variance_ratio_.cumsum()
    n_components = int((cumulative_ratio <= 0.7).sum())
    components = pca.transform(full_df[kept_feats])[:, :n_components]
    for idx in range(n_components):
        full_df[name + str(idx).zfill(2)] = components[:, idx]

# (column filter, prefix of the generated columns) for each feature group, in run order.
pca_groups = [
    (lambda col: 'player' in col and 'position_' not in col, 'player_feature_'),
    (lambda col: 'diff' in col, 'diff_feature_'),
    (lambda col: 'team1' in col and 'system_id' not in col, 'team1_feature_'),
    (lambda col: 'team2' in col and 'system_id' not in col, 'team2_feature_'),
]
for keep_col, prefix in pca_groups:
    # The column list is rebuilt on every pass because pca_features rebinds and extends full_df.
    pca_features([col for col in full_df.columns if keep_col(col)], prefix)

full_df.shape

(29227, 1033)

In [7]:
# Rebuild the train/test frames from the current full_df.
# The join key is spelled out so the merge cannot silently pick up other shared columns.
train = train[['row_id', 'rating_num']].merge(full_df, on = 'row_id')
test = test[['row_id']].merge(full_df, on = 'row_id')

X_train = train.drop(columns = ['rating_num', 'row_id', 'winner', 'team', 'player_position_2']).copy()
y_train = train['rating_num'].copy()

X_test = test.drop(columns = ['row_id', 'winner', 'team', 'player_position_2']).copy()

# Impute both matrices with medians computed on the training rows only.
# numeric_only=True keeps the reduction working on pandas >= 2.0 should any
# non-numeric column remain among the features.
X_fillna = X_train.median(numeric_only = True)
X_train = X_train.fillna(X_fillna)
X_test = X_test.fillna(X_fillna)

X_train.shape, X_test.shape

((20453, 1029), (8774, 1029))

In [None]:
# Columns handed to CatBoost as categorical features.
cat_features = [
    'scout_id',
    'competitionId',
    'player_position_1',
    'team1_system_id',
    'team2_system_id',
    'is_home',
]

# Same CatBoost configuration as before, refitted on the extended feature set.
model = CatBoostRegressor(
    n_estimators=20000,
    random_state=42,
    cat_features=cat_features,
)
model.fit(X_train, y_train)

In [None]:
# .copy() gives an independent frame; adding a column to the bare
# `test[['row_id']]` selection can trigger pandas' SettingWithCopyWarning.
subm = test[['row_id']].copy()
subm['rating_num'] = model.predict(X_test)

# Cap predictions at 10, then min-max rescale them onto the 0..10 range.
subm['rating_num'] = subm['rating_num'].clip(upper = 10)
rating_min, rating_max = subm['rating_num'].min(), subm['rating_num'].max()
subm['rating_num'] = (10 * (subm['rating_num'] - rating_min) / (rating_max - rating_min)).round(3)

# Only write the file when the submission still covers the original test ids.
if check_test(subm):
    subm.to_csv("submission_test_40.csv", index = False)

In [None]:
# Columns handed to CatBoost as categorical features.
cat_features = [
    'scout_id',
    'competitionId',
    'player_position_1',
    'team1_system_id',
    'team2_system_id',
    'is_home',
]

# Variant of the CatBoost model with tree depth limited to 4.
model_nd = CatBoostRegressor(
    n_estimators=20000,
    random_state=42,
    max_depth=4,
    cat_features=cat_features,
)
model_nd.fit(X_train, y_train)

In [None]:
# .copy() gives an independent frame; adding a column to the bare
# `test[['row_id']]` selection can trigger pandas' SettingWithCopyWarning.
subm = test[['row_id']].copy()
subm['rating_num'] = model_nd.predict(X_test)

# Cap predictions at 10, then min-max rescale them onto the 0..10 range.
subm['rating_num'] = subm['rating_num'].clip(upper = 10)
rating_min, rating_max = subm['rating_num'].min(), subm['rating_num'].max()
subm['rating_num'] = (10 * (subm['rating_num'] - rating_min) / (rating_max - rating_min)).round(3)

# Only write the file when the submission still covers the original test ids.
if check_test(subm):
    subm.to_csv("submission_test_51.csv", index = False)

In [None]:
# Support-vector regressor capped at 1000 solver iterations.
# NOTE(review): `degree` only affects the polynomial kernel; no kernel is passed
# here, so with scikit-learn's default kernel it has no effect - confirm intent.
svr = SVR(max_iter = 1000, C = 5, degree = 4).fit(X_train, y_train)

# .copy() gives an independent frame; adding a column to the bare
# `test[['row_id']]` selection can trigger pandas' SettingWithCopyWarning.
subm = test[['row_id']].copy()
subm['rating_num'] = svr.predict(X_test)

# Clamp predictions into [0, 10], then min-max rescale them onto the 0..10 range.
subm['rating_num'] = subm['rating_num'].clip(lower = 0, upper = 10)
rating_min, rating_max = subm['rating_num'].min(), subm['rating_num'].max()
subm['rating_num'] = (10 * (subm['rating_num'] - rating_min) / (rating_max - rating_min)).round(3)

# Only write the file when the submission still covers the original test ids.
if check_test(subm):
    subm.to_csv("submission_test_60.csv", index = False)

In [None]:
# Bayesian ridge regression on the same feature matrix.
# NOTE(review): `n_iter` and `normalize` are not accepted by recent scikit-learn
# releases - confirm the pinned version before upgrading; left unchanged here
# because replacing `normalize=True` would alter the fitted model.
bayes = BayesianRidge(n_iter=1000, normalize=True)
bayes.fit(X_train, y_train)

# .copy() gives an independent frame; adding a column to the bare
# `test[['row_id']]` selection can trigger pandas' SettingWithCopyWarning.
subm = test[['row_id']].copy()
subm['rating_num'] = bayes.predict(X_test)

# Clamp predictions into [0, 10], then min-max rescale them onto the 0..10 range.
subm['rating_num'] = subm['rating_num'].clip(lower = 0, upper = 10)
rating_min, rating_max = subm['rating_num'].min(), subm['rating_num'].max()
subm['rating_num'] = (10 * (subm['rating_num'] - rating_min) / (rating_max - rating_min)).round(3)

# Only write the file when the submission still covers the original test ids.
if check_test(subm):
    subm.to_csv("submission_test_52.csv", index = False)

In [None]:
# Weighted blend of the five saved submissions (weights 0.45 / 0.31 / 0.08 / 0.08 / 0.08).
# Each file is indexed by row_id and its rating column is pre-multiplied by its weight.
subm1 = pd.read_csv("submission_test_36.csv").set_index('row_id').rename(columns = {'rating_num' : 'rating1'}) * 0.45
subm2 = pd.read_csv("submission_test_40.csv").set_index('row_id').rename(columns = {'rating_num' : 'rating2'}) * 0.31
subm3 = pd.read_csv("submission_test_51.csv").set_index('row_id').rename(columns = {'rating_num' : 'rating3'}) * 0.08
subm4 = pd.read_csv("submission_test_52.csv").set_index('row_id').rename(columns = {'rating_num' : 'rating4'}) * 0.08
subm5 = pd.read_csv("submission_test_60.csv").set_index('row_id').rename(columns = {'rating_num' : 'rating5'}) * 0.08

subm = subm1.join(subm2).join(subm3).join(subm4).join(subm5)
# Per-row sum of the weighted columns. The earlier `.sum(axis = 1).max()` reduced
# this to a single scalar, so every row received the same value and the rescale
# below evaluated 0 / 0 (NaN) for the whole submission.
subm['rating_num'] = subm.sum(axis = 1)
subm = subm.reset_index()[['row_id', 'rating_num']]

# Min-max rescale the blended rating onto the 0..10 range.
rating_min, rating_max = subm['rating_num'].min(), subm['rating_num'].max()
subm['rating_num'] = (10 * (subm['rating_num'] - rating_min) / (rating_max - rating_min)).round(3)

# Only write the file when the submission still covers the original test ids.
if check_test(subm):
    subm.to_csv("submission_test.csv", index = False)