In [1]:
# Add a check if we run in google colab or locally in jupyter notebook
run_in_colab = False
if 'google.colab' in str(get_ipython()):
    run_in_colab = True
    print('Running on Colab')
else:
    print('Running locally on Jupyter')

Running on Colab


In [2]:
# Mount drive in google colab
if run_in_colab:
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
if run_in_colab:
    from google.colab import files
    uploaded = files.upload()

Saving train.csv to train.csv


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer
from sklearn.metrics import r2_score
import numpy as np 
import pandas as pd
import sys
import io

In [5]:
df_train = pd.read_csv(io.BytesIO(uploaded['train.csv']),  encoding = 'iso-8859-8')

In [6]:
player_cols = [col for col in df_train.columns if 'player' in col]
team1_cols = [col for col in df_train.columns if 'team1' in col]
team2_cols = [col for col in df_train.columns if 'team2' in col]
my_team_cols = [sub.replace('team1', 'my') for sub in team1_cols]
op_team_cols = [sub.replace('team1', 'op') for sub in team1_cols]
df_train[my_team_cols] = 0
df_train[op_team_cols] = 0
for i in range(len(team1_cols)):
    df_train.loc[df_train.team == 'team1', my_team_cols[i]] = df_train.loc[df_train.team == 'team1', team1_cols[i]]
    df_train.loc[df_train.team == 'team1', op_team_cols[i]] = df_train.loc[df_train.team == 'team1', team2_cols[i]]
    df_train.loc[df_train.team == 'team2', my_team_cols[i]] = df_train.loc[df_train.team == 'team2', team2_cols[i]]
    df_train.loc[df_train.team == 'team2', op_team_cols[i]] = df_train.loc[df_train.team == 'team2', team1_cols[i]]

  self[col] = value


In [7]:
df_train = df_train.drop(team1_cols,axis=1)
df_train = df_train.drop(team2_cols,axis=1)

In [8]:
df_train = df_train.dropna(subset=["player_position_1","player_position_2"])
col_ind = []
missing_p = [0] * df_train.shape[1]
for i in range(df_train.shape[1]):
    # count number of rows with missing values
    n_miss = df_train.iloc[:, [i]].isnull().sum()
    perc = n_miss / df_train.shape[0] * 100
    missing_p[i] = perc
    if perc[0] >= 80:
        col_ind.append(i)
df_train.drop(df_train.columns[col_ind], axis=1, inplace=True)
df_train = df_train.drop('team',axis=1)

In [9]:
gmean = df_train.groupby(['winner','player_position_1','player_position_2','rating_num'])[["my_system_id",'op_system_id']].transform(lambda x: x.mode()[0])
df_train[["my_system_id",'op_system_id']] = df_train[["my_system_id",'op_system_id']].fillna(gmean)

In [10]:
df_train['player_position_1'] = df_train['player_position_1'].astype('category')
df_train['player_position_2'] = df_train['player_position_2'].astype('category')
df_train['my_system_id'] = df_train['my_system_id'].astype('category')
df_train['op_system_id'] = df_train['op_system_id'].astype('category')
df_train['scout_id'] = df_train['scout_id'].astype('category')
df_train['competitionId'] = df_train['competitionId'].astype('category')
df_train = pd.get_dummies(df_train, columns = ["winner"],prefix="winner")

In [14]:
categorial_cols = ["player_position_1","player_position_2","my_system_id","op_system_id","scout_id","competitionId"]
df_tofill = df_train[[col for col in df_train.columns if col not in categorial_cols]]

In [15]:
imputer = KNNImputer(n_neighbors=5)
df_train_tofill_1 = imputer.fit_transform(df_tofill)
df_train_final = pd.DataFrame(df_train_tofill_1, columns=df_tofill.columns)

In [16]:
df_train[[col for col in df_train.columns if col not in categorial_cols]] = df_train_final

In [17]:
df_train = df_train.dropna()

In [18]:
df_train_x = df_train.drop(["row_id","rating_num"],axis =1)
df_train_y = df_train[["rating_num"]]

In [19]:
df_train_x['player_position_1'] = df_train['player_position_1'].astype('category')
df_train_x['player_position_2'] = df_train['player_position_2'].astype('category')
df_train_x['my_system_id'] = df_train['my_system_id'].astype('category')
df_train_x['op_system_id'] = df_train['op_system_id'].astype('category')
df_train_x['scout_id'] = df_train['scout_id'].astype('category')
df_train_x['competitionId'] = df_train['competitionId'].astype('category')

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df_train_x, df_train_y, test_size=0.30, random_state=40)

In [23]:
# Instantiate model with 1024 decision trees
rf = RandomForestRegressor(n_estimators = 1024, random_state = 42)
# Train the model on training data
labels = np.array(y_train)
features = np.array(X_train)

rf.fit(features, np.ravel(labels,order="c"));

In [24]:
pred_train_rf= rf.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_rf)))
print(r2_score(y_train, pred_train_rf))

pred_test_rf= rf.predict(X_test)
pred_test_rf2 = pred_test_rf
print(np.sqrt(mean_squared_error(y_test,pred_test_rf)))
print(r2_score(y_test, pred_test_rf))

  f"X has feature names, but {self.__class__.__name__} was fitted without"


0.5782522203209576
0.9021880956652897


  f"X has feature names, but {self.__class__.__name__} was fitted without"


1.558671177912052
0.3016279439774071


In [25]:
pred_test_rf

array([7.10742188, 6.68457031, 7.50585938, ..., 6.7734375 , 5.51757812,
       6.40234375])