In [50]:
import pandas as pd
import numpy as np

from matches_clean import clean_player_data, clean_matches_data
from decision_tree import tree_model

from sklearn.model_selection import cross_validate, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Retrieve and merge data

In [3]:
matches = clean_matches_data()

In [4]:
players = clean_player_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["obs_placed"] = df_clean["obs_placed"].replace(np.nan, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["roshans_killed"] = df_clean["roshans_killed"].replace(np.nan, 0)


In [7]:
df = pd.merge(matches, players, on = "match_id")

In [27]:
X = df.drop(columns = ["match_id", "radiant_win", "hero_id", "account_id", "isRadiant", "win"])
y = df["hero_id"]

In [30]:
X_train, X_test,y_train, y_test = train_test_split(X,y)

## Scale the data

In [83]:
scaler = StandardScaler()

In [84]:
scaler.fit(X_train, y_train)

In [85]:
X_train_scaled = scaler.transform(X_train)

# Baseline model

In [34]:
tree = tree_model(X_train_scaled, y_train)

In [36]:
X_test_scaled = scaler.transform(X_test)

In [37]:
tree.score(X_test_scaled,y_test)

0.04411764705882353

# Fine-tuning

In [86]:
tree.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [87]:
grid = {"max_depth": [5,20,100,300,None],
          "min_samples_split": [1,2,5,20],
          "min_samples_leaf": [1,5,15,100],
          "max_features":[None, "auto", "sqrt", "log2"]}

In [88]:
search = GridSearchCV(DecisionTreeClassifier(), grid, scoring = "accuracy", n_jobs = -1)

In [89]:
search.fit(X_train_scaled, y_train)









400 fits failed out of a total of 1600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
400 fits failed with the following error:
Traceback (most recent call last):
  File "/root/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/root/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 969, in fit
    super().fit(
  File "/root/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 265, in fit
    check_scalar(
  File "/root/.pyenv/versions/3.8.12/envs/lewagon/lib/python3.8/site-packages/sklearn/utils/validation.py"

In [90]:
search.best_params_

{'max_depth': 300,
 'max_features': None,
 'min_samples_leaf': 15,
 'min_samples_split': 20}

In [91]:
search.best_score_

0.07072346179851251

## Fine-grained grid search

In [97]:
grid = {"max_depth": [200,300,400],
          "min_samples_split": [10,20,50,100],
          "min_samples_leaf": [10,15,20,25]
       }

In [98]:
search = GridSearchCV(DecisionTreeClassifier(max_features = None), grid, scoring = "accuracy", n_jobs = -1)

search.fit(X_train_scaled, y_train)



In [99]:
search.best_score_

0.07758620689655174

In [100]:
search.best_params_

{'max_depth': 200, 'min_samples_leaf': 10, 'min_samples_split': 100}

Even more fine-grained:

In [101]:
grid = {"max_depth": [150,200,250],
        "min_samples_split": [80,100,150,300],
        "min_samples_leaf": [8,9,10,11,12]
       }

In [102]:
search = GridSearchCV(DecisionTreeClassifier(max_features = None), grid, scoring = "accuracy", n_jobs = -1)
search.fit(X_train_scaled, y_train)



In [103]:
search.best_score_

0.07758620689655174

In [104]:
search.best_params_

{'max_depth': 150, 'min_samples_leaf': 8, 'min_samples_split': 100}

Another round:

In [105]:
grid = {"max_depth": [120,150,170],
        "min_samples_split": [90,100,110,120],
        "min_samples_leaf": [6,7,8]
       }

In [106]:
search = GridSearchCV(DecisionTreeClassifier(max_features = None, min_samples_leaf=5), grid, scoring = "accuracy", n_jobs = -1)
search.fit(X_train_scaled, y_train)



In [107]:
search.best_score_

0.07954699121027722

In [108]:
search.best_params_

{'max_depth': 120, 'min_samples_leaf': 7, 'min_samples_split': 90}

## Best parameters

In [109]:
max_depth_best = 120
min_samples_split_best = 90
min_samples_leaf_best = 7
max_features_best = None 

# New Model

In [110]:
tree_best = tree_model(X_train_scaled, y_train,
                       min_samples_split=min_samples_split_best,
                       max_depth = max_depth_best,
                       max_features=max_features_best,
                       min_samples_leaf = min_samples_leaf_best)
                       

In [111]:
tree_best.score(X_test_scaled, y_test)

0.06470588235294118