# Decision Tree Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# 1.1 Load Data

In [2]:
train = pd.read_csv("train_engineered.csv")
val = pd.read_csv("val_engineered.csv")

# 1.2 PreProcessing

In [3]:
X_train, y_train = train.drop("blueWins", axis=1), train.blueWins
X_val, y_val = val.drop("blueWins", axis=1), val.blueWins

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

pca = PCA(n_components=5, random_state=42)
X_train = pca.fit_transform(X_train)
X_val = pca.transform(X_val)

# 2.1 Hyper Parameter Tuning

In [5]:
clf = DecisionTreeClassifier()
params = {"max_depth":[3, 4, 5]}

gs = GridSearchCV(clf, params)
gs.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None, param_grid={'max_depth': [3, 4, 5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scori

In [6]:
gs.best_params_

{'max_depth': 4}

In [7]:
model = gs.best_estimator_

# 3.1 Evaluation

In [8]:
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
val_pred = model.predict(X_val)

print("training score:", accuracy_score(y_train, train_pred))
print("validation score:", accuracy_score(y_val, val_pred))

training score: 0.7361411087113031
validation score: 0.7220723151645979
