# GridSearch Decision Tree Model Notebook

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("./data/train.csv")
df.head(5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


I'm going to trim the data and create a sample, so that I can gridsearch my data effectively.

In [3]:
df_sample = df[:1000]

In [4]:
df_sample.shape

(1000, 3)

In [5]:
X = df_sample[['question_text']]
y = df_sample['target']

In [6]:
X.head()

Unnamed: 0,question_text
0,How did Quebec nationalists see their province...
1,"Do you have an adopted dog, how would you enco..."
2,Why does velocity affect time? Does velocity a...
3,How did Otto von Guericke used the Magdeburg h...
4,Can I convert montra helicon D to a mountain b...


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

In [8]:
# Import CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate our CountVectorizer.
cvec = CountVectorizer(max_features = 50, stop_words = 'english')

In [9]:
# Fit our CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['question_text']).todense(),
                          columns = cvec.get_feature_names())

In [10]:
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['question_text']).todense(),
                         columns = cvec.get_feature_names())

In [11]:
X_train_cvec.head()

Unnamed: 0,2018,ask,best,better,buy,change,college,company,computer,day,...,thing,think,time,trump,use,want,way,women,work,world
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### The Decision Tree Model:

In [12]:
# Import model.
from sklearn.tree import DecisionTreeClassifier

In [13]:
# Instantiate model.
dt = DecisionTreeClassifier(random_state = 42)

In [14]:
# Fit model.
dt.fit(X_train_cvec, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [15]:
from sklearn.model_selection import cross_val_score

In [16]:
cross_val_score(dt, X_train_cvec, y_train, cv = 5).mean()

0.9413234365971821

Now that we have a score, time to try a GridSearch to find the best possible model.

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
grid = GridSearchCV(estimator=DecisionTreeClassifier(),
                    param_grid={'max_depth': [None, 3, 5, 7, 10],
                                'min_samples_split': [2, 5, 10, 15, 20],
                                'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7]},
                    cv=5,
                    verbose = 1,
                    return_train_score = True)

In [19]:
import time

t0 = time.time()

grid.fit(X_train_cvec, y_train)

print(time.time() - t0)

Fitting 5 folds for each of 175 candidates, totalling 875 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


5.862330913543701


[Parallel(n_jobs=1)]: Done 875 out of 875 | elapsed:    5.8s finished


In [20]:
grid.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [21]:
grid.best_params_

{'max_depth': None, 'min_samples_leaf': 6, 'min_samples_split': 2}

In [22]:
grid.best_score_

0.952

In [23]:
# Import the confusion matrix function.

from sklearn.metrics import confusion_matrix

In [24]:
# Generate our predictions.

predictions = grid.predict(X_test_cvec)

In [25]:
# Generate a confusion matrix.

confusion_matrix(y_test, predictions)

array([[235,   3],
       [ 11,   1]])

In [26]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [27]:
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 235
False Positives: 3
False Negatives: 11
True Positives: 1


In [28]:
accuracy = (tp + tn)/(tn+fp+fn+tp)
print("Accuracy: %s" % accuracy)
precision = tp/(tp+fp)
print("Precision: %s" % precision)
recall = tp/(tp+fn)
print("Recall: %s" % recall)
f1_score = (precision*recall)/(precision+recall)
print("F1 Score: %s" % f1_score)

Accuracy: 0.944
Precision: 0.25
Recall: 0.08333333333333333
F1 Score: 0.0625
