# KNN Model Notebook

In [4]:
import pandas as pd

In [5]:
df = pd.read_csv("./data/train.csv")
df.head(5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


I'm going to trim the data and create a sample, so that I can gridsearch my data effectively.

In [34]:
df_sample = df[:2000]

In [35]:
df_sample.shape

(2000, 3)

In [36]:
X = df_sample[['question_text']]
y = df_sample['target']

In [37]:
X.head()

Unnamed: 0,question_text
0,How did Quebec nationalists see their province...
1,"Do you have an adopted dog, how would you enco..."
2,Why does velocity affect time? Does velocity a...
3,How did Otto von Guericke used the Magdeburg h...
4,Can I convert montra helicon D to a mountain b...


In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

In [39]:
# Import CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate our CountVectorizer.
cvec = CountVectorizer(max_features = 50, stop_words = 'english')

In [40]:
# Fit our CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train['question_text']).todense(),
                          columns = cvec.get_feature_names())

In [41]:
# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test['question_text']).todense(),
                         columns = cvec.get_feature_names())

In [42]:
X_train_cvec.head()

Unnamed: 0,ask,best,better,business,buy,change,college,countries,country,did,...,trump,use,used,want,way,women,work,world,year,years
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### The Decision Tree Model:

In [43]:
# Import model.
from sklearn.neighbors import KNeighborsClassifier

In [44]:
# Instantiate model
knn = KNeighborsClassifier()

In [45]:
# Fit model.
knn.fit(X_train_cvec, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [46]:
from sklearn.model_selection import cross_val_score

In [47]:
cross_val_score(knn, X_train_cvec, y_train, cv = 15).mean()

0.9367032036536986

In [48]:
# Import the confusion matrix function.

from sklearn.metrics import confusion_matrix

In [49]:
# Generate our predictions.

predictions = knn.predict(X_test_cvec)

In [50]:
# Generate a confusion matrix.

confusion_matrix(y_test, predictions)

array([[465,   4],
       [ 31,   0]])

In [51]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

In [52]:
print("True Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)

True Negatives: 465
False Positives: 4
False Negatives: 31
True Positives: 0


In [54]:
accuracy = (tp + tn)/(tn+fp+fn+tp)
print("Accuracy: %s" % accuracy)
precision = tp/(tp+fp)
print("Precision: %s" % precision)
recall = tp/(tp+fn)
print("Recall: %s" % recall)
f1_score = (precision*recall)/(precision+recall)
print("F1 Score: %s" % f1_score)

Accuracy: 0.93
Precision: 0.0
Recall: 0.0
F1 Score: nan


  import sys


In [53]:
import sys