# 3.33 Modeling: KNN

In [4]:
# load the data
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test

In [5]:
# import KNN-related packages
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

### Create lemmatization and tokenization rules

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 

In [7]:
# Custom tokenizer for the text vectorizers: a regular expression splits the
# document into word tokens, then each token is lemmatized with WordNet.
# Usage: cvec = CountVectorizer(tokenizer=LemmaTokenizer())
class LemmaTokenizer(object):
    """Callable that turns a document string into a list of lemmatized tokens.

    Instances are meant to be passed as the ``tokenizer`` argument of a
    vectorizer, which calls them once per document.
    """

    # Runs of two or more word characters delimited by word boundaries.
    TOKEN_PATTERN = r'(?u)\b\w\w+\b'

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        # Build the regex tokenizer once here rather than once per document.
        self.tokenizer = RegexpTokenizer(self.TOKEN_PATTERN)

    def __call__(self, doc):
        """Tokenize ``doc`` and return the lemma of every token, in order.

        Parameters
        ----------
        doc : str
            Raw text of a single document.

        Returns
        -------
        list
            One lemmatized token per regex match; empty if nothing matches.
        """
        return [self.wnl.lemmatize(t) for t in self.tokenizer.tokenize(doc)]

In [8]:
# Sanity check on the restored training data: a single-axis shape is expected,
# since X_train is fed directly to the text vectorizers below
X_train.shape

(1474,)

### CountVectorizer

In [13]:
# Two-step pipeline: lemmatized bag-of-words counts -> k-nearest-neighbours classifier
pipeline_cv = Pipeline(steps=[
    ('cvec', CountVectorizer(tokenizer=LemmaTokenizer())),
    ('knn', KNeighborsClassifier()),
])

# Grid-search space; each key is '<pipeline step>__<parameter of that step>'
pipe_params = dict(
    cvec__max_features=[300, 500, 1000],
    cvec__min_df=[2, 3],
    cvec__max_df=[.9, .95],
    cvec__ngram_range=[(1, 1), (1, 2)],
    cvec__stop_words=[None, 'english'],
    knn__n_neighbors=[3, 5],
)

In [14]:
# Exhaustive grid search over the CountVectorizer + KNN pipeline:
# 5-fold cross-validation per candidate, run on 4 parallel workers
gs_cv = GridSearchCV(
    estimator=pipeline_cv,
    param_grid=pipe_params,
    n_jobs=4,
    verbose=1,
    cv=5,
)

# fit() hands back the fitted search object, so knn_cv refers to the same object as gs_cv
knn_cv = gs_cv.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:   14.3s
[Parallel(n_jobs=4)]: Done 480 out of 480 | elapsed:   18.5s finished
  'stop_words.' % sorted(inconsistent))


In [18]:
# Parameter combination with the best mean cross-validated score
# (selected by the grid search that was fit on the training data)
knn_cv.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 300,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'knn__n_neighbors': 5}

In [16]:
# Keep a handle on the winning pipeline; with GridSearchCV's default refit
# behaviour this is the estimator refit on all of X_train with the best parameters
best_knn_cv = knn_cv.best_estimator_
# Report three scores side by side to gauge overfitting:
#   training score         - evaluated on the same data the model was fit on
#   cross validation score - mean held-out-fold score of the best parameter set
#   test score             - evaluated on the test split, unused during the search
print(f"training score is {best_knn_cv.score(X_train, y_train)}")
print(f"cross validation score is {knn_cv.best_score_}")
print(f"test score is {best_knn_cv.score(X_test, y_test)}")

training score is 0.9308005427408412
cross validation score is 0.8887381275440976
test score is 0.8739837398373984


### Tf-Idf Vectorizer

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# Two-step pipeline: lemmatized TF-IDF features -> k-nearest-neighbours classifier
pipeline_tfidf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(tokenizer=LemmaTokenizer())),
    ('knn', KNeighborsClassifier()),
])

# Grid-search space for the TfidfVectorizer pipeline;
# each key is '<pipeline step>__<parameter of that step>'
pipe_params = dict(
    tfidf__max_features=[300, 500, 1000],
    tfidf__min_df=[2, 3],
    tfidf__max_df=[.9, .95],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__stop_words=[None, 'english'],
    knn__n_neighbors=[3, 5],
)

In [21]:
# Exhaustive grid search over the TfidfVectorizer + KNN pipeline:
# 5-fold cross-validation per candidate, run on 4 parallel workers
gs_tfidf = GridSearchCV(
    estimator=pipeline_tfidf,
    param_grid=pipe_params,
    n_jobs=4,
    verbose=1,
    cv=5,
)

# fit() hands back the fitted search object, so knn_tfidf refers to the same object as gs_tfidf
knn_tfidf = gs_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    9.9s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   20.1s
[Parallel(n_jobs=4)]: Done 480 out of 480 | elapsed:   21.7s finished
  'stop_words.' % sorted(inconsistent))


In [22]:
# Parameter combination with the best mean cross-validated score
# (selected by the grid search that was fit on the training data)
knn_tfidf.best_params_

{'knn__n_neighbors': 3,
 'tfidf__max_df': 0.9,
 'tfidf__max_features': 300,
 'tfidf__min_df': 2,
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': 'english'}

In [23]:
# Keep a handle on the winning pipeline; with GridSearchCV's default refit
# behaviour this is the estimator refit on all of X_train with the best parameters
best_knn_tfidf = knn_tfidf.best_estimator_
# Report three scores side by side to gauge overfitting:
#   training score         - evaluated on the same data the model was fit on
#   cross validation score - mean held-out-fold score of the best parameter set
#   test score             - evaluated on the test split, unused during the search
print(f"training score is {best_knn_tfidf.score(X_train, y_train)}")
print(f"cross validation score is {knn_tfidf.best_score_}")
print(f"test score is {best_knn_tfidf.score(X_test, y_test)}")

training score is 0.9023066485753053
cross validation score is 0.8385345997286295
test score is 0.8089430894308943


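The KNN model does not perform better than the other models: its best test score is about 0.874 with CountVectorizer and about 0.809 with the Tf-Idf vectorizer, and in both cases the training score is noticeably higher than the test score, which suggests some overfitting.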