# 3.34 Modeling: Random Forest

In [1]:
# load the data
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test

In [2]:
# import LogisticRegression related pacakges
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

### Create Lemmatize and Tokenize rules

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 

In [4]:
# use regularexpression to do lemmatize
# use countvectorizer to tokenize, lemmatize, and exclude stopwords
# cvec = CountVectorizer(tokenizer=LemmaTokenizer()) 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        tokenizer = RegexpTokenizer('(?u)\\b\\w\\w+\\b')
        return [self.wnl.lemmatize(t) for t in tokenizer.tokenize(doc)]

In [5]:
X_train.shape

(1474,)

### CountVectorizer

In [9]:
# Instantiate Pipeline
pipeline_cv = Pipeline([('cvec', CountVectorizer(tokenizer=LemmaTokenizer())),
                        ('rf', RandomForestClassifier()) 
                       ])
# Pipeline parameter CountVectorizer
pipe_params = {
    'cvec__max_features': [300, 500, 1000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [None, 'english']
}

In [10]:
# GridSearch
gs_cv = GridSearchCV(pipeline_cv, 
                     param_grid=pipe_params, 
                     cv=5,
                     verbose=1,
                     n_jobs=4)

rf_cv = gs_cv.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   10.0s
[Parallel(n_jobs=4)]: Done 240 out of 240 | elapsed:   11.9s finished
  'stop_words.' % sorted(inconsistent))


In [11]:
# Best parameters on the training data:
rf_cv.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 500,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english'}

In [13]:
# assign the best estimator to a variable:
best_rf_cv = rf_cv.best_estimator_
# check training score, cross_validation_score and testing score
print(f"training score is {best_rf_cv.score(X_train, y_train)}")
print(f"cross validation score is {rf_cv.best_score_}")
print(f"test score is {best_rf_cv.score(X_test, y_test)}")

training score is 0.9877883310719131
cross validation score is 0.9274084124830394
test score is 0.9146341463414634


### Tf-Idf Vectorizer

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Instantiate Pipeline
pipeline_tfidf = Pipeline([('tfidf', TfidfVectorizer(tokenizer=LemmaTokenizer())),
                        ('rf', RandomForestClassifier())
                       ])
# Pipeline parameter CountVectorizer
pipe_params = {
    'tfidf__max_features': [300, 500, 1000],
    'tfidf__min_df': [2, 3],
    'tfidf__max_df': [.9, .95],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__stop_words': [None, 'english']
}

In [17]:
# GridSearch
gs_tfidf = GridSearchCV(pipeline_tfidf, 
                     param_grid=pipe_params, 
                     cv=5,
                     verbose=1,
                     n_jobs=4)

rf_tfidf = gs_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    7.7s
[Parallel(n_jobs=4)]: Done 240 out of 240 | elapsed:    9.6s finished
  'stop_words.' % sorted(inconsistent))


In [22]:
# Best parameters on the training data:
rf_tfidf.best_params_

{'knn__n_neighbors': 3,
 'tfidf__max_df': 0.9,
 'tfidf__max_features': 300,
 'tfidf__min_df': 2,
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': 'english'}

In [18]:
# assign the best estimator to a variable:
best_rf_tfidf = rf_tfidf.best_estimator_
# check training score, cross_validation_score and testing score
print(f"training score is {best_rf_tfidf.score(X_train, y_train)}")
print(f"cross validation score is {rf_tfidf.best_score_}")
print(f"test score is {best_rf_tfidf.score(X_test, y_test)}")

training score is 0.9938941655359566
cross validation score is 0.9274084124830394
test score is 0.9024390243902439


Random Forest model performs really good in training set but the performance of testing set is not equivalent.