# 3.31 Modeling: Logistic Regression

In [2]:
# load the data: restore the train/test splits from IPython's %store database
# (`-r` reads a previously stored variable back into this kernel)
# NOTE(review): presumably saved by an earlier notebook in this series — confirm
%store -r X_train
%store -r X_test
%store -r y_train
%store -r y_test

In [3]:
# import LogisticRegression-related packages
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

### Create lemmatization and tokenization rules

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 

In [5]:
# Custom tokenizer to plug into a vectorizer:
#   - a regular expression splits each document into word tokens
#   - WordNet lemmatizes every token
# Stop-word removal is not done here; it is configured on the vectorizer
# through its `stop_words` parameter.
# Usage: cvec = CountVectorizer(tokenizer=LemmaTokenizer())
class LemmaTokenizer(object):
    """Callable that tokenizes a document and lemmatizes each token.

    An instance is meant to be passed as the ``tokenizer`` argument of
    ``CountVectorizer`` / ``TfidfVectorizer``; the vectorizer then calls it
    once per document.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        # Built once here instead of on every call: the tokenizer is
        # stateless, so rebuilding it for each document is wasted work.
        # The pattern keeps runs of two or more word characters.
        self.tokenizer = RegexpTokenizer(r'(?u)\b\w\w+\b')

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc``.

        Parameters
        ----------
        doc : str
            Raw text of a single document.

        Returns
        -------
        list
            One lemmatized token per regex match, in document order.
        """
        return [self.wnl.lemmatize(t) for t in self.tokenizer.tokenize(doc)]

### CountVectorizer

In [6]:
# Instantiate Pipeline: lemmatized bag-of-words counts feeding an
# L1-regularised (lasso) logistic regression
pipeline_cv = Pipeline([
    ('cvec', CountVectorizer(tokenizer=LemmaTokenizer())),
    # `penalty` is passed by keyword because newer scikit-learn releases
    # reject positional estimator arguments. `solver` is explicit because
    # the newer default solver (lbfgs) does not support the L1 penalty;
    # liblinear does.
    ('lr', LogisticRegression(penalty='l1', solver='liblinear')),
])
# Hyper-parameter grid for the CountVectorizer step
# (3 * 2 * 2 * 2 * 2 = 48 candidate combinations)
pipe_params = {
    'cvec__max_features': [300, 500, 1000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__stop_words': [None, 'english']
}

In [7]:
# Grid search over every combination in `pipe_params`, scored with 5-fold
# cross-validation and run on 4 parallel workers.
gs_cv = GridSearchCV(
    estimator=pipeline_cv,
    param_grid=pipe_params,
    n_jobs=4,
    cv=5,
    verbose=1,
)

# The result of fit() is kept under its own name for the cells below.
lr_cv = gs_cv.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    9.8s
[Parallel(n_jobs=4)]: Done 240 out of 240 | elapsed:   11.7s finished
  'stop_words.' % sorted(inconsistent))


In [8]:
# Best parameters on the training data: the grid combination selected by the
# cross-validated search, which was fit on X_train / y_train only
lr_cv.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 1000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english'}

In [9]:
# Keep the pipeline selected by the grid search under its own name.
best_lr_cv = lr_cv.best_estimator_

# Report three scores, one per line: training split, best cross-validation
# score from the search, and held-out test split.
print(
    f"training score is {best_lr_cv.score(X_train, y_train)}",
    f"cross validation score is {lr_cv.best_score_}",
    f"test score is {best_lr_cv.score(X_test, y_test)}",
    sep="\n",
)

training score is 0.9606512890094979
cross validation score is 0.9185888738127544
test score is 0.9247967479674797


### Tf-Idf Vectorizer

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Instantiate Pipeline: lemmatized TF-IDF features feeding an
# L1-regularised (lasso) logistic regression
pipeline_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=LemmaTokenizer())),
    # `penalty` is passed by keyword because newer scikit-learn releases
    # reject positional estimator arguments. `solver` is explicit because
    # the newer default solver (lbfgs) does not support the L1 penalty;
    # liblinear does.
    ('lr', LogisticRegression(penalty='l1', solver='liblinear')),
])
# Hyper-parameter grid for the TfidfVectorizer step
# (3 * 2 * 2 * 2 * 2 = 48 candidate combinations).
# Note: this rebinds `pipe_params`, replacing the CountVectorizer grid.
pipe_params = {
    'tfidf__max_features': [300, 500, 1000],
    'tfidf__min_df': [2, 3],
    'tfidf__max_df': [.9, .95],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__stop_words': [None, 'english']
}

In [12]:
# Grid search over every combination in `pipe_params` (at this point the
# TF-IDF grid), scored with 5-fold cross-validation on 4 parallel workers.
gs_tfidf = GridSearchCV(
    estimator=pipeline_tfidf,
    param_grid=pipe_params,
    n_jobs=4,
    cv=5,
    verbose=1,
)

# The result of fit() is kept under its own name for the cells below.
lr_tfidf = gs_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done 240 out of 240 | elapsed:    9.8s finished
  'stop_words.' % sorted(inconsistent))


In [13]:
# Best parameters on the training data: the grid combination selected by the
# cross-validated search, which was fit on X_train / y_train only
lr_tfidf.best_params_

{'tfidf__max_df': 0.9,
 'tfidf__max_features': 300,
 'tfidf__min_df': 3,
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': 'english'}

In [14]:
# Keep the pipeline selected by the grid search under its own name.
best_lr_tfidf = lr_tfidf.best_estimator_

# Report three scores, one per line: training split, best cross-validation
# score from the search, and held-out test split.
print(
    f"training score is {best_lr_tfidf.score(X_train, y_train)}",
    f"cross validation score is {lr_tfidf.best_score_}",
    f"test score is {best_lr_tfidf.score(X_test, y_test)}",
    sep="\n",
)

training score is 0.9443690637720489
cross validation score is 0.9145183175033921
test score is 0.9024390243902439


### Check coefficients to see which features carry the most weight

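Since the model built on CountVectorizer (cv) features performs better on the test set (0.925 vs. 0.902 for Tf-Idf), I'll use CountVectorizer to train the model whose coefficients are inspected below.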

In [30]:
# Rebuild a stand-alone vectorizer with the settings the CountVectorizer
# grid search reported as best, so its vocabulary can be inspected directly.
cvec = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    max_features=1000,
)

In [31]:
# use CountVectorizer to vectorize the training data:
# fit the vectorizer on X_train and keep the transformed training matrix
X_train_cvec= cvec.fit_transform(X_train)

  'stop_words.' % sorted(inconsistent))


In [32]:
# instantiate a LogisticRegression model with the lasso (L1) penalty.
# `penalty` is passed by keyword because newer scikit-learn releases reject
# positional estimator arguments, and `solver` is explicit because the newer
# default solver (lbfgs) does not support the L1 penalty; liblinear does.
lr = LogisticRegression(penalty='l1', solver='liblinear')
lr.fit(X_train_cvec, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
# convert X_train_cvec to a dense dataframe with one column per vocabulary term
import pandas as pd

# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; prefer the new method when it exists and fall
# back to the old one so the cell runs on either version.
cvec_words = pd.DataFrame(
    X_train_cvec.toarray(),
    columns=(
        cvec.get_feature_names_out()
        if hasattr(cvec, 'get_feature_names_out')
        else cvec.get_feature_names()
    ),
)

In [34]:
# add word counts to the dataframe: sum every column of cvec_words to get the
# total count per term. The summed Series is already indexed by the column
# (term) names, so it is turned into a one-column frame directly instead of
# re-deriving the index from the vectorizer.
cv_table_lr = cvec_words.sum().to_frame('word_count')
    

In [35]:
# add coefficient to the table
# NOTE(review): `np` is not used in any cell visible here — confirm it is needed
import numpy as np

# lr.coef_ is transposed so the coefficients run down the rows of the table.
# NOTE(review): assumes a binary target, i.e. coef_ has a single row, and that
# the table rows are in the same order as the model's features — confirm
cv_table_lr['coef'] = lr.coef_.T

In [36]:
# Move the terms out of the index into an ordinary column and label it 'word'.
cv_table_lr = cv_table_lr.reset_index().rename(columns={'index': 'word'})

In [38]:
# ten terms with the largest (most positive) coefficients
# NOTE(review): presumably these push predictions toward the class encoded as 1 — confirm the label encoding
cv_table_lr.sort_values(by='coef', ascending=False).head(10)

Unnamed: 0,word,word_count,coef
834,spacex,361,4.094957
863,starlink,61,3.592743
263,dragon,87,3.503942
866,starship,85,3.417269
880,stp,21,3.397308
318,falcon,155,3.361128
712,raptor,30,2.915407
278,elon,71,2.507829
677,press,16,2.235897
874,station,19,2.188803


In [39]:
# ten terms with the smallest (most negative) coefficients
# NOTE(review): presumably these push predictions toward the class encoded as 0 — confirm the label encoding
cv_table_lr.sort_values(by='coef', ascending=True).head(10)

Unnamed: 0,word,word_count,coef
108,blue,333,-3.918094
111,blueorigin,38,-3.032898
361,glenn,79,-2.462943
97,bezos,102,-2.251055
784,shepard,100,-1.894246
113,bo,9,-1.613705
56,amazon,17,-1.477089
437,jeff,89,-1.403821
695,provider,3,-1.367819
580,new,295,-1.351854


In [40]:
# store cv_table_lr (word, word_count, coef) with IPython's %store magic so
# the data-visualization step can load it back with `%store -r cv_table_lr`
%store cv_table_lr

Stored 'cv_table_lr' (DataFrame)
