# NLP with Scikit-Learn

## Setup

### Import packages

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import TransformerMixin

class DenseTransformer(TransformerMixin):
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, y=None, **fit_params):
        return X.todense()
    
from sklearn.preprocessing import Normalizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC

### Some light EDA

In [2]:
# load the data

df = pd.read_csv('yelp_reviews.csv')

In [3]:
# see the first 5 observations from the data

df.head()

Unnamed: 0,class,text
0,positive,Wow... Loved this place.
1,negative,Crust is not good.
2,negative,Not tasty and the texture was just nasty.
3,positive,Stopped by during the late May bank holiday of...
4,positive,The selection on the menu was great and so wer...


In [4]:
# see the unique classes

df['class'].unique()

array(['positive', 'negative'], dtype=object)

In [5]:
# see the class balance

df['class'].value_counts()

negative    500
positive    500
Name: class, dtype: int64

### Prepare the data from scikit-learn

In [6]:
X = df['text'].values
y = df['class'].values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

## Naive Bayes Models

### Categorical Naive Bayes

In [8]:
%%time

naive_bayes = make_pipeline(
    CountVectorizer(stop_words='english', binary=True),
    MultinomialNB()
)

naive_bayes.fit(X_train, y_train)

print(f'Accuracy: {naive_bayes.score(X_test, y_test)} \n')
print(classification_report(y_test, naive_bayes.predict(X_test)))

Accuracy: 0.748 

              precision    recall  f1-score   support

    negative       0.81      0.69      0.74       134
    positive       0.69      0.82      0.75       116

    accuracy                           0.75       250
   macro avg       0.75      0.75      0.75       250
weighted avg       0.76      0.75      0.75       250

CPU times: user 25.5 ms, sys: 1.82 ms, total: 27.4 ms
Wall time: 26 ms


### Gausian Naive Bayes

In [9]:
%%time

gaussian_naive_bayes = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    DenseTransformer(),    
    GaussianNB()
)

gaussian_naive_bayes.fit(X_train, y_train)

print(f'Accuracy: {gaussian_naive_bayes.score(X_test, y_test)} \n')
print(classification_report(y_test, gaussian_naive_bayes.predict(X_test)))

Accuracy: 0.7 

              precision    recall  f1-score   support

    negative       0.78      0.61      0.69       134
    positive       0.64      0.80      0.71       116

    accuracy                           0.70       250
   macro avg       0.71      0.71      0.70       250
weighted avg       0.72      0.70      0.70       250

CPU times: user 202 ms, sys: 20.5 ms, total: 223 ms
Wall time: 98.8 ms


### Categorical Bigram Naive Bayes

In [10]:
%%time

bigram_naive_bayes = make_pipeline(
    CountVectorizer(
        stop_words='english',
        binary=True,
        ngram_range=(1, 2)
    ),
    MultinomialNB()
)

bigram_naive_bayes.fit(X_train, y_train)

print(f'Accuracy: {bigram_naive_bayes.score(X_test, y_test)} \n')
print(classification_report(y_test, bigram_naive_bayes.predict(X_test)))

Accuracy: 0.756 

              precision    recall  f1-score   support

    negative       0.83      0.69      0.75       134
    positive       0.70      0.84      0.76       116

    accuracy                           0.76       250
   macro avg       0.76      0.76      0.76       250
weighted avg       0.77      0.76      0.76       250

CPU times: user 145 ms, sys: 4.99 ms, total: 150 ms
Wall time: 56.8 ms


## Logistic Regression Models

### Word Count Logistic Regression

In [11]:
%%time

logistic_regression = make_pipeline(
    CountVectorizer(stop_words='english'),
    LogisticRegression()
)

logistic_regression.fit(X_train, y_train)

print(f'Accuracy: {logistic_regression.score(X_test, y_test)} \n')
print(classification_report(y_test, logistic_regression.predict(X_test)))

Accuracy: 0.764 

              precision    recall  f1-score   support

    negative       0.77      0.79      0.78       134
    positive       0.75      0.73      0.74       116

    accuracy                           0.76       250
   macro avg       0.76      0.76      0.76       250
weighted avg       0.76      0.76      0.76       250

CPU times: user 83.2 ms, sys: 2.12 ms, total: 85.3 ms
Wall time: 62.1 ms




### TF-IDF Logistic Regression

In [12]:
%%time

tfidf_logistic_regression = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    LogisticRegression()
)

tfidf_logistic_regression.fit(X_train, y_train)

print(f'Accuracy: {tfidf_logistic_regression.score(X_test, y_test)} \n')
print(classification_report(y_test, tfidf_logistic_regression.predict(X_test)))

Accuracy: 0.772 

              precision    recall  f1-score   support

    negative       0.78      0.80      0.79       134
    positive       0.76      0.74      0.75       116

    accuracy                           0.77       250
   macro avg       0.77      0.77      0.77       250
weighted avg       0.77      0.77      0.77       250

CPU times: user 29.7 ms, sys: 1.93 ms, total: 31.6 ms
Wall time: 32.1 ms


### Normalized Word Count Logistic Regression

In [13]:
%%time

normalized_lr = make_pipeline(
    CountVectorizer(stop_words='english'),
    Normalizer(),
    LogisticRegression()
)

normalized_lr.fit(X_train, y_train)

print(f'Accuracy: {normalized_lr.score(X_test, y_test)} \n')
print(classification_report(y_test, normalized_lr.predict(X_test)))

Accuracy: 0.756 

              precision    recall  f1-score   support

    negative       0.76      0.81      0.78       134
    positive       0.76      0.70      0.73       116

    accuracy                           0.76       250
   macro avg       0.76      0.75      0.75       250
weighted avg       0.76      0.76      0.76       250

CPU times: user 28 ms, sys: 1.83 ms, total: 29.9 ms
Wall time: 75.5 ms


### Bigram TF-IDF Logistic Regression

In [14]:
%%time

bigram_tfidf_logistic_regression = make_pipeline(
    CountVectorizer(
        stop_words='english',
        ngram_range=(1,2)
    ),
    TfidfTransformer(),
    LogisticRegression()
)

bigram_tfidf_logistic_regression.fit(X_train, y_train)

print(f'Accuracy: {bigram_tfidf_logistic_regression.score(X_test, y_test)} \n')
print(classification_report(y_test, bigram_tfidf_logistic_regression.predict(X_test)))

Accuracy: 0.76 

              precision    recall  f1-score   support

    negative       0.78      0.78      0.78       134
    positive       0.74      0.74      0.74       116

    accuracy                           0.76       250
   macro avg       0.76      0.76      0.76       250
weighted avg       0.76      0.76      0.76       250

CPU times: user 171 ms, sys: 2.7 ms, total: 173 ms
Wall time: 98.5 ms


### TF-IDF Logistic Regression with Automatic Corpus Specific Stop Words

In [15]:
%%time

mindf_max_tfidf_lr = make_pipeline(
    CountVectorizer(
        stop_words='english',
        min_df=2,
        max_df=.9
    ),
    TfidfTransformer(),
    LogisticRegression()
)

mindf_max_tfidf_lr.fit(X_train, y_train)

print(f'Accuracy: {mindf_max_tfidf_lr.score(X_test, y_test)} \n')
print(classification_report(y_test, mindf_max_tfidf_lr.predict(X_test)))

Accuracy: 0.756 

              precision    recall  f1-score   support

    negative       0.76      0.80      0.78       134
    positive       0.75      0.71      0.73       116

    accuracy                           0.76       250
   macro avg       0.76      0.75      0.75       250
weighted avg       0.76      0.76      0.76       250

CPU times: user 69.2 ms, sys: 1.35 ms, total: 70.5 ms
Wall time: 26.6 ms


## Other Linear Seperation Models

In [16]:
%%time

tfidf_lda = make_pipeline(
    CountVectorizer(
        stop_words='english',
    ),
    TfidfTransformer(),
    DenseTransformer(),
    LinearDiscriminantAnalysis()
)

tfidf_lda.fit(X_train, y_train)

print(f'Accuracy: {tfidf_lda.score(X_test, y_test)} \n')
print(classification_report(y_test, tfidf_lda.predict(X_test)))

Accuracy: 0.7 

              precision    recall  f1-score   support

    negative       0.73      0.71      0.72       134
    positive       0.67      0.69      0.68       116

    accuracy                           0.70       250
   macro avg       0.70      0.70      0.70       250
weighted avg       0.70      0.70      0.70       250

CPU times: user 1.13 s, sys: 58.3 ms, total: 1.19 s
Wall time: 396 ms




In [17]:
%%time

tfidf_svc = make_pipeline(
    CountVectorizer(
        stop_words='english',
    ),
    TfidfTransformer(),
    DenseTransformer(),
    LinearSVC()
)

tfidf_svc.fit(X_train, y_train)

print(f'Accuracy: {tfidf_svc.score(X_test, y_test)} \n')
print(classification_report(y_test, tfidf_svc.predict(X_test)))

Accuracy: 0.768 

              precision    recall  f1-score   support

    negative       0.79      0.77      0.78       134
    positive       0.74      0.77      0.75       116

    accuracy                           0.77       250
   macro avg       0.77      0.77      0.77       250
weighted avg       0.77      0.77      0.77       250

CPU times: user 105 ms, sys: 1.95 ms, total: 107 ms
Wall time: 29.2 ms


## Logistic Lasso, Ridge and ElasticNet

### TF-IDF Logistic Lasso

In [18]:
%%time

tfidf_logistic_regression_lasso = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    LogisticRegressionCV(
        penalty='l1',
        solver='saga' 
    )
)

tfidf_logistic_regression_lasso.fit(X_train, y_train)

print(f'Accuracy: {tfidf_logistic_regression_lasso.score(X_test, y_test)} \n')
print(classification_report(y_test, tfidf_logistic_regression_lasso.predict(X_test)))



Accuracy: 0.764 

              precision    recall  f1-score   support

    negative       0.78      0.78      0.78       134
    positive       0.74      0.75      0.75       116

    accuracy                           0.76       250
   macro avg       0.76      0.76      0.76       250
weighted avg       0.76      0.76      0.76       250

CPU times: user 4.7 s, sys: 6.56 ms, total: 4.7 s
Wall time: 4.28 s


### TF-IDF Logistic Ridge

In [19]:
%%time

tfidf_logistic_regression_ridge = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    LogisticRegressionCV(
        penalty='l2',
        solver='saga' 
    )
)

tfidf_logistic_regression_ridge.fit(X_train, y_train)

print(f'Accuracy: {tfidf_logistic_regression_ridge.score(X_test, y_test)} \n')
print(classification_report(y_test, tfidf_logistic_regression_ridge.predict(X_test)))

Accuracy: 0.78 

              precision    recall  f1-score   support

    negative       0.80      0.79      0.79       134
    positive       0.76      0.77      0.76       116

    accuracy                           0.78       250
   macro avg       0.78      0.78      0.78       250
weighted avg       0.78      0.78      0.78       250

CPU times: user 212 ms, sys: 2.14 ms, total: 214 ms
Wall time: 213 ms




### TF-IDF Logistic Elasticnet

In [20]:
%%time

tfidf_logistic_regression_elasticnet = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    LogisticRegressionCV(
        penalty='elasticnet',
        l1_ratios=[0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1],
        solver='saga' 
    )
)

tfidf_logistic_regression_elasticnet.fit(X_train, y_train)

print(f'Accuracy: {tfidf_logistic_regression_elasticnet.score(X_test, y_test)} \n')
print(classification_report(y_test, tfidf_logistic_regression_elasticnet.predict(X_test)))



Accuracy: 0.78 

              precision    recall  f1-score   support

    negative       0.80      0.79      0.79       134
    positive       0.76      0.77      0.76       116

    accuracy                           0.78       250
   macro avg       0.78      0.78      0.78       250
weighted avg       0.78      0.78      0.78       250

CPU times: user 43.3 s, sys: 34 ms, total: 43.3 s
Wall time: 43.4 s
