# NLP with Scikit-Learn

## Setup

### Import packages

In [26]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import TransformerMixin

class DenseTransformer(TransformerMixin):
    """Pipeline step that converts a sparse matrix into a dense NumPy array.

    The step is stateless: ``fit`` learns nothing and ``transform`` only
    changes the container type of ``X``.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing; return ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()``: for SciPy sparse
        matrices ``todense()`` yields a ``numpy.matrix``, which recent
        scikit-learn releases refuse as estimator input.
        """
        return X.toarray()
    
from sklearn.preprocessing import Normalizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

### Some light EDA

In [2]:
# Read the review data set into a DataFrame.
# NOTE(review): the preview below shows an `Unnamed: 0` column, which looks
# like a saved index -- confirm whether it should be read with index_col=0.
df = pd.read_csv(filepath_or_buffer='yelp_reviews.csv')

In [3]:
# Preview the first five rows to check the columns and their contents.
df.head(n=5)

Unnamed: 0,class,text
0,positive,Wow... Loved this place.
1,negative,Crust is not good.
2,negative,Not tasty and the texture was just nasty.
3,positive,Stopped by during the late May bank holiday of...
4,positive,The selection on the menu was great and so wer...


In [4]:
# List the distinct labels found in the `class` column.
df.loc[:, 'class'].unique()

array(['positive', 'negative'], dtype=object)

In [5]:
# Count the rows per label to check how balanced the classes are.
df.loc[:, 'class'].value_counts()

positive    500
negative    500
Name: class, dtype: int64

### Prepare the data for scikit-learn

In [6]:
# Pull the review text (features) and the label column (target) out of the
# DataFrame as plain arrays.
X, y = (df[column].values for column in ('text', 'class'))

In [7]:
# Hold out a test set; the fixed random_state keeps the split reproducible.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(
    X,
    y,
    random_state=7,
)

## Naive Bayes Models

### Multinomial Naive Bayes (binary word presence)

In [12]:
%%time

naive_bayes = make_pipeline(
    CountVectorizer(stop_words='english', binary=True),
    MultinomialNB()
)

naive_bayes.fit(X_train, y_train)

print(f'Accuracy: {naive_bayes.score(X_test, y_test)} \n')
print(classification_report(y_test, naive_bayes.predict(X_test)))

Accuracy: 0.748 

              precision    recall  f1-score   support

    negative       0.81      0.69      0.74       134
    positive       0.69      0.82      0.75       116

   micro avg       0.75      0.75      0.75       250
   macro avg       0.75      0.75      0.75       250
weighted avg       0.76      0.75      0.75       250

CPU times: user 25.1 ms, sys: 3.55 ms, total: 28.7 ms
Wall time: 33 ms


### Gaussian Naive Bayes

In [9]:
%%time

gaussian_naive_bayes = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    DenseTransformer(),    
    GaussianNB()
)

gaussian_naive_bayes.fit(X_train, y_train)

print(f'Accuracy: {gaussian_naive_bayes.score(X_test, y_test)} \n')
print(classification_report(y_test, gaussian_naive_bayes.predict(X_test)))

Naive Bayes
Accuracy: 0.7 

              precision    recall  f1-score   support

    negative       0.78      0.61      0.69       134
    positive       0.64      0.80      0.71       116

   micro avg       0.70      0.70      0.70       250
   macro avg       0.71      0.71      0.70       250
weighted avg       0.72      0.70      0.70       250

CPU times: user 248 ms, sys: 16 ms, total: 264 ms
Wall time: 111 ms


### Bigram Multinomial Naive Bayes (binary presence)

In [13]:
%%time

bigram_naive_bayes = make_pipeline(
    CountVectorizer(
        stop_words='english',
        binary=True,
        ngram_range=(1, 2)
    ),
    MultinomialNB()
)

naive_bayes.fit(X_train, y_train)

print(f'Accuracy: {bigram_naive_bayes.score(X_test, y_test)} \n')
print(classification_report(y_test, bigram_naive_bayes.predict(X_test)))

Accuracy: 0.756 

              precision    recall  f1-score   support

    negative       0.83      0.69      0.75       134
    positive       0.70      0.84      0.76       116

   micro avg       0.76      0.76      0.76       250
   macro avg       0.76      0.76      0.76       250
weighted avg       0.77      0.76      0.76       250

CPU times: user 67 ms, sys: 27.8 ms, total: 94.8 ms
Wall time: 94 ms


## Logistic Regression Models

### Word Count Logistic Regression

In [17]:
%%time

logistic_regression = make_pipeline(
    CountVectorizer(stop_words='english'),
    LogisticRegression()
)

logistic_regression.fit(X_train, y_train)

print(f'Accuracy: {logistic_regression.score(X_test, y_test)} \n')
print(classification_report(y_test, logistic_regression.predict(X_test)))

Accuracy: 0.764 

              precision    recall  f1-score   support

    negative       0.77      0.79      0.78       134
    positive       0.75      0.73      0.74       116

   micro avg       0.76      0.76      0.76       250
   macro avg       0.76      0.76      0.76       250
weighted avg       0.76      0.76      0.76       250

CPU times: user 25.8 ms, sys: 1.94 ms, total: 27.7 ms
Wall time: 26.5 ms


### TF-IDF Logistic Regression

In [20]:
%%time

tfidf_logistic_regression = make_pipeline(
    CountVectorizer(stop_words='english'),
    TfidfTransformer(),
    LogisticRegression()
)

tfidf_logistic_regression.fit(X_train, y_train)

print(f'Accuracy: {tfidf_logistic_regression.score(X_test, y_test)} \n')
print(classification_report(y_test, tfidf_logistic_regression.predict(X_test)))

Accuracy: 0.772 

              precision    recall  f1-score   support

    negative       0.78      0.80      0.79       134
    positive       0.76      0.74      0.75       116

   micro avg       0.77      0.77      0.77       250
   macro avg       0.77      0.77      0.77       250
weighted avg       0.77      0.77      0.77       250

CPU times: user 22.9 ms, sys: 1.24 ms, total: 24.1 ms
Wall time: 23.3 ms


### Normalized Word Count Logistic Regression

In [40]:
%%time

normalized_lr = make_pipeline(
    CountVectorizer(stop_words='english'),
    Normalizer(),
    LogisticRegression()
)

normalized_lr.fit(X_train, y_train)

print(f'Accuracy: {normalized_lr.score(X_test, y_test)} \n')
print(classification_report(y_test, normalized_lr.predict(X_test)))

Accuracy: 0.756 

              precision    recall  f1-score   support

    negative       0.76      0.81      0.78       134
    positive       0.76      0.70      0.73       116

   micro avg       0.76      0.76      0.76       250
   macro avg       0.76      0.75      0.75       250
weighted avg       0.76      0.76      0.76       250

CPU times: user 26.1 ms, sys: 1.77 ms, total: 27.9 ms
Wall time: 26.9 ms


### Bigram TF-IDF Logistic Regression

In [44]:
%%time

bigram_tfidf_logistic_regression = make_pipeline(
    CountVectorizer(
        stop_words='english',
        ngram_range=(1,2)
    ),
    TfidfTransformer(),
    LogisticRegression()
)

bigram_tfidf_logistic_regression.fit(X_train, y_train)

print(f'Accuracy: {bigram_tfidf_logistic_regression.score(X_test, y_test)} \n')
print(classification_report(y_test, bigram_tfidf_logistic_regression.predict(X_test)))

Accuracy: 0.76 

              precision    recall  f1-score   support

    negative       0.78      0.78      0.78       134
    positive       0.74      0.74      0.74       116

   micro avg       0.76      0.76      0.76       250
   macro avg       0.76      0.76      0.76       250
weighted avg       0.76      0.76      0.76       250

CPU times: user 56.6 ms, sys: 1.88 ms, total: 58.5 ms
Wall time: 35.1 ms
