In [1]:
import pandas as pd
import numpy as np
import re
import json
import string
import spacy

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English


In [2]:
punctuations = string.punctuation
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS
parser = English()

In [3]:

def spacy_tokenizer(sent):
    tokens = parser(sent)
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations]
    return tokens

class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}


def clean_text(text):
  
    return text.strip().lower()

In [18]:
spacy_tokenizer('Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.')

['total',
 'bill',
 'horrible',
 'service',
 '8gs',
 'crooks',
 'actually',
 'nerve',
 'charge',
 '69',
 '3',
 'pills',
 'checked',
 'online',
 'pills',
 '19',
 'cents',
 'avoid',
 'hospital',
 'ers',
 'costs']

In [4]:
with open("_data/reviews.json", encoding='utf-8', errors='ignore') as json_data:
     data = json.load(json_data, strict=False)

In [5]:
fix = [v for v in data['reviews']]

In [6]:
reviews=pd.DataFrame.from_dict(fix, orient='columns', dtype=None, columns=None)

In [7]:
#reviews.head()

reviews.info()

#reviews.stars.value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 659554 entries, 0 to 659553
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   review_id    659554 non-null  object 
 1   user_id      659554 non-null  object 
 2   business_id  659554 non-null  object 
 3   stars        659554 non-null  float64
 4   useful       659554 non-null  int64  
 5   funny        659554 non-null  int64  
 6   cool         659554 non-null  int64  
 7   text         659554 non-null  object 
 8   date         659554 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 45.3+ MB


In [8]:
reviews = reviews[reviews.stars != 3]

In [9]:
reviews['sentiment'] = np.where(reviews.stars>3, 1, 0)

In [10]:
reviews.stars.value_counts()

5.0    290558
4.0    144186
1.0     99326
2.0     53249
Name: stars, dtype: int64

In [17]:
reviews.text[0]

'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.'

In [12]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [13]:
from sklearn.model_selection import train_test_split

X = reviews['text'] # the features we want to analyze
ylabels = reviews['sentiment'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)


In [15]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier)])


pipe.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7f84ca9fca10>),
                ('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7f84f6458170>)),
                ('classifier', LogisticRegression())])

In [16]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.9517298917115031
Logistic Regression Precision: 0.9595268590371431
Logistic Regression Recall: 0.9759906813496716


In [19]:
import pickle

In [21]:
pickle.dump(pipe, open("tfid_over_reviews.pkl", 'wb'))

In [22]:
#model = pickle.load(open(model_file_path, 'rb'))
#result_val = model.score(xval, yval)
#result_test = model.score(xtest, ytest)

In [37]:
result = pipe.predict("I dont really like it, they were too bad people")

In [38]:
result

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1])