In [3]:
import pandas as pd
import json
import string
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [7]:
# Load the line-delimited JSON dump: every line of the file is parsed as one
# JSON object. JSON text is UTF-8, so request it explicitly instead of relying
# on the platform's default encoding.
with open('review.json', 'r', encoding='utf-8') as f:
    data = [json.loads(row) for row in f]

reviews = pd.DataFrame(data)
# Drop the metadata columns this notebook does not use. `columns=` replaces the
# positional axis argument (`drop(labels, 1)`), which pandas 2.0 no longer accepts.
reviews = reviews.drop(columns=["cool", "date", "funny", "review_id", "useful", "user_id"])
reviews.head(5)

Unnamed: 0,business_id,stars,text
0,ujmEBvifdJM6h6RLv4wQIg,1.0,Total bill for this horrible service? Over $8G...
1,NZnhc2sEQy3RmzKTZnqtwQ,5.0,I *adore* Travis at the Hard Rock's new Kelly ...
2,WTqjgwHlXbSFevF32_DJVw,5.0,I have to say that this office really has it t...
3,ikCg8xy5JIg_NGPx-MSIDA,5.0,Went in for a lunch. Steak sandwich was delici...
4,b1b1eb3uo-w561D0ZfCEiQ,1.0,Today was my second out of three sessions I ha...


In [8]:
# (rows, columns) of the review frame after the unused columns were dropped.
reviews.shape

(6685900, 3)

In [9]:
# Work on the first 100,000 reviews only. Take an explicit copy so that the
# column assignments in later cells write to an independent frame rather than
# to a slice of the full one (avoids pandas' SettingWithCopyWarning).
reviews = reviews.iloc[:100000].copy()
reviews.shape

(100000, 3)

In [10]:
_STOPWORDS_ENG = None  # lazily-built cache of the English stopword list


def _english_stopwords():
    """Return the English stopwords as a frozenset, loading them only once."""
    global _STOPWORDS_ENG
    if _STOPWORDS_ENG is None:
        _STOPWORDS_ENG = frozenset(stopwords.words("english"))
    return _STOPWORDS_ENG


def clean_review(review):
    """Normalise one review into a space-separated string of useful words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, and English stopwords are
    removed.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The remaining words joined by single spaces (empty string if none).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', review)
    words = letters_only.lower().split()
    # The stopword list is loop-invariant: fetch the cached set instead of
    # rebuilding it on every call (this function is applied once per review).
    stopwords_eng = _english_stopwords()
    useful_words = [word for word in words if word not in stopwords_eng]

    # Combine words into a paragraph again
    return ' '.join(useful_words)

# Shared Porter stemmer instance; `tokenize` below passes it to `stem_words`.
stemmer = PorterStemmer()

def stem_words(words_list, stemmer):
    """Return a new list holding ``stemmer.stem(word)`` for each word, in order."""
    stemmed = []
    for token in words_list:
        stemmed.append(stemmer.stem(token))
    return stemmed

def tokenize(text):
    """Split ``text`` into tokens with ``word_tokenize`` and return their stems.

    Stemming is delegated to ``stem_words`` using the module-level ``stemmer``.
    """
    return stem_words(word_tokenize(text), stemmer)

In [11]:
# CLEANING THE REVIEWS - REMOVAL OF STOPWORDS AND PUNCTUATION
def text_process(text):
    """Strip punctuation from ``text`` and return its non-stopword words.

    Characters listed in ``string.punctuation`` are removed, the remainder is
    split on whitespace, and words whose lower-cased form is an English
    stopword are dropped. The surviving words keep their original casing.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    # Build the lookup sets once per call; the stopword list was previously
    # re-fetched for every single word inside the comprehension.
    stopwords_eng = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    nopunc = ''.join(char for char in text if char not in punctuation)
    return [word for word in nopunc.split() if word.lower() not in stopwords_eng]

In [12]:
def _star_to_label(stars):
    """Bucket a star rating: 1 above 3.3, -1 below 1.67, 0 otherwise."""
    if stars > 3.3:
        return 1
    if stars < 1.67:
        return -1
    return 0


# Cleaned review text and the three-way label derived from the star rating.
reviews['new_review'] = reviews['text'].apply(clean_review)
reviews['new_star'] = reviews['stars'].apply(_star_to_label)

In [13]:
# Inspect the first rows, now including the `new_review` and `new_star` columns.
reviews.head(20)

Unnamed: 0,business_id,stars,text,new_review,new_star
0,ujmEBvifdJM6h6RLv4wQIg,1.0,Total bill for this horrible service? Over $8G...,total bill horrible service gs crooks actually...,-1
1,NZnhc2sEQy3RmzKTZnqtwQ,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,adore travis hard rock new kelly cardenas salo...,1
2,WTqjgwHlXbSFevF32_DJVw,5.0,I have to say that this office really has it t...,say office really together organized friendly ...,1
3,ikCg8xy5JIg_NGPx-MSIDA,5.0,Went in for a lunch. Steak sandwich was delici...,went lunch steak sandwich delicious caesar sal...,1
4,b1b1eb3uo-w561D0ZfCEiQ,1.0,Today was my second out of three sessions I ha...,today second three sessions paid although firs...,-1
5,eU_713ec6fTGNO4BegRaww,4.0,I'll be the first to admit that I was not exci...,first admit excited going la tavolta food snob...,1
6,3fw2X5bZYeW9xCz_zGhOHg,3.0,Tracy dessert had a big name in Hong Kong and ...,tracy dessert big name hong kong one first mar...,0
7,zvO-PJCpNk4fgAVUnExYAA,1.0,This place has gone down hill. Clearly they h...,place gone hill clearly cut back staff food qu...,-1
8,b2jN2mm9Wf3RcrZCgfo1cg,2.0,I was really looking forward to visiting after...,really looking forward visiting beers man war ...,0
9,oxwGyA17NL6c5t1Etg5WgQ,3.0,It's a giant Best Buy with 66 registers. I do...,giant best buy registers get big deal place,0


In [14]:
def transform_to_features(data, dense=True):
    """Turn an iterable of text documents into a bag-of-words count matrix.

    A fresh ``CountVectorizer`` (word analyzer, no lowercasing) is fitted on
    ``data`` and the resulting document-term counts are returned.

    Parameters
    ----------
    data : iterable of str
        The documents to vectorize.
    dense : bool, default True
        If True, convert the result with ``.toarray()`` before returning it.
        A dense array holds one cell per (document, vocabulary term) pair and
        can need a very large amount of memory on a big corpus; pass
        ``dense=False`` to get the vectorizer's sparse output unchanged.

    Returns
    -------
    array-like
        The count features, one row per document: a dense array when
        ``dense`` is True, otherwise whatever ``fit_transform`` returned.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(analyzer='word', lowercase=False)
    features = vectorizer.fit_transform(data)
    return features.toarray() if dense else features

In [15]:
# Vectorize the cleaned review text into bag-of-words count features.
feature = transform_to_features(reviews["new_review"])

In [18]:
# Number of rows in the feature matrix.
len(feature)

100000

In [19]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split of the features and the `new_star` labels,
# with a fixed seed so the partition is repeatable.
split_options = dict(train_size=0.80, test_size=0.20, random_state=123)
x_train, x_test, y_train, y_test = train_test_split(
    feature, reviews['new_star'], **split_options
)

In [20]:
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression on the training split,
# then predict labels for the held-out rows.
log_model = LogisticRegression().fit(x_train, y_train)
y_pred = log_model.predict(x_test)

In [21]:
from sklearn.metrics import accuracy_score

# Accuracy of the predictions against the held-out labels.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy={}".format(test_accuracy))

Accuracy=0.827


In [22]:
from sklearn.model_selection import cross_val_score

# Pin the fold count: the default `cv` changed from 3 to 5 in scikit-learn 0.22,
# so spell out the 3-fold split that this notebook's recorded scores came from.
score = cross_val_score(log_model, x_train, y_train, cv=3)

In [23]:
# One score per cross-validation fold on the training split.
print('Cross Validation Scores: ', score)

Cross Validation Scores:  [0.82105895 0.82266472 0.82223889]


In [24]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out split. The predictions are
# recomputed here so the report does not depend on a stale `y_pred`; the former
# `y_pred = []` placeholder was a dead store and has been removed.
y_pred = log_model.predict(x_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

         -1       0.76      0.72      0.74      2871
          0       0.62      0.51      0.56      3797
          1       0.89      0.94      0.91     13332

avg / total       0.82      0.83      0.82     20000

