In [1]:
import numpy as np
import pandas as pd
import json
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from nltk.tokenize import word_tokenize

import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

import tensorflow as tf

In [2]:
with open("reviewSelected100.json", 'r') as read_file:
    data = [json.loads(line) for line in read_file]

In [3]:
#Store all reviews in a Pandas DataFrame 
reviews = []
for rev in range(len(data)):
    reviews.append(data[rev]['text'])
    
rev_df = pd.DataFrame(reviews, columns = ['Reviews'])
rev_df.head()

Unnamed: 0,Reviews
0,We had my Mother's Birthday Party here on 10/2...
1,Good Korean grill near Eaton Centre. The marin...
2,Was recommended to try this place by few peopl...
3,Ambience: Would not expect something this nice...
4,Absolutely the WORST pool company that I have ...


In [4]:
#Clean data 
import re

#1. Removes Punctuations
def remove_punctuations(data):
    punct_tag=re.compile(r'[^\w\s]')
    data=punct_tag.sub(r'',data)
    return data

#2. Removes HTML syntaxesn(if reviews provide business links)
def remove_html(data):
    html_tag=re.compile(r'<.*?>')
    data=html_tag.sub(r'',data)
    return data

#3. Removes URL data (if reviews provide business links)
def remove_url(data):
    url_clean= re.compile(r"https://\S+|www\.\S+")
    data=url_clean.sub(r'',data)
    return data

#4. Removes Emojis (if reviews contains expressions)
def remove_emoji(data):
    emoji_clean= re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    data=emoji_clean.sub(r'',data)
    url_clean= re.compile(r"https://\S+|www\.\S+")
    data=url_clean.sub(r'',data)
    return data

#5. Lemmatize the corpus
def lemma_traincorpus(data):
    lemmatizer=WordNetLemmatizer()
    out_data=""
    for words in data:
        out_data+= lemmatizer.lemmatize(words)
    return out_data

rev_df['Reviews']=rev_df['Reviews'].apply(lambda z: remove_punctuations(z))
rev_df['Reviews']=rev_df['Reviews'].apply(lambda z: remove_html(z))
rev_df['Reviews']=rev_df['Reviews'].apply(lambda z: remove_url(z))
rev_df['Reviews']=rev_df['Reviews'].apply(lambda z: remove_emoji(z))
rev_df['Reviews']=rev_df['Reviews'].apply(lambda z: lemma_traincorpus(z))

In [5]:
rev_df_clean = rev_df
rev_df_clean.head()

Unnamed: 0,Reviews
0,We had my Mothers Birthday Party here on 10291...
1,Good Korean grill near Eaton Centre The marina...
2,Was recommended to try this place by few peopl...
3,Ambience Would not expect something this nice ...
4,Absolutely the WORST pool company that I have ...


In [6]:
#Feature Engineering 

'''
Separate the reviews into positive and negative using nltk sentiment vader
'''
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk_sent = SentimentIntensityAnalyzer()

rev_df_clean['ratings'] = rev_df_clean["Reviews"].apply(lambda x: nltk_sent.polarity_scores(x))
rev_df_new = pd.concat([rev_df_clean.drop(['ratings'], axis=1), rev_df_clean['ratings'].apply(pd.Series)], axis=1)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/esther/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [7]:
rev_df_new['comp_rating'] = rev_df_new['compound'].apply(lambda c: 'pos' if c >=0 else 'neg')

rev_df_new.head()

Unnamed: 0,Reviews,neg,neu,pos,compound,comp_rating
0,We had my Mothers Birthday Party here on 10291...,0.0,0.592,0.408,0.93,pos
1,Good Korean grill near Eaton Centre The marina...,0.055,0.736,0.208,0.9448,pos
2,Was recommended to try this place by few peopl...,0.006,0.687,0.307,0.9975,pos
3,Ambience Would not expect something this nice ...,0.094,0.759,0.148,0.8318,pos
4,Absolutely the WORST pool company that I have ...,0.09,0.885,0.026,-0.9402,neg


In [8]:
upd_df = rev_df_new.drop(columns=['neg','neu','pos','compound'])
upd_df.head()

Unnamed: 0,Reviews,comp_rating
0,We had my Mothers Birthday Party here on 10291...,pos
1,Good Korean grill near Eaton Centre The marina...,pos
2,Was recommended to try this place by few peopl...,pos
3,Ambience Would not expect something this nice ...,pos
4,Absolutely the WORST pool company that I have ...,neg


In [9]:
upd_df['comp_rating'].value_counts()

pos    12946
neg     2354
Name: comp_rating, dtype: int64

#### Observation: Dataset contains a much larger number of positive reviews. 

In [10]:
from keras.preprocessing.text import Tokenizer

X = np.array(upd_df['Reviews'])
y = upd_df['comp_rating']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

Using TensorFlow backend.


In [12]:
#ensure all sentences are same size 
X = pad_sequences(X, maxlen=500)

In [13]:
X = np.array(X)
y = np.array(y)

# to convert labels to integers and vice-versa
label2int = {"pos": 1, "neg": 0}
int2label = {1: "pos", 0: "neg"}

In [14]:
y = [ label2int[label] for label in y ]
y = to_categorical(y)

In [15]:
train_x,test_x,train_y,test_y=train_test_split(X,y,test_size=0.2,random_state=42)
train_x.shape,train_y.shape,test_x.shape,test_y.shape

((12240, 500), (12240, 2), (3060, 500), (3060, 2))

In [16]:
rf = RandomForestClassifier(n_estimators=500)
rf.fit(train_x, train_y)
print ("Accuracy: %s" 
       % metrics.accuracy_score(test_y, rf.predict(test_x)))

Accuracy: 0.8477124183006536


In [19]:
test_x

array([[    0,     0,     0, ...,  1529, 14783,  3268],
       [    0,     0,     0, ...,     3,   136, 13337],
       [    0,     0,     0, ...,  1564,     2,   234],
       ...,
       [    0,     0,     0, ...,   450,   357,     9],
       [    0,     0,     0, ...,   231,     4,   104],
       [    0,     0,     0, ...,  4836,     2,    30]], dtype=int32)