In [1]:
import numpy as np
import pandas as pd


In [2]:
# Load the tab-separated Yelp review file. reset_index() moves whatever pandas
# used as the row index into an ordinary column (renamed from 'index' to
# 'review' further down).
# NOTE(review): absolute, machine-specific path — point this at your own copy.
yelp_csv_path = 'D:/MANIPAL/study/datasets/yelp_labelled.csv'
data = pd.read_csv(yelp_csv_path, sep='\t').reset_index()

In [3]:
# (rows, columns) of the freshly loaded frame — quick sanity check on the parse.
data.shape

(1000, 2)

In [38]:
# Preview the first rows to see how the columns were parsed.
data.head()

Unnamed: 0,review,review sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [42]:
# Give the two columns short names. The second key is spelled with two
# consecutive spaces and must stay byte-for-byte as is (presumably it mirrors
# the raw file header — verify against the file).
column_renames = {
    'index': 'review',
    'review  sentiment': 'sentiment',
}
data.rename(columns=column_renames, inplace=True)

In [43]:
# Lower-case every review, then drop each character that is not a-z, a space,
# '@' or '#'. regex=True is required: since pandas 2.0 Series.str.replace
# treats the pattern as a literal string by default, which would leave the
# punctuation in place.
docs = data['review'].str.lower().str.replace('[^a-z @#]', '', regex=True)

In [44]:
# Check the normalised text: lower-case, punctuation removed.
docs.head()

0                                 wow loved this place
1                                    crust is not good
2             not tasty and the texture was just nasty
3    stopped by during the late may bank holiday of...
4    the selection on the menu was great and so wer...
Name: review, dtype: object

In [46]:
import nltk

# English stop-word list shipped with the NLTK corpus.
# NOTE(review): requires the 'stopwords' corpus to be available locally
# (nltk.download('stopwords')) — confirm on a fresh environment.
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)  # 179 entries in the NLTK version that produced the recorded output

179

In [47]:
stemmer = nltk.stem.PorterStemmer()

# Set copy of the stop-word list: each membership test is a hash lookup
# instead of a linear scan of the list for every token.
_stopword_set = set(stopwords)


def clean_doc(doc):
    """Tokenise one review, drop stop words and Porter-stem what remains.

    Parameters
    ----------
    doc : str
        A single review text (the notebook passes lower-cased text with
        punctuation already stripped).

    Returns
    -------
    str
        The remaining stems joined by single spaces.

    NOTE(review): the recorded output turns "crust is not good" into
    "crust good", i.e. negations are removed together with the other stop
    words. For sentiment modelling that may be worth revisiting.
    """
    tokens = nltk.word_tokenize(doc)
    stems = [stemmer.stem(token) for token in tokens if token not in _stopword_set]
    return ' '.join(stems)


docs_clean = docs.apply(clean_doc)

In [48]:
# Inspect the result of stop-word removal and stemming.
docs_clean.head()

0                                       wow love place
1                                           crust good
2                                   tasti textur nasti
3    stop late may bank holiday rick steve recommen...
4                              select menu great price
Name: review, dtype: object

In [49]:
# Conversion of text column to numerical values: Document Term Matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out 20 % of the cleaned reviews; the fixed seed keeps the split repeatable.
train, test = train_test_split(docs_clean,
                               test_size=0.2,
                               random_state=1)

# Learn the vocabulary from the training reviews only; min_df=2 ignores terms
# that appear in fewer than two training documents.
vectorizer = CountVectorizer(min_df=2)
vectorizer.fit(train)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since 1.0). Fall back to the old name only on
# installations that predate it, and look the names up once for both frames.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense term-count matrices, one column per vocabulary term.
train_x = pd.DataFrame(vectorizer.transform(train).toarray(),
                       columns=feature_names)
test_x = pd.DataFrame(vectorizer.transform(test).toarray(),
                      columns=feature_names)

train_x.shape

(800, 615)

In [50]:
# Select the labels through the original row labels carried by the split, so
# every label lines up with its review. The test_y statement must sit on one
# line: a line starting with `.loc` after `test_y = data` is a SyntaxError.
train_y = data.loc[train.index, 'sentiment']
test_y = data.loc[test.index, 'sentiment']

In [57]:
# Model Building
## Multinomial Naive Bayes on the term-count matrix
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained.
mn_model = MultinomialNB().fit(train_x, train_y)
test_pred = mn_model.predict(test_x)

# Accuracy on the held-out reviews.
mn_accuracy = accuracy_score(test_y, test_pred)
print(mn_accuracy)

0.71


In [59]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# AdaBoost with 50 estimators and a fixed seed. Stored under ada_* names: this
# is not a random forest, and the random-forest cell further down assigns
# rf_model / rf_predict itself.
ada_model = AdaBoostClassifier(random_state=100, n_estimators=50)
ada_model.fit(train_x, train_y)
ada_predict = ada_model.predict(test_x)

# Accuracy on the held-out reviews (displayed as the cell result).
accuracy_score(test_y, ada_predict)


0.76

In [60]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed; fit() returns the estimator,
# so it is trained in the same statement that creates it.
rf_model = RandomForestClassifier(n_estimators=100,
                                  random_state=100).fit(train_x, train_y)
rf_predict = rf_model.predict(test_x)

# Accuracy on the held-out reviews (displayed as the cell result).
accuracy_score(test_y, rf_predict)


0.755

In [61]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sentiment = SentimentIntensityAnalyzer()


def get_sentiment(doc):
    """Return 1 when VADER's compound score for `doc` is above zero, else 0."""
    compound = sentiment.polarity_scores(doc)['compound']
    return 1 if compound > 0 else 0


# Scored on the raw review text of every row in `data`, not only on the
# held-out split used for the trained models above, so the accuracies are not
# measured on the same set of rows.
sent_pred = data['review'].apply(get_sentiment)
vader_accuracy = accuracy_score(data['sentiment'], sent_pred)
print(vader_accuracy)



0.812
