In [49]:
import pandas as pd
import nltk
from nltk.corpus import stopwords, sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from nltk.stem import WordNetLemmatizer

In [50]:
df = pd.read_csv('oceans_12Reviews.csv')

In [51]:
def give_sentiment(rating):
    if(rating > 5):
        return 'Positive'
    elif (rating == 5):
        return 'Neutral'
    else:
        return 'Negative'

In [52]:
df['overall_sentiment'] = df['ratings'].apply(lambda x: give_sentiment(x))

Pre Trained VADER sentiment classification

In [53]:
def vader_sentiment(text):
    analyser = SentimentIntensityAnalyzer()
    sentiment_score = analyser.polarity_scores(text)
    if(sentiment_score['compound'] > 0):
        return 'Positive'
    elif(sentiment_score['compound'] == 0):
        return 'Neutral'
    else:
        return 'Negative'

In [54]:
df['vader_sentiment'] = df['reviews_clean'].apply(lambda x: vader_sentiment(x))


In [55]:
print('Proportion of incorrectly classified reviews: ' + str(df[df['overall_sentiment'] != df['vader_sentiment']].size/df.size))

Proportion of incorrectly classified reviews: 0.516


Dropping stopwords

In [56]:
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    return ' '.join([w for w in nltk.word_tokenize(text) if w not in stopwords])


In [58]:
df['tokenized_no_stopwords_reviews'] = df['reviews_clean'].apply(lambda x: remove_stopwords(x))
df['vader_stopwords_classification'] = df['tokenized_no_stopwords_reviews'].apply(
    lambda x: vader_sentiment(x))
print('Proportion of incorrectly classified reviews: ' +
      str(df[df['overall_sentiment'] != df['vader_stopwords_classification']].size/df.size))


Proportion of incorrectly classified reviews: 0.636


In [59]:
cv = CountVectorizer()
text_vectorized = cv.fit_transform(df['tokenized_no_stopwords_reviews'])
xtrain, xtest, ytrain, ytest = train_test_split(text_vectorized, df['overall_sentiment'], test_size = 0.2, random_state=42)


In [60]:
mnb = MultinomialNB()
mnb.fit(xtrain, ytrain)
pred = mnb.predict(xtest)
accuracy_score = metrics.accuracy_score(pred, ytest)
print(accuracy_score)

0.53


In [61]:
cv = CountVectorizer(ngram_range=(1,2))
text_vectorized2 = cv.fit_transform(df['tokenized_no_stopwords_reviews'])
xtrain, xtest, ytrain, ytest = train_test_split(
    text_vectorized2, df['overall_sentiment'], test_size=0.2, random_state=42)
mnb.fit(xtrain, ytrain)
pred = mnb.predict(xtest)
accuracy_score = metrics.accuracy_score(pred, ytest)
print(accuracy_score)


0.51


In [57]:
def lemmatize(word):
    lem = WordNetLemmatizer()
    return lem.lemmatize(word)

In [62]:
df['lemmatized_text'] = df['tokenized_no_stopwords_reviews'].apply(
    lambda x: lemmatize(x))

In [66]:
cv = CountVectorizer(ngram_range=(1, 1))
text_vectorized3 = cv.fit_transform(df['lemmatized_text'])
xtrain, xtest, ytrain, ytest = train_test_split(
    text_vectorized3, df['overall_sentiment'], test_size=0.2, random_state=42)
mnb.fit(xtrain, ytrain)
pred = mnb.predict(xtest)
accuracy_score = metrics.accuracy_score(pred, ytest)
print(accuracy_score)


0.53


In [67]:
df.iloc[0]['lemmatized_text']

'movie downside many reviewers think think tries ambitious fault trying top predecessor oceans eleven evident gimmick raise level building three inches using underwater hydraulic contraptions couldnt follow logistics escapade much less think feasible sheer audacity idea sounded compelling get come linus caldwells matt damon idea lookie loo bundle joy brilliant else would come idea julia roberts impersonating julia roberts movie almost good bruce campbell portraying elvis presley turn impersonates elvis presley impersonator bubba ho tep try wrapping heard around one julia roberts provides number outing reunites usual gang idiots first movie good see cheadle got credit flick time around agree theres lot confused story youre paying attention need glued picture throughout noteworthy scenes include tesss roberts encounter bruce willis vincent cassels turn slow motion break dancer laser field jump end caper turns almost secondary zaniness involved none believable believable oceans eleven com

In [75]:
from sklearn.svm import LinearSVC
cv = CountVectorizer(ngram_range=(2, 2
                                  ))
text_vectorized4 = cv.fit_transform(df['lemmatized_text'])
xtrain, xtest, ytrain, ytest = train_test_split(
    text_vectorized4, df['overall_sentiment'], test_size=0.2, random_state=42)
svc = LinearSVC()
svc.fit(xtrain, ytrain)
print('accuracy score : ' + str(metrics.accuracy_score(svc.predict(xtest), ytest)))

accuracy score : 0.41
