# Sentiment Analysis

# Supervised Learning - IMDB

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labeled reviews from a tab-separated file whose first row is the header.
# quoting=3 is csv.QUOTE_NONE: double quotes in the text are kept as ordinary characters.
review = pd.read_csv(
    './labeledTrainData.tsv',
    sep="\t",
    header=0,
    quoting=3,
)
review.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [3]:
# Show the first review as loaded (it still contains <br /> markup).
print(review.loc[0, 'review'])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [4]:
import re

# Replace HTML line breaks with a space. '<br />' is meant as a literal string,
# so regex=False is passed explicitly: the default for `regex` differs between
# pandas versions, and a literal match does not depend on it.
review['review'] = review['review'].str.replace('<br />', ' ', regex=False)

In [5]:
# Show the first review again to check that the <br /> markup is gone.
print(review.loc[0, 'review'])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.  Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.  The actual feature film bit when it finally starts is only on for 2

In [6]:
# Turn every character that is not an ASCII letter (digits, punctuation, quotes)
# into a single space; string length is preserved, so runs of spaces can appear.
review['review'] = [re.sub("[^a-zA-Z]", " ", text) for text in review['review']]
print(review.loc[0, 'review'])

 With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for  

In [10]:
from sklearn.model_selection import train_test_split

# Separate the label column from the feature frame (everything except id and label).
target_df = review['sentiment']
feature_df = review.drop(['id', 'sentiment'], axis=1, inplace=False)

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(feature_df, target_df, test_size=0.3, random_state=156)

# Report the shapes of both splits. The original line repeated X_train.shape and,
# being a bare expression in the middle of the cell, was never displayed.
print(X_train.shape, X_test.shape)
print(X_train)
print(y_train)

                                                  review
3724    This version moved a little slow for my taste...
23599   I really enjoyed this film because I have a t...
11331   Saw this in the theater in     and fell out o...
15745   Recently I was looking for the newly issued W...
845     Escaping the life of being pimped by her fath...
...                                                  ...
6955    This is a generally nice film  with good stor...
7653    The real shame of   The Gathering   is not in...
9634    In what could have been an otherwise run of t...
6860    Excellent P O W  adventure  adapted by Eric W...
24108   This one features all the  bad  effect of Pri...

[17500 rows x 1 columns]
3724     0
23599    1
11331    1
15745    1
845      1
        ..
6955     1
7653     0
9634     0
6860     1
24108    0
Name: sentiment, Length: 17500, dtype: int64


## Text Vectorization & Logistic Regression

### 1. CountVectorizer

In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Count-based pipeline: unigram + bigram counts with English stop words removed,
# fed into a logistic regression classifier (C=10).
pipeline = Pipeline(steps=[
    ('cnt_vect', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('lr_clf', LogisticRegression(C=10)),
])



Pipeline(memory=None,
         steps=[('cnt_vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('lr_clf',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
               

In [20]:
# Fit on the training reviews, then predict labels for the held-out reviews.
pipeline.fit(X_train['review'], y_train)
pred = pipeline.predict(X_test['review'])
# Keep only column 1 of predict_proba: the score used for ROC-AUC.
pred_probs = pipeline.predict_proba(X_test['review'])[:, 1]

print('Accuracy: {0:.4f}, ROC-AUC: {1:.4f}'.format(
    accuracy_score(y_test, pred),
    roc_auc_score(y_test, pred_probs),
))



Accuracy: 0.8860, ROC-AUC: 0.9503


### 2. TF-IDF

In [21]:
# TF-IDF pipeline: same n-gram range, stop words and classifier settings as the
# count-based pipeline, but with TF-IDF weighted features. Rebinds `pipeline`.
pipeline = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('lr_clf', LogisticRegression(C=10)),
])

In [22]:
# Fit the TF-IDF pipeline and score it on the same held-out reviews.
pipeline.fit(X_train['review'], y_train)
pred = pipeline.predict(X_test['review'])
# Keep only column 1 of predict_proba: the score used for ROC-AUC.
pred_probs = pipeline.predict_proba(X_test['review'])[:, 1]

print('Accuracy: {0:.4f}, ROC-AUC: {1:.4f}'.format(
    accuracy_score(y_test, pred),
    roc_auc_score(y_test, pred_probs),
))



Accuracy: 0.8936, ROC-AUC: 0.9598


# Unsupervised Learning

## VADER (Rule-based Lexicon)

In [23]:
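# One-time setup: uncomment and run once to download the NLTK data
# (left commented out so a full re-run does not download it again).
# import nltk
# nltk.download('all')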

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/jiwanhwang/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/jiwanhwang/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /Users/jiwanhwang/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/jiwanhwang/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     /Users/jiwanhwang/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     /Users/jiwanhwang/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading pack

[nltk_data]    |   Unzipping corpora/senseval.zip.
[nltk_data]    | Downloading package sentiwordnet to
[nltk_data]    |     /Users/jiwanhwang/nltk_data...
[nltk_data]    |   Unzipping corpora/sentiwordnet.zip.
[nltk_data]    | Downloading package sentence_polarity to
[nltk_data]    |     /Users/jiwanhwang/nltk_data...
[nltk_data]    |   Unzipping corpora/sentence_polarity.zip.
[nltk_data]    | Downloading package shakespeare to
[nltk_data]    |     /Users/jiwanhwang/nltk_data...
[nltk_data]    |   Unzipping corpora/shakespeare.zip.
[nltk_data]    | Downloading package sinica_treebank to
[nltk_data]    |     /Users/jiwanhwang/nltk_data...
[nltk_data]    |   Unzipping corpora/sinica_treebank.zip.
[nltk_data]    | Downloading package smultron to
[nltk_data]    |     /Users/jiwanhwang/nltk_data...
[nltk_data]    |   Unzipping corpora/smultron.zip.
[nltk_data]    | Downloading package state_union to
[nltk_data]    |     /Users/jiwanhwang/nltk_data...
[nltk_data]    |   Unzipping corpora/st

True

In [27]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Score the first review with VADER and show the neg/neu/pos/compound values.
# NOTE(review): by this point the text has had all non-letter characters replaced
# with spaces; VADER may behave differently on the raw text - confirm which is intended.
analyzer = SentimentIntensityAnalyzer()
scores = analyzer.polarity_scores(review.loc[0, 'review'])
print(scores)

{'neg': 0.13, 'neu': 0.743, 'pos': 0.127, 'compound': -0.7943}


In [30]:
def vader_polarity(review, threshold=0.1, analyzer=None):
    """Classify one review as positive (1) or negative (0) using VADER.

    Args:
        review: Text of a single review. Inside this function the name refers
            to that string, not to the notebook-level ``review`` DataFrame.
        threshold: Minimum compound score for a positive label.
        analyzer: Optional object exposing ``polarity_scores(text)`` that
            returns a mapping with a ``'compound'`` entry. When omitted, one
            ``SentimentIntensityAnalyzer`` is created on first use and reused
            by later calls.

    Returns:
        1 if the compound score is >= ``threshold``, otherwise 0.
    """
    if analyzer is None:
        # Creating a new analyzer for every review repeats its setup work on
        # each call, so keep a single instance on the function object.
        analyzer = getattr(vader_polarity, '_analyzer', None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            vader_polarity._analyzer = analyzer

    agg_score = analyzer.polarity_scores(review)['compound']
    return 1 if agg_score >= threshold else 0

In [31]:
# Label every review with VADER (threshold 0.1) and store the result as a new column.
review['vader_preds'] = review['review'].apply(vader_polarity, args=(0.1,))

# Plain arrays of the true labels and the VADER labels for evaluation.
y_target = review['sentiment'].values
vader_preds = review['vader_preds'].values

In [36]:
def get_clf_eval(y_test, pred, pred_proba=None):
    """Print the confusion matrix and summary metrics for a set of predictions.

    Args:
        y_test: True labels.
        pred: Predicted class labels.
        pred_proba: Optional scores or probabilities for the positive class.
            When given, AUC is computed from these values; when omitted, AUC
            is computed from ``pred`` exactly as before.

    Returns:
        None. The confusion matrix and metrics are printed.
    """
    # roc_auc_score is imported here together with the other metrics so the
    # function no longer depends on an import made in a different cell.
    from sklearn.metrics import (
        accuracy_score,
        confusion_matrix,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    # AUC is a ranking metric: use continuous scores when the caller has them,
    # and fall back to the hard labels otherwise.
    auc_input = pred if pred_proba is None else pred_proba
    roc_auc = roc_auc_score(y_test, auc_input)

    print("Confusion Matrix")
    print(confusion)
    print("Accuracy: {0:.4f}, Precision: {1:.4f}, Recall: {2:.4f}, f1: {3:.4f}, AUC: {4:.4f}".format(accuracy, precision, recall, f1, roc_auc))

In [37]:
# Compare the VADER labels with the true sentiment for every review. The AUC
# reported here is computed from the 0/1 predictions, not from continuous scores.
get_clf_eval(y_target, vader_preds)

Confusion Matrix
[[ 6736  5764]
 [ 1867 10633]]
Accuracy: 0.6948, Precision: 0.6485, Recall: 0.8506, f1: 0.7359, AUC: 0.6948
