# Supervised Methods for Text Sentiment Analysis

#### Description:

This codebook covers how to use supervised learning methods for text sentiment analysis.

#### Skill level:

- Intermediate

### Import the required libraries
-------------------------

In [1]:
import os
import sys

# Resolve the platform root: three directories above the current working
# directory. The project-local HELPERS package and the DATA folder are
# both looked up relative to this path.
platform_path = os.path.abspath(os.path.join(os.path.abspath(''), '../../../'))

# Guard the append so re-running this cell does not pile up duplicate
# entries in sys.path.
if platform_path not in sys.path:
    sys.path.append(platform_path)

In [2]:
import numpy as np
import pandas as pd
import HELPERS.data_preprocessing.text_normalizer as tn
import HELPERS.machine_learning.model_evaluation as me
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

### Read data into a Pandas dataframe
-------------------------

In [3]:
# Load the movie-review CSV from the DATA folder under the platform root.
df_raw = pd.read_csv(os.path.join(platform_path, 'DATA/movie_reviews.csv'))

### Check the shape and head of the dataframe
-------------------------

In [4]:
# (rows, columns) of the raw dataframe.
df_raw.shape

(50000, 2)

In [5]:
# Preview the first rows: `review` holds the text, `sentiment` the label.
df_raw.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
# Optional: keep only the first 10,000 rows so that normalization and model
# fitting run faster. Skip this cell to work with the full dataset.
df_raw = df_raw.iloc[:10000]

### Separate features from the label
-------------------------

In [7]:
# Pull the review text (features) and the sentiment label (target) out of
# the dataframe as two NumPy arrays, in that order.
X_all, y_true_all = (
    np.array(df_raw[column]) for column in ('review', 'sentiment')
)

### Make a split between training and test sets of data
-------------------------

In [8]:
def shuffle_split_data(y_true_all, X_all, test_size, random_state=None, stratify=False):
    """Shuffle the data and split it into training and test sets.

    Thin wrapper around scikit-learn's ``train_test_split``. Note that both
    the argument order (labels first, features second) and the return order
    differ from ``train_test_split`` itself.

    Parameters
    ----------
    y_true_all : array-like
        Labels, one per sample.
    X_all : array-like
        Features, aligned with ``y_true_all``.
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.
    random_state : int, RandomState instance or None, default=None
        Seed for the shuffle. ``None`` (the default) preserves the original
        behaviour of drawing from NumPy's global random state; pass an int
        to make the split reproducible.
    stratify : bool, default=False
        When True, stratify the split on ``y_true_all`` so that both sets
        keep the label proportions of the full data. False preserves the
        original, unstratified behaviour.

    Returns
    -------
    tuple
        ``(X_train, y_true_train, X_test, y_true_test)``.
    """
    X_train, X_test, y_true_train, y_true_test = train_test_split(
        X_all,
        y_true_all,
        test_size=test_size,
        random_state=random_state,
        stratify=y_true_all if stratify else None,
    )

    # Re-order so features and labels of the same split sit next to each other.
    return X_train, y_true_train, X_test, y_true_test

In [9]:
# Seed NumPy's global random state before splitting: train_test_split falls
# back to it when no random_state is supplied, so without a seed every run
# produces a different split and the metrics below cannot be reproduced.
np.random.seed(42)

# Hold out 30% of the samples for testing.
X_train, y_true_train, X_test, y_true_test = shuffle_split_data(y_true_all, X_all, test_size=0.3)

### Normalize the feature data
-------------------------

In [10]:
# Run the project's text normalizer over each split separately.
# NOTE(review): the actual cleaning steps live in
# HELPERS.data_preprocessing.text_normalizer and are not visible here.
X_train_norm = tn.normalize_corpus(X_train)
X_test_norm = tn.normalize_corpus(X_test)

### Create a vectorizer to count the uni-grams in each document
-------------------------

In [11]:
# Bag-of-words counts. min_df=0. and max_df=1. are document-frequency
# proportions, so no term is dropped for being too rare or too common;
# binary=False keeps raw counts rather than presence flags.
cv = CountVectorizer(min_df=0., max_df=1., binary=False)

# Learn the vocabulary from the training documents only. The result is kept
# as the sparse matrix that fit_transform returns: a dense copy of a
# 7000 x ~55k count matrix needs roughly 3 GB, and LogisticRegression
# accepts sparse input directly.
X_train_norm_cv = cv.fit_transform(X_train_norm)

In [12]:
# (training documents, vocabulary terms learned by `cv`).
X_train_norm_cv.shape

(7000, 55158)

In [13]:
# Use transform (not fit_transform) so the test documents are mapped onto
# the vocabulary learned from the training set and share its columns.
# The matrix stays sparse: densifying it is memory-hungry and unnecessary,
# since the classifier's predict accepts sparse input.
X_test_norm_cv = cv.transform(X_test_norm)

In [14]:
# (test documents, vocabulary terms) - column count must match the training matrix.
X_test_norm_cv.shape

(3000, 55158)

### Create a vectorizer to calculate the tf-idf measure in each document
-------------------------

In [15]:
# TF-IDF weights. min_df=0. and max_df=1. are document-frequency
# proportions, so no term is filtered out; use_idf=True enables the
# inverse-document-frequency reweighting.
tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)

# Learn vocabulary and IDF weights from the training documents only. The
# result is kept as the sparse matrix that fit_transform returns: a dense
# float copy of a 7000 x ~55k matrix needs roughly 3 GB, and
# LogisticRegression accepts sparse input directly.
X_train_norm_tv = tv.fit_transform(X_train_norm)

In [16]:
# (training documents, vocabulary terms learned by `tv`).
X_train_norm_tv.shape

(7000, 55158)

In [17]:
# Use transform (not fit_transform) so the test documents are weighted with
# the vocabulary and IDF values learned from the training set. The matrix
# stays sparse: densifying it is memory-hungry and unnecessary, since the
# classifier's predict accepts sparse input.
X_test_norm_tv = tv.transform(X_test_norm)

In [18]:
# (test documents, vocabulary terms) - column count must match the training matrix.
X_test_norm_tv.shape

(3000, 55158)

### Fit a logistic regression model using the count data
-------------------------

In [19]:
# L2-penalised logistic regression. max_iter=1000 raises the solver's
# iteration cap - presumably so it can converge on this very wide
# feature matrix (TODO confirm).
clf = LogisticRegression(penalty='l2', max_iter=1000)

# Fit on the count features; as the cell's last expression, the fitted
# estimator is also displayed.
clf.fit(X_train_norm_cv, y_true_train)

LogisticRegression(max_iter=1000)

### Generate predictions using the fitted model
-------------------------

In [20]:
# Predict labels for both splits with the count-based model, so training
# and test performance can be compared.
y_pred_train_cv = clf.predict(X_train_norm_cv)
y_pred_test_cv = clf.predict(X_test_norm_cv)

### Check common classification metrics for training and test sets of data
-------------------------

In [21]:
# Scores on the data the model was fitted on: useful for spotting
# overfitting, not as an estimate of generalization.
me.get_classification_metrics(y_true_train, y_pred_train_cv)

accuracy_score: 1.0
precision_score: 1.0
recall_score: 1.0
f1_score: 1.0


In [22]:
# Scores on the held-out test set; compare with the training scores above
# to see how much the count-based model overfits.
me.get_classification_metrics(y_true_test, y_pred_test_cv)

accuracy_score: 0.8743
precision_score: 0.8743
recall_score: 0.8743
f1_score: 0.8743


### Fit a logistic regression model using the tf-idf measure data
-------------------------

In [23]:
# Refit the same estimator object on the tf-idf features. This replaces the
# coefficients learned from the count features, so from here on `clf` is
# the tf-idf model.
# NOTE(review): re-running the earlier count-based prediction cell after
# this one would silently use the tf-idf model - consider a separate
# estimator per feature set.
clf.fit(X_train_norm_tv, y_true_train)

LogisticRegression(max_iter=1000)

In [24]:
# Predict labels for both splits with the tf-idf-based model.
y_pred_train_tv = clf.predict(X_train_norm_tv)
y_pred_test_tv = clf.predict(X_test_norm_tv)

### Check common classification metrics for training and test sets of data
-------------------------

In [25]:
# Training-set scores for the tf-idf model (fit quality, not generalization).
me.get_classification_metrics(y_true_train, y_pred_train_tv)

accuracy_score: 0.9543
precision_score: 0.9544
recall_score: 0.9543
f1_score: 0.9543


In [26]:
# Held-out test scores for the tf-idf model; compare with the count-based
# test scores to judge which representation works better.
me.get_classification_metrics(y_true_test, y_pred_test_tv)

accuracy_score: 0.8857
precision_score: 0.8858
recall_score: 0.8857
f1_score: 0.8856
