In [None]:
! pip install session_info

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
! pip install gensim==3.8.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
import re
import session_info

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Select seaborn's 'talk' plotting context for figures drawn in this notebook.
sns.set_context('talk')

In [None]:
# Display the session_info report for this run
# (presumably the loaded package versions — confirm against the rendered output).
session_info.show()

In [None]:
# Fetch the NLTK resources used by the preprocessing cells below.
nltk.download('stopwords')  # English stop-word list read into STOP_WORDS
nltk.download('punkt')      # presumably required by nltk.word_tokenize — confirm
nltk.download('wordnet')    # presumably required by WordNetLemmatizer — confirm
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Data

In [None]:
# CSV file holding the class corpus; the bare file name is resolved relative
# to the notebook's working directory.
data_file = 'DSP453_ClassCorpus_v1.csv'

In [None]:
# Load the corpus into a DataFrame (one row per document).
class_corpus = pd.read_csv(data_file)

In [None]:
# Inspect column names, dtypes and non-null counts of the raw corpus.
class_corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Doc_ID                    100 non-null    int64 
 1   DSI_Title                 100 non-null    object
 2   Text                      100 non-null    object
 3   Submission File Name      100 non-null    object
 4   Student Name              100 non-null    object
 5   Genre of Movie            100 non-null    object
 6   Review Type (pos or neg)  100 non-null    object
 7   Movie Title               100 non-null    object
dtypes: int64(1), object(7)
memory usage: 6.4+ KB


# Preprocessing

In [None]:
# English stop words from NLTK, held in a set so that the membership test in
# remove_stop_words is a hash lookup.
STOP_WORDS = set(stopwords.words('english'))

In [None]:
def remove_punctuation(text):
    '''
    Replace every character that is not an ASCII letter with a space.

    Despite the name this also blanks digits and turns any other whitespace
    into plain spaces. The argument is passed through str() first, so
    non-string values are accepted.
    '''
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', str(text))

def lower_case(text):
    '''Return a lower-cased copy of text (delegates to text.lower()).'''
    lowered = text.lower()
    return lowered

def remove_tags(text):
    '''
    Replace each markup tag (e.g. <br />, </p>) with the placeholder " <> ".

    Both literal tags (<...>) and their HTML-escaped form (&lt;...&gt;) are
    recognised. The match is non-greedy, so each tag is replaced on its own
    rather than everything between the first "<" and the last ">". "." does
    not match newlines, so a tag must open and close on the same line.
    '''
    return re.sub("(?:<|&lt;)/?.*?(?:>|&gt;)", " <> ", text)

def remove_special_chars_and_digits(text):
    '''
    Collapse every run of digits and non-word characters into one space.

    Whitespace is a non-word character, so repeated spaces are squeezed to a
    single space as well. Underscores are word characters and are kept.
    '''
    noise_run = re.compile("(\\d|\\W)+")
    return noise_run.sub(" ", text)

def remove_stop_words(tokenized_text):
    '''
    Return the tokens of tokenized_text that are not in STOP_WORDS,
    in their original order.
    '''
    kept_tokens = []
    for token in tokenized_text:
        if token not in STOP_WORDS:
            kept_tokens.append(token)
    return kept_tokens

In [None]:
def normalize(input_text):
    '''
    Normalization involves the following steps:
    1. Remove tags (i.e., HTML tags)
    2. Remove punctuation
    3. Lower case all words
    4. Remove all special characters and digits

    Tags are removed first because step 2 replaces every non-letter with a
    space: once the angle brackets are gone a tag can no longer be recognised
    and its name (e.g. "br") would stay in the text as an ordinary word.
    The input is converted with str() so non-string cells are accepted.
    '''
    text = remove_tags(str(input_text))
    text = remove_punctuation(text)
    text = lower_case(text)
    # After step 2 only letters and spaces remain, so this step mainly
    # collapses runs of spaces into a single space.
    text = remove_special_chars_and_digits(text)

    return text

In [None]:
def tokenize(text):
    '''
    Turn a normalized string into a list of content tokens:
    1. Split the text into tokens with nltk.word_tokenize
    2. Drop the tokens that are stop words (see remove_stop_words)
    '''
    return remove_stop_words(nltk.word_tokenize(text))

In [None]:
def lemmatize(tokenized_text, lemmatizer=WordNetLemmatizer()):
    '''
    Lemmatize every token of tokenized_text and return the lemmas as a list,
    in the same order.

    The default lemmatizer is created once, when the function is defined,
    and shared by all calls that do not pass their own.
    '''
    lemmas = []
    for token in tokenized_text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [None]:
def join_tokens(lemmatized_tokens):
    '''Concatenate the tokens into one string, separated by single spaces.'''
    separator = ' '
    return separator.join(lemmatized_tokens)

In [None]:
# Run the preprocessing chain; every derived column is computed from the
# column produced by the previous step, so the order of the rows matters.
for target_column, source_column, step in [
    ('normalized_review', 'Text', normalize),
    ('tokenized_review', 'normalized_review', tokenize),
    ('lemmatized_tokens', 'tokenized_review', lemmatize),
    ('lemmatized_text', 'lemmatized_tokens', join_tokens),
]:
    class_corpus[target_column] = class_corpus[source_column].apply(step)

In [None]:
# Confirm that the four derived text columns were added to the corpus.
class_corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Doc_ID                    100 non-null    int64 
 1   DSI_Title                 100 non-null    object
 2   Text                      100 non-null    object
 3   Submission File Name      100 non-null    object
 4   Student Name              100 non-null    object
 5   Genre of Movie            100 non-null    object
 6   Review Type (pos or neg)  100 non-null    object
 7   Movie Title               100 non-null    object
 8   normalized_review         100 non-null    object
 9   tokenized_review          100 non-null    object
 10  lemmatized_tokens         100 non-null    object
 11  lemmatized_text           100 non-null    object
dtypes: int64(1), object(11)
memory usage: 9.5+ KB


# Feature Extraction

## Doc2Vec

In [None]:
# Wrap each lemmatized token list in a TaggedDocument, tagged with its
# position in the corpus.
documents = []
for doc_index, tokens in enumerate(class_corpus['lemmatized_tokens']):
    documents.append(TaggedDocument(words=tokens, tags=[doc_index]))

In [None]:
# Train 200-dimensional document embeddings; min_count=1 keeps every word.
# A fixed seed together with a single worker thread makes training repeatable
# from run to run (multi-threaded training is subject to scheduling jitter).
# NOTE(review): full determinism presumably also needs PYTHONHASHSEED to be
# fixed for the kernel — confirm against the gensim documentation.
model_doc2vec = Doc2Vec(documents,
                        vector_size=200,
                        min_count=1,
                        seed=20130810,
                        workers=1)



In [None]:
# Infer one vector per document and build the feature frame in a single step:
# one row per document, one column per embedding dimension.
# Growing a DataFrame with pd.concat inside a loop copies all previous rows on
# every iteration and left every row with the index label 0; constructing it
# once from a list gives the rows a 0..n-1 index.
features_doc2vec = pd.DataFrame(
    [model_doc2vec.infer_vector(document.words) for document in documents]
)

In [None]:
# Expect one row per document and one column per embedding dimension
# (vector_size=200).
features_doc2vec.shape

(100, 200)

# Sentiment Analysis

In [None]:
# Target for the sentiment task: the review polarity column.
labels = class_corpus['Review Type (pos or neg)']

In [None]:
# Check the class balance before splitting.
labels.value_counts()

Negative    50
Positive    50
Name: Review Type (pos or neg), dtype: int64

In [None]:
# Hold out 33% of the documents for evaluation; random_state fixes the shuffle.
# NOTE(review): unlike the genre split further down (which passes
# stratify=labels) this split is not stratified, so the class proportions of
# the two parts can differ from the corpus — consider stratifying here too.
X_train, X_test, y_train, y_test = train_test_split(features_doc2vec, labels,
                                                    test_size=0.33,
                                                    random_state=20130810)

As an example, let us train a [support vector machine](https://scikit-learn.org/stable/modules/svm.html) with the default settings.

In [None]:
# Baseline: support vector classifier with its default settings.
model_svm = SVC()

In [None]:
# Fit the baseline on the training split.
model_svm.fit(X_train, y_train)

SVC()

In [None]:
# Predict the sentiment of the held-out documents.
y_pred = model_svm.predict(X_test)

In [None]:
# Show the raw predictions.
y_pred

array(['Negative', 'Negative', 'Positive', 'Negative', 'Negative',
       'Negative', 'Positive', 'Negative', 'Positive', 'Positive',
       'Negative', 'Negative', 'Positive', 'Positive', 'Positive',
       'Negative', 'Negative', 'Positive', 'Negative', 'Negative',
       'Negative', 'Positive', 'Positive', 'Negative', 'Positive',
       'Negative', 'Negative', 'Positive', 'Negative', 'Positive',
       'Negative', 'Negative', 'Negative'], dtype=object)

In [None]:
# Accuracy of the baseline on the held-out split.
accuracy_score(y_test, y_pred)

0.5151515151515151

In [None]:
# Per-class precision, recall and F1 for the baseline.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.45      0.64      0.53        14
    Positive       0.62      0.42      0.50        19

    accuracy                           0.52        33
   macro avg       0.53      0.53      0.51        33
weighted avg       0.55      0.52      0.51        33



The best parameters should, of course, be estimated through a hyperparameter tuning loop.

In [None]:
# Sweep the SVC parameter C over several orders of magnitude and report the
# accuracy of each fit.
# NOTE(review): the accuracy is measured on X_test / y_test, the same split
# that is used for the final evaluation, so the chosen C is tuned to the test
# data — consider cross-validating on the training split instead.
for C_value in (0.01, 0.1, 1, 10, 100, 1000, 10000):
    model_svm = SVC(C=C_value).fit(X_train, y_train)  # fit() returns the estimator
    y_pred = model_svm.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"C: {C_value}, Accuracy: {acc}")

C: 0.01, Accuracy: 0.42424242424242425
C: 0.1, Accuracy: 0.42424242424242425
C: 1, Accuracy: 0.5151515151515151
C: 10, Accuracy: 0.5151515151515151
C: 100, Accuracy: 0.5454545454545454
C: 1000, Accuracy: 0.5757575757575758
C: 10000, Accuracy: 0.5757575757575758


The best accuracy (about 0.576) is first reached at `C = 1000`; `C = 10000` gives the same score, so the smaller value is used.


In [None]:
# Refit with C=1000, the smallest value that reached the top accuracy in the
# recorded sweep above.
model_svm = SVC(C=1000)
model_svm.fit(X_train, y_train)

SVC(C=1000)

In [None]:
# Predictions of the refitted model on the held-out split.
model_svm.predict(X_test)

array(['Negative', 'Negative', 'Positive', 'Negative', 'Negative',
       'Negative', 'Negative', 'Negative', 'Positive', 'Positive',
       'Negative', 'Negative', 'Positive', 'Positive', 'Positive',
       'Negative', 'Negative', 'Positive', 'Negative', 'Negative',
       'Negative', 'Positive', 'Positive', 'Negative', 'Negative',
       'Negative', 'Negative', 'Positive', 'Negative', 'Positive',
       'Negative', 'Negative', 'Negative'], dtype=object)

# Multi-class Classification

In [None]:
# Target for the multi-class task: the movie genre column.
# This rebinds `labels`, which held the sentiment target until here.
labels = class_corpus['Genre of Movie']

In [None]:
# Check how many documents each genre has.
labels.value_counts()

Action    30
Horror    30
Comedy    20
Sci-Fi    20
Name: Genre of Movie, dtype: int64

In [None]:
# Hold out 33% of the documents; stratify=labels keeps the genre proportions
# similar in both parts, and random_state fixes the shuffle.
# This rebinds X_train / X_test / y_train / y_test from the sentiment task.
X_train, X_test, y_train, y_test = train_test_split(features_doc2vec, labels,
                                                    test_size=0.33,
                                                    stratify=labels,
                                                    random_state=20130810)

In [None]:
# Verify the genre distribution of the held-out split.
y_test.value_counts()

Horror    10
Action    10
Comedy     7
Sci-Fi     6
Name: Genre of Movie, dtype: int64

In [None]:
# Baseline for the genre task: support vector classifier with default settings.
model_svm = SVC()

In [None]:
# Fit the baseline on the genre training split.
model_svm.fit(X_train, y_train)

SVC()

In [None]:
# Predict the genre of the held-out documents.
y_pred = model_svm.predict(X_test)

In [None]:
# Show the raw predictions; in the recorded run only 'Horror' and 'Action' appear.
y_pred

array(['Horror', 'Horror', 'Horror', 'Action', 'Horror', 'Action',
       'Action', 'Action', 'Horror', 'Horror', 'Action', 'Horror',
       'Horror', 'Horror', 'Horror', 'Action', 'Horror', 'Action',
       'Action', 'Horror', 'Horror', 'Action', 'Action', 'Horror',
       'Action', 'Action', 'Horror', 'Horror', 'Action', 'Horror',
       'Action', 'Action', 'Action'], dtype=object)

In [None]:
# Accuracy of the genre baseline on the held-out split.
accuracy_score(y_test, y_pred)

0.45454545454545453

In [None]:
# Per-class precision, recall and F1. In the recorded run Comedy and Sci-Fi are
# never predicted, so their scores are 0.00 and scikit-learn emits warnings.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Action       0.38      0.60      0.46        10
      Comedy       0.00      0.00      0.00         7
      Horror       0.53      0.90      0.67        10
      Sci-Fi       0.00      0.00      0.00         6

    accuracy                           0.45        33
   macro avg       0.23      0.38      0.28        33
weighted avg       0.27      0.45      0.34        33



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Hyperparameter tuning: the same sweep over `C` is repeated for the genre classifier.

In [None]:
# Same sweep over the SVC parameter C as for the sentiment task, now on the
# genre split.
# NOTE(review): the accuracy is again measured on the held-out split that is
# also used for the final evaluation — consider cross-validating on the
# training split instead.
for C_value in (0.01, 0.1, 1, 10, 100, 1000, 10000):
    model_svm = SVC(C=C_value).fit(X_train, y_train)  # fit() returns the estimator
    y_pred = model_svm.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"C: {C_value}, Accuracy: {acc}")

C: 0.01, Accuracy: 0.45454545454545453
C: 0.1, Accuracy: 0.45454545454545453
C: 1, Accuracy: 0.45454545454545453
C: 10, Accuracy: 0.42424242424242425
C: 100, Accuracy: 0.36363636363636365
C: 1000, Accuracy: 0.2727272727272727
C: 10000, Accuracy: 0.36363636363636365


In [None]:
# Refit with C=1; in the recorded sweep C=0.01, 0.1 and 1 tie for the best accuracy.
model_svm = SVC(C=1)
model_svm.fit(X_train, y_train)

SVC(C=1)

In [None]:
# Predictions of the refitted genre model on the held-out split.
model_svm.predict(X_test)

array(['Horror', 'Horror', 'Horror', 'Action', 'Horror', 'Action',
       'Action', 'Action', 'Horror', 'Horror', 'Action', 'Horror',
       'Horror', 'Horror', 'Horror', 'Action', 'Horror', 'Action',
       'Action', 'Horror', 'Horror', 'Action', 'Action', 'Horror',
       'Action', 'Action', 'Horror', 'Horror', 'Action', 'Horror',
       'Action', 'Action', 'Action'], dtype=object)

# Conclusion

When text is an input to a supervised learning task (e.g., sentiment analysis or multi-class prediction), a detailed examination of the classification metrics helps select the appropriate feature engineering process.