In [1]:
#! pip install gensim==3.8.3

In [2]:
import nltk
import re

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
sns.set_context('talk')

In [4]:
nltk.download('stopwords')
nltk.download('punkt') 
nltk.download('wordnet')
nltk.download('omw-1.4') 

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jensen116/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jensen116/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/jensen116/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/jensen116/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
def add_movie_descriptor(data: pd.DataFrame, corpus_df: pd.DataFrame):
    """
    Adds "Movie Description" to the supplied dataframe, in the form {Genre}_{P|N}_{Movie Title}_{DocID}
    """
    review = np.where(corpus_df['Review Type (pos or neg)'] == 'Positive', 'P', 'N')
    data['Descriptor'] = corpus_df['Genre of Movie'] + '_' + corpus_df['Movie Title'] + '_' + review + '_' + corpus_df['Doc_ID'].astype(str)

def get_corpus_df(path):
    data = pd.read_csv(path, encoding="utf-8")
    add_movie_descriptor(data, data)
    sorted_data = data.sort_values(['Descriptor'])
    indexed_data = sorted_data.set_index(['Doc_ID'])
    indexed_data['Doc_ID'] = indexed_data.index
    return indexed_data
    

# Data

In [6]:
CORPUS_PATH=\
'https://raw.githubusercontent.com/djp840/MSDS_453_Public/main/MSDS453_ClassCorpus/MSDS453_QA_20220906.csv'
class_corpus  = get_corpus_df(CORPUS_PATH)

In [7]:
class_corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 40 to 199
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   DSI_Title                 200 non-null    object
 1   Text                      200 non-null    object
 2   Submission File Name      200 non-null    object
 3   Student Name              200 non-null    object
 4   Genre of Movie            200 non-null    object
 5   Review Type (pos or neg)  200 non-null    object
 6   Movie Title               200 non-null    object
 7   Descriptor                200 non-null    object
 8   Doc_ID                    200 non-null    int64 
dtypes: int64(1), object(8)
memory usage: 15.6+ KB


# Preprocessing

In [8]:
STOP_WORDS = set(nltk.corpus.stopwords.words('english'))

In [9]:
def remove_punctuation(text):
    return re.sub('[^a-zA-Z]', ' ', str(text))

def lower_case(text):
    return text.lower()    

def remove_tags(text):    
    return re.sub("&lt;/?.*?&gt;"," &lt;&gt; ", text)

def remove_special_chars_and_digits(text):
    return re.sub("(\\d|\\W)+"," ", text)

def remove_stop_words(tokenized_text):
    return [w for w in tokenized_text if not w in STOP_WORDS]

In [10]:
def normalize(input_text):
    '''
    Normalization involves the following steps:
    1. Remove punctuation
    2. Lower case all words
    3. Remove tags (i.e., HTML tags)
    4. Remove all special characters and digits
    '''
    text = remove_punctuation(input_text)
    text = lower_case(text)
    text = remove_tags(text)
    text = remove_special_chars_and_digits(text)

    return text

In [11]:
def tokenize(text):
    '''
    Tokenization involves the following steps:
    1. Break text down to tokens (i.e., words separated by white spaces)
    2. Remove stop words from the tokens generated in step 1
    '''
    tokens = nltk.word_tokenize(text)
    tokenized_text = remove_stop_words(tokens)
    
    return tokenized_text

In [12]:
def lemmatize(tokenized_text, lemmatizer=WordNetLemmatizer()):
    '''
    Lemmatization is applied to each word in the list of normalized tokens
    (stop words are removed)
    '''
    return [lemmatizer.lemmatize(word) for word in tokenized_text]

In [13]:
def join_tokens(lemmatized_tokens):
    return ' '.join(lemmatized_tokens)

In [14]:
class_corpus['normalized_review'] = class_corpus['Text'].apply(normalize)
class_corpus['tokenized_review'] = class_corpus['normalized_review'].apply(tokenize)
class_corpus['lemmatized_tokens'] = class_corpus['tokenized_review'].apply(lemmatize)
class_corpus['lemmatized_text'] = class_corpus['lemmatized_tokens'].apply(join_tokens)

In [15]:
class_corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 40 to 199
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   DSI_Title                 200 non-null    object
 1   Text                      200 non-null    object
 2   Submission File Name      200 non-null    object
 3   Student Name              200 non-null    object
 4   Genre of Movie            200 non-null    object
 5   Review Type (pos or neg)  200 non-null    object
 6   Movie Title               200 non-null    object
 7   Descriptor                200 non-null    object
 8   Doc_ID                    200 non-null    int64 
 9   normalized_review         200 non-null    object
 10  tokenized_review          200 non-null    object
 11  lemmatized_tokens         200 non-null    object
 12  lemmatized_text           200 non-null    object
dtypes: int64(1), object(12)
memory usage: 21.9+ KB


# Feature Extraction

## Doc2Vec

In [16]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(class_corpus.lemmatized_tokens)]

In [17]:
model_doc2vec = Doc2Vec(documents,
                        vector_size=200,
                        min_count=1,
                        workers=4)

In [18]:
features_doc2vec = pd.DataFrame()

for document in documents:
    vector = pd.DataFrame(model_doc2vec.infer_vector(document.words)).transpose()
    features_doc2vec = pd.concat([features_doc2vec, vector])

In [19]:
features_doc2vec.shape

(200, 200)

# Sentiment Analysis

In [20]:
labels = class_corpus['Review Type (pos or neg)']

In [21]:
labels.value_counts()

Negative    100
Positive    100
Name: Review Type (pos or neg), dtype: int64

In [22]:
X_train, X_test, y_train, y_test = train_test_split(features_doc2vec, labels,
                                                    test_size=0.33,
                                                    random_state=20130810)

As an example, let us train a [support vector machine](https://scikit-learn.org/stable/modules/svm.html) with the default settings.

In [23]:
model_svm = SVC()

In [24]:
model_svm.fit(X_train, y_train)

In [25]:
y_pred = model_svm.predict(X_test)

In [26]:
y_pred

array(['Negative', 'Negative', 'Negative', 'Negative', 'Negative',
       'Negative', 'Positive', 'Negative', 'Negative', 'Negative',
       'Negative', 'Negative', 'Negative', 'Negative', 'Negative',
       'Negative', 'Negative', 'Negative', 'Positive', 'Negative',
       'Negative', 'Negative', 'Positive', 'Negative', 'Positive',
       'Negative', 'Negative', 'Negative', 'Negative', 'Negative',
       'Positive', 'Negative', 'Negative', 'Negative', 'Positive',
       'Negative', 'Negative', 'Positive', 'Negative', 'Negative',
       'Negative', 'Negative', 'Negative', 'Negative', 'Negative',
       'Positive', 'Negative', 'Negative', 'Negative', 'Negative',
       'Positive', 'Negative', 'Negative', 'Negative', 'Positive',
       'Negative', 'Positive', 'Negative', 'Negative', 'Positive',
       'Negative', 'Positive', 'Negative', 'Negative', 'Negative',
       'Negative'], dtype=object)

In [27]:
accuracy_score(y_test, y_pred)

0.45454545454545453

In [28]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.47      0.76      0.58        33
    Positive       0.38      0.15      0.22        33

    accuracy                           0.45        66
   macro avg       0.43      0.45      0.40        66
weighted avg       0.43      0.45      0.40        66



The best paramaters should of course be estimated through a hyperparameter tuning loop.

In [29]:
for C_value in [0.01, 0.1, 1, 10, 100, 1000, 10000]:
    
    model_svm = SVC(C=C_value)
    model_svm.fit(X_train, y_train)
    y_pred = model_svm.predict(X_test)
    acc = accuracy_score(y_test, y_pred) 
    print(f"C: {C_value}, Accuracy: {acc}")

C: 0.01, Accuracy: 0.5
C: 0.1, Accuracy: 0.5
C: 1, Accuracy: 0.45454545454545453
C: 10, Accuracy: 0.4696969696969697
C: 100, Accuracy: 0.4393939393939394
C: 1000, Accuracy: 0.45454545454545453
C: 10000, Accuracy: 0.4696969696969697


The best accuracy is when `C = 1000`.


In [30]:
model_svm = SVC(C=1000)
model_svm.fit(X_train, y_train)

In [31]:
model_svm.predict(X_test)

array(['Negative', 'Negative', 'Negative', 'Positive', 'Positive',
       'Positive', 'Negative', 'Negative', 'Negative', 'Negative',
       'Positive', 'Negative', 'Positive', 'Positive', 'Negative',
       'Negative', 'Negative', 'Negative', 'Positive', 'Positive',
       'Negative', 'Negative', 'Positive', 'Negative', 'Positive',
       'Negative', 'Negative', 'Positive', 'Negative', 'Negative',
       'Positive', 'Negative', 'Negative', 'Negative', 'Positive',
       'Negative', 'Negative', 'Positive', 'Positive', 'Negative',
       'Positive', 'Negative', 'Negative', 'Negative', 'Negative',
       'Positive', 'Negative', 'Negative', 'Negative', 'Negative',
       'Positive', 'Positive', 'Negative', 'Negative', 'Positive',
       'Positive', 'Positive', 'Negative', 'Negative', 'Positive',
       'Negative', 'Positive', 'Negative', 'Positive', 'Negative',
       'Negative'], dtype=object)

# Multi-class Classification

In [32]:
labels = class_corpus['Genre of Movie']

In [33]:
labels.value_counts()

Action    50
Comedy    50
Horror    50
Sci-Fi    50
Name: Genre of Movie, dtype: int64

In [34]:
X_train, X_test, y_train, y_test = train_test_split(features_doc2vec, labels,
                                                    test_size=0.33,
                                                    stratify=labels,
                                                    random_state=20130810)

In [35]:
y_test.value_counts()

Horror    17
Action    17
Comedy    16
Sci-Fi    16
Name: Genre of Movie, dtype: int64

In [36]:
model_svm = SVC()

In [37]:
model_svm.fit(X_train, y_train)

In [38]:
y_pred = model_svm.predict(X_test)

In [39]:
y_pred

array(['Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy',
       'Comedy', 'Sci-Fi', 'Comedy', 'Comedy', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Sci-Fi', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Sci-Fi', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Sci-Fi',
       'Comedy', 'Comedy', 'Sci-Fi', 'Comedy', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Sci-Fi', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Sci-Fi', 'Comedy', 'Comedy'],
      dtype=object)

In [40]:
accuracy_score(y_test, y_pred)

0.2878787878787879

In [45]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Action       1.00      0.35      0.52        17
      Comedy       0.48      0.75      0.59        16
      Horror       0.41      0.71      0.52        17
      Sci-Fi       0.67      0.25      0.36        16

    accuracy                           0.52        66
   macro avg       0.64      0.51      0.50        66
weighted avg       0.64      0.52      0.50        66



Hyperparameter tuning

In [42]:
for C_value in [0.01, 0.1, 1, 10, 100, 1000, 10000]:
    model_svm = SVC(C=C_value)
    model_svm.fit(X_train, y_train)
    y_pred = model_svm.predict(X_test)
    acc = accuracy_score(y_test, y_pred) 
    print(f"C: {C_value}, Accuracy: {acc}")

C: 0.01, Accuracy: 0.2878787878787879
C: 0.1, Accuracy: 0.2878787878787879
C: 1, Accuracy: 0.2878787878787879
C: 10, Accuracy: 0.30303030303030304
C: 100, Accuracy: 0.3333333333333333
C: 1000, Accuracy: 0.36363636363636365
C: 10000, Accuracy: 0.5151515151515151


In [43]:
model_svm = SVC(C=1)
model_svm.fit(X_train, y_train)

In [44]:
model_svm.predict(X_test)

array(['Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy',
       'Comedy', 'Sci-Fi', 'Comedy', 'Comedy', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Sci-Fi', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Sci-Fi', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Sci-Fi',
       'Comedy', 'Comedy', 'Sci-Fi', 'Comedy', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Sci-Fi', 'Comedy', 'Comedy',
       'Comedy', 'Comedy', 'Comedy', 'Sci-Fi', 'Comedy', 'Comedy'],
      dtype=object)

# Conclusion

When text is an input to a supervised learning task (e.g., sentiment analysis or multi-class prediction) detailed examination of the classification metrics helps select the appropriate feature engineering process.