In [1]:
import re
import nltk
import gensim
import string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
def remove_punctuation(text):
    return re.sub('[^a-zA-Z]', ' ', str(text))

def lower_case(text):
    return text.lower()    

def remove_tags(text):    
    return re.sub("&lt;/?.*?&gt;"," &lt;&gt; ", text)

def remove_special_chars_and_digits(text):
    return re.sub("(\\d|\\W)+"," ", text)

def remove_stop_words(tokenized_text):
    return [w for w in tokenized_text if not w in set(stopwords.words('english'))]

def normalize_text(text: str) -> str:
    text = remove_punctuation(text)
    text = lower_case(text)
    text = remove_tags(text)
    text = remove_special_chars_and_digits(text)
    
    return text

In [3]:
class_corpus = pd.read_csv('ClassCorpus_V2.csv')

# Feature Extraction

In [4]:
class_corpus_copy = class_corpus.copy()
class_corpus_copy['normalized_review_text'] = class_corpus_copy['MovieReview'].apply(normalize_text)
class_corpus_copy['normalized_review_tokens'] = class_corpus_copy['normalized_review_text'].apply(nltk.word_tokenize)
class_corpus_copy['normalized_review_tokens'] = class_corpus_copy['normalized_review_tokens'].apply(remove_stop_words)

In [5]:
review_tokens_corpus = class_corpus_copy['normalized_review_tokens']

tagged_documents = [TaggedDocument(review_tokens, [i]) 
                    for i, review_tokens in enumerate(review_tokens_corpus)]

model_doc2vec = Doc2Vec(tagged_documents,
                        vector_size=150,
                        window=3,
                        min_count=2)

features_doc2vec = pd.DataFrame()

for review_tokens in review_tokens_corpus:
    doc_vector = pd.DataFrame(model_doc2vec.infer_vector(review_tokens)).transpose()
    features_doc2vec = pd.concat([features_doc2vec, doc_vector], axis=0)

features_doc2vec.index = class_corpus_copy.index

In [6]:
features_doc2vec.shape

(190, 150)

In [7]:
class_corpus_copy['ReviewType'].value_counts()

Negative    95
Positive    95
Name: ReviewType, dtype: int64

In [8]:
labels = np.where(class_corpus_copy.ReviewType == 'Negative', 0, 1)

# Sentiment Analysis

In [9]:
X_train, X_test, y_train, y_test = train_test_split(features_doc2vec, labels,
                                                    test_size=0.33,
                                                    random_state=20130810)

In [10]:
model_svm = SVC()

In [11]:
model_svm.fit(X_train, y_train)

SVC()

In [12]:
y_pred = model_svm.predict(X_test)

In [13]:
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [14]:
accuracy_score(y_test, y_pred)

0.42857142857142855

Hyperparameter tuning

In [15]:
for C_value in [0.01, 0.1, 1, 10, 100, 1000, 10000]:
    model_svm = SVC(C=C_value)
    model_svm.fit(X_train, y_train)
    y_pred = model_svm.predict(X_test)
    acc = accuracy_score(y_test, y_pred) 
    print(f"C: {C_value}, Accuracy: {acc}")

C: 0.01, Accuracy: 0.42857142857142855
C: 0.1, Accuracy: 0.42857142857142855
C: 1, Accuracy: 0.42857142857142855
C: 10, Accuracy: 0.4444444444444444
C: 100, Accuracy: 0.4444444444444444
C: 1000, Accuracy: 0.4603174603174603
C: 10000, Accuracy: 0.42857142857142855


In [16]:
model_svm = SVC(C=1000)
model_svm.fit(X_train, y_train)

SVC(C=1000)

In [17]:
model_svm.predict(X_test)

array([1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])