In [1]:
import os
import pandas as pd
from textblob import *
import nltk
import numpy as np
from sklearn import tree
import openpyxl
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import *
from nltk.corpus import stopwords

data = pd.read_csv(r"../../data/SentimentAnalysis/imdb.csv", encoding="utf8")

data = data.sample(5000)







In [2]:
'''
    Natural language preprocessing

    Remove punctuation, make all words lowercase, and lemmatize
'''

from nltk.corpus import wordnet
from textblob import Word
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import string

'''
    NLTK has a model to tag words as adjectives, nouns, etc,
    but NLTK uses wordnet for lemmatization. wordnet only uses
    four possible tags, while NLTK returns tons of unique ones

    This function transforms NLTK tags to wordnet tags for lemmatization
'''
def nltk_tag_to_wordnet(tag: str) -> str:
    if tag[0] == "J":
        return wordnet.ADJ
    elif tag[0] == "V":
        return wordnet.VERB
    elif tag[0] == "N":
        return wordnet.NOUN
    elif tag[0] == "R":
        return wordnet.ADV
    else:
        return ""
    
'''
    Remove non-alphabetical characters and punctuation
'''
def keep_only_alphabetic(s: str) -> str:
    temp = "".join([" " if i in string.punctuation else i for i in s])
    return "".join([i for i in temp if (ord(i) <= 90 and ord(i) >= 65) or (ord(i) <= 122 and ord(i) >= 97) or i.isspace()])

'''
    Take a string of text, tokenize it, and return a list of lemmatized tokens
'''
def lemmatize_words(s: str) -> list[str]:
    lemmer = nltk.stem.WordNetLemmatizer()
    words = [i.lower() for i in word_tokenize(s)]       # tokenize and lowercase
    words = [i for i in words if i not in stopwords.words("english")]   # remove stopwords
    words = list(filter(lambda x: nltk_tag_to_wordnet(x[1]) !="", pos_tag(words)))  # remove invalid lemmatization words and tags
    words = [lemmer.lemmatize(i[0], nltk_tag_to_wordnet(i[1])) for i in words]  #  lemmatize words
    return words

'''
    Combine all functions above to pre-process strng
'''
def pre_process_text(text: str) -> str:
    s = keep_only_alphabetic(text)
    lemmatized = lemmatize_words(s)
    return " ".join(lemmatized)






In [3]:
data["text"] = data.apply(lambda x: keep_only_alphabetic(x["text"]), axis=1)
data

Unnamed: 0,text,sentiment
5247,For the sake of propaganda during World War II...,positive
17688,The world of the sci fi drama SOYLENT GREEN i...,positive
38185,So ya think you ve seen every Mafia movie ever...,positive
11829,Such great actors such a disappointment Marlo...,negative
13799,French horror cinema has seen something of a r...,positive
...,...,...
30172,I s a big struggle As a story that is surreal...,negative
37237,I m writing this note as a chess player as wel...,negative
28564,I LOVED this movie Not as great as First Da...,positive
3535,This satire is just really really dead on an...,positive


In [4]:
def to_class(s):
    match s:
        case "positive":
            return 1
        case "negative":
            return 0

In [5]:
data["sentiment"] = data.apply(lambda x: to_class(x["sentiment"]), axis=1)
data

Unnamed: 0,text,sentiment
5247,For the sake of propaganda during World War II...,1
17688,The world of the sci fi drama SOYLENT GREEN i...,1
38185,So ya think you ve seen every Mafia movie ever...,1
11829,Such great actors such a disappointment Marlo...,0
13799,French horror cinema has seen something of a r...,1
...,...,...
30172,I s a big struggle As a story that is surreal...,0
37237,I m writing this note as a chess player as wel...,0
28564,I LOVED this movie Not as great as First Da...,1
3535,This satire is just really really dead on an...,1


In [6]:


data["text_tb_pol"] = data.apply(lambda x: TextBlob(x["text"]).polarity, axis=1)
data["text_tb_sub"] = data.apply(lambda x: TextBlob(x["text"]).subjectivity, axis=1)





In [7]:
analyzer = SentimentIntensityAnalyzer()

data["text_vader_scores"] = data.apply(lambda x: analyzer.polarity_scores(x["text"]), axis=1)
data["text_vader_comp"] = data.apply(lambda x: x["text_vader_scores"]["compound"], axis=1)
data["text_vader_neg"] = data.apply(lambda x: x["text_vader_scores"]["neg"], axis=1)
data["text_vader_neu"] = data.apply(lambda x: x["text_vader_scores"]["neu"], axis=1)
data["text_vader_pos"] = data.apply(lambda x: x["text_vader_scores"]["pos"], axis=1)
data = data.drop(["text_vader_scores"], axis=1)





data


Unnamed: 0,text,sentiment,text_tb_pol,text_tb_sub,text_vader_comp,text_vader_neg,text_vader_neu,text_vader_pos
5247,For the sake of propaganda during World War II...,1,0.081061,0.442424,0.8837,0.086,0.806,0.109
17688,The world of the sci fi drama SOYLENT GREEN i...,1,0.070271,0.459549,0.9893,0.061,0.812,0.128
38185,So ya think you ve seen every Mafia movie ever...,1,0.300321,0.592308,0.8625,0.044,0.857,0.100
11829,Such great actors such a disappointment Marlo...,0,-0.247619,0.724921,-0.8365,0.184,0.719,0.097
13799,French horror cinema has seen something of a r...,1,0.167388,0.527629,0.9761,0.138,0.683,0.179
...,...,...,...,...,...,...,...,...
30172,I s a big struggle As a story that is surreal...,0,0.240353,0.475831,0.9924,0.029,0.759,0.211
37237,I m writing this note as a chess player as wel...,0,0.043812,0.402317,0.9559,0.083,0.813,0.104
28564,I LOVED this movie Not as great as First Da...,1,0.317500,0.628333,0.9921,0.111,0.546,0.343
3535,This satire is just really really dead on an...,1,0.093210,0.607407,0.9841,0.080,0.725,0.196


In [8]:
'''
    Add tf-idf vectorizer
'''

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import pandas as pd

def tf_idf_vectorize(df: pd.DataFrame, corpus: pd.Series, vocabulary: list[str]) -> tuple[list[str], pd.DataFrame]:
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(strip_accents="ascii", lowercase=True, stop_words=stop_words, max_features=500, ngram_range=(1,3), vocabulary=vocabulary)
    features = vectorizer.fit_transform(corpus).toarray()
    names = vectorizer.get_feature_names_out()
    headers = [f"__word{i}" for i in range(len(names))]
    feature_frame = pd.DataFrame(features, columns=headers)
    #final = pd.concat([df, feature_frame], axis=1)
    return (names, feature_frame)


def get_full_vocabulary(corpus: pd.Series) -> list[str]:
    vectorizer = TfidfVectorizer(strip_accents="ascii", lowercase=True, stop_words=stopwords.words("english"), max_features=500, ngram_range=(1,3))
    vectorizer.fit_transform(corpus).toarray()
    return vectorizer.get_feature_names_out()


def add_tf_idf_vector(x: pd.DataFrame, partial_corpus: pd.Series, full_corpus: pd.Series) -> pd.DataFrame:
    temp = pd.concat((x, partial_corpus), axis=1).reset_index()
    # first, we need a vocabulary from the entire dataset
    vocabulary = get_full_vocabulary(full_corpus)

    tf_idf_vector = tf_idf_vectorize(temp, temp[partial_corpus.name], vocabulary=vocabulary)[1].reset_index()
    temp = temp.reset_index()
    temp = pd.concat((temp, tf_idf_vector), axis=1).drop([partial_corpus.name, "index"], axis=1)
    return temp



In [None]:
'''
    POLARITY + TF-IDF
'''
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import KFold

classes = data["sentiment"]
features = data[["text_tb_pol",  "text_vader_pos", "text_vader_neg", "text_vader_neu"]]



test_total = 0
run_count = 0


#lbgfs
x = features
y = classes
folds = 5
kf = KFold(n_splits=folds)

for i, (train_index, test_index) in enumerate(kf.split(x)):
    print(i)
    scaler = StandardScaler()
    print("tf_idf training")
    x_train = add_tf_idf_vector(x.iloc[train_index], data["text"].iloc[train_index], data["text"])
    x_train = scaler.fit_transform(x_train)

    print("tf_idf testing")
    x_test = add_tf_idf_vector(x.iloc[test_index], data["text"].iloc[test_index], data["text"])
    x_test = scaler.transform(x_test)
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    ensemble = VotingClassifier(estimators=[
    ('lr', LogisticRegression(max_iter=500)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svc', SVC(kernel='rbf', probability=True, random_state=42))
    ], voting='soft')
    print("ensemble fitting")
    ensemble.fit(x_train, y_train)
    y_pred = ensemble.predict(x_test)
    test_total += accuracy_score(y_test, y_pred)
    run_count += 1

print(f"Test accuracy on {folds}-fold cross-validation: {test_total / run_count}")





0
tf_idf training
tf_idf testing
ensemble fitting
1
tf_idf training
tf_idf testing
ensemble fitting
2
tf_idf training
tf_idf testing
ensemble fitting
3
tf_idf training
tf_idf testing
ensemble fitting
4
tf_idf training
tf_idf testing
ensemble fitting
Test accuracy on 5-fold cross-validation: 0.8356


In [12]:
'''
    POLARITY ONLY
'''
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import KFold

classes = data["sentiment"]
features = data[["text_tb_pol",  "text_vader_pos", "text_vader_neg", "text_vader_neu"]]



test_total = 0
run_count = 0


#lbgfs
x = features
y = classes
folds = 5
kf = KFold(n_splits=folds)

for i, (train_index, test_index) in enumerate(kf.split(x)):
    print(i)
    scaler = StandardScaler()

    x_train = x.iloc[train_index]
    x_train = scaler.fit_transform(x_train)


    x_test = x.iloc[test_index]
    x_test = scaler.transform(x_test)
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    ensemble = VotingClassifier(estimators=[
    ('lr', LogisticRegression(max_iter=500)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svc', SVC(kernel='rbf', probability=True, random_state=42))
    ], voting='soft')
    print("ensemble fitting")
    ensemble.fit(x_train, y_train)
    y_pred = ensemble.predict(x_test)
    test_total += accuracy_score(y_test, y_pred)
    run_count += 1

print(f"Test accuracy on {folds}-fold cross-validation: {test_total / run_count}")





0
ensemble fitting
1
ensemble fitting
2
ensemble fitting
3
ensemble fitting
4
ensemble fitting
Test accuracy on 5-fold cross-validation: 0.7780000000000001


In [13]:
'''
    TF-IDF ONLY
'''
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import KFold

classes = data["sentiment"]
features = data[["text_tb_pol"]]



test_total = 0
run_count = 0


#lbgfs
x = features
y = classes
folds = 5
kf = KFold(n_splits=folds)

for i, (train_index, test_index) in enumerate(kf.split(x)):
    print(i)
    scaler = StandardScaler()
    print("tf_idf training")
    x_train = add_tf_idf_vector(x.iloc[train_index], data["text"].iloc[train_index], data["text"]).drop("text_tb_pol", axis=1)
    x_train = scaler.fit_transform(x_train)

    print("tf_idf testing")
    x_test = add_tf_idf_vector(x.iloc[test_index], data["text"].iloc[test_index], data["text"]).drop("text_tb_pol", axis=1)
    x_test = scaler.transform(x_test)
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    ensemble = VotingClassifier(estimators=[
    ('lr', LogisticRegression(max_iter=500)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svc', SVC(kernel='rbf', probability=True, random_state=42))
    ], voting='soft')
    print("ensemble fitting")
    ensemble.fit(x_train, y_train)
    y_pred = ensemble.predict(x_test)
    test_total += accuracy_score(y_test, y_pred)
    run_count += 1

print(f"Test accuracy on {folds}-fold cross-validation: {test_total / run_count}")




0
tf_idf training
tf_idf testing
ensemble fitting
1
tf_idf training
tf_idf testing
ensemble fitting
2
tf_idf training
tf_idf testing
ensemble fitting
3
tf_idf training
tf_idf testing
ensemble fitting
4
tf_idf training
tf_idf testing
ensemble fitting
Test accuracy on 5-fold cross-validation: 0.8242


In [14]:
import pickle


with open("IMDB_ENSEMBLE_.pkl", "wb") as file:
    pickle.dump(ensemble, file)

