# Sentiment Analysis of the IMDB reviews

## Nándor ERŐS, ADM

The data was downloaded from: 
The dataset contains 25 000 training and 25 000 test data. 

## The approach: 

- 1) Random subsample of the (training) data

- 2) Splitting them in two: the first set will the basis of the vocabulary, while the other set = training data

- 3) Text cleaning, lemmatization and collection of adjectives and adverbs. (https://www.icwsm.org/papers/3--Benamara-Cesarano-Picariello-Reforgiato-Subrahmanian.pdf)

- 4) I generated the sentiment vocabulary based on the word occurrences both in the positive and negative texts ==> converted into probability values.

- 5) Feature search in train data ==> rescaling between 0-1

- 6) Test data preparation.

- 7) Classifier based on Logistic Regression (had very similar accuracy compared to Linear SVC or Naive Bayes) 

- 8) Prediction on the test data

In [1]:
from glob import glob
from random import sample, seed
import pandas as pd
import numpy as np

In [2]:
def loadTrainData(train_path, samp_size, rand_seed = 3):
    seed(rand_seed)
    
    path_vect = [train_path, "pos", "*.txt"]
    pos_path = "/".join(path_vect)
    pos_files = glob(pos_path)
    
    file_sample = sample(pos_files, samp_size)
    
    pos_train = pd.DataFrame()
    
    for i in range(len(file_sample)):
        d_temp = pd.read_csv(file_sample[i], sep = "\t", header = None)
        pos_train = pd.concat([pos_train, d_temp], ignore_index = True, axis = 0)
    pos_train.rename(columns = {0 : 'Text'}, inplace = True)
    
    path_vect = [train_path, "neg", "*.txt"]
    neg_path = "/".join(path_vect)
    neg_files = glob(neg_path)
    
    neg_train = pd.DataFrame()
    
    file_sample = sample(neg_files, samp_size)
    for i in range(len(file_sample)):
        d_temp = pd.read_csv(file_sample[i], sep = "\t", header = None)
        neg_train = pd.concat([neg_train, d_temp], ignore_index = True, axis = 0)

    neg_train.rename(columns = {0 : 'Text'}, inplace = True)
    
    x = int(round(samp_size/2, 0))
    
    return pos_train[:x], pos_train[x:], neg_train[:x], neg_train[x:]
    

def loadTestData(test_path, train_samp_size, test_ratio = 0.3, rand_seed = 4):
    seed(rand_seed)
    
    test_sample_size = round((train_samp_size * test_ratio) / (1-test_ratio), 0)
    test_sample_size = int(test_sample_size)
    
    path_vect = [test_path, "pos", "*.txt"]
    pos_path = "/".join(path_vect)
    pos_files = glob(pos_path)
    
    file_sample = sample(pos_files, test_sample_size)
    
    pos_test = pd.DataFrame()

    for i in range(len(file_sample)):
        d_temp = pd.read_csv(file_sample[i], sep = "\t", header = None)
        pos_test = pd.concat([pos_test, d_temp], ignore_index = True, axis = 0)

    pos_test.rename(columns = {0 : 'Text'}, inplace = True)
    pos_test['Label'] = 1
    
    path_vect = [test_path, "neg", "*.txt"]
    neg_path = "/".join(path_vect)
    neg_files = glob(neg_path)
    file_sample = sample(neg_files, test_sample_size)
    
    neg_test = pd.DataFrame()

    for i in range(len(file_sample)):
        d_temp = pd.read_csv(file_sample[i], sep = "\t", header = None)
        neg_test = pd.concat([neg_test, d_temp], ignore_index = True, axis = 0)

    neg_test.rename(columns = {0 : 'Text'}, inplace = True)
    neg_test['Label'] = -1
    
    return pd.concat([pos_test[['Text', 'Label']], neg_test[['Text', 'Label']]], ignore_index = True)

In [3]:
train_size = 10000

pos_vocab, pos_train, neg_vocab, neg_train = loadTrainData("train", train_size)
test_dat = loadTestData("test", train_size)
test_dat.tail()

Unnamed: 0,Text,Label
8567,Let me start by saying I don't recall laughing...,-1
8568,"First, they ruin it with the uniquely bad anim...",-1
8569,I've read the book 'Scarlett' and was expectin...,-1
8570,"Well, I set out with a few friends to see this...",-1
8571,"O boy, was this really bad.<br /><br />I saw t...",-1


In [4]:
import re
import string
from string import digits

# source: https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def cleanText(text):
    temp_text = REPLACE_NO_SPACE.sub("", text)
    temp_text = REPLACE_WITH_SPACE.sub(" ", temp_text)
    temp_text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]_", "", temp_text)
    # removing all the numbers: 
    
    temp_text = re.sub("\[[digits:]]+", " ", temp_text)
    
    # removing digits glued to a word:
    
    temp_text = ''.join(i for i in temp_text if not i.isdigit())
    
    # Substituting multiple spaces with single space
    temp_text = re.sub(r'\s+', ' ', temp_text, flags=re.I)
    
    # Remove single characters from the start
    temp_text = re.sub(r'\^[a-zA-Z]\s+', ' ', temp_text)
    
    # remove all single characters
    temp_text = re.sub(r'\s+[a-zA-Z]\s+', ' ', temp_text)
    
    # removing all the words < 3 letters 
    temp_text = re.sub(r'\b\w{1,3}\b', '', temp_text)
    
#     temp_text = temp_text.encode("ascii", "ignore")
#     temp_text = str(temp_text, 'utf-8')
    
    return temp_text

In [5]:
pos_vocab["Text"] = pos_vocab["Text"].apply(cleanText)
pos_train["Text"] = pos_train["Text"].apply(cleanText)
neg_vocab["Text"] = neg_vocab["Text"].apply(cleanText)
neg_train["Text"] = neg_train["Text"].apply(cleanText)

pos_vocab.head()

Unnamed: 0,Text,1,2,3,4,5,6
0,bought this cheap from rental remnant loca...,,,,,,
1,This this underrated lost nothing power...,,,,,,
2,Ronald Colman gives terrific performance stag...,,,,,,
3,Disneys best films that enjoy watching ofte...,,,,,,
4,Samuel Fuller hardly Americas great directo...,,,,,,


In [6]:
pos_vocab["Text"] = pos_vocab["Text"].str.lower()
pos_train["Text"] = pos_train["Text"].str.lower()
neg_vocab["Text"] = neg_vocab["Text"].str.lower()
neg_train["Text"] = neg_train["Text"].str.lower()
pos_vocab.head()

Unnamed: 0,Text,1,2,3,4,5,6
0,bought this cheap from rental remnant loca...,,,,,,
1,this this underrated lost nothing power...,,,,,,
2,ronald colman gives terrific performance stag...,,,,,,
3,disneys best films that enjoy watching ofte...,,,,,,
4,samuel fuller hardly americas great directo...,,,,,,


In [7]:
from nltk.tokenize import word_tokenize

pos_vocab["Text"] = [word_tokenize(text) for text in pos_vocab["Text"]]
pos_train["Text"] = [word_tokenize(text) for text in pos_train["Text"]]
neg_vocab["Text"] = [word_tokenize(text) for text in neg_vocab["Text"]]
neg_train["Text"] = [word_tokenize(text) for text in neg_train["Text"]]

neg_train.head()

Unnamed: 0,Text,1
5000,"[believe, shakespeare, explained, what, just, ...",
5001,"[volleyball, genre, strangely, overlooked, mos...",
5002,"[youd, think, that, with, ingrid, bergman, war...",
5003,"[that, period, history, that, fascinating, ric...",
5004,"[remember, parents, understanding, saturday, n...",


In [8]:
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def getLemmaOfAdjective(text):
    new_text = []
    for word in text:
        lemma = lemmatizer.lemmatize(word, pos ="a")
        new_text.append(lemma)
        
    return new_text

In [9]:
pos_vocab["Text"] = pos_vocab["Text"].apply(getLemmaOfAdjective)
pos_train["Text"] = pos_train["Text"].apply(getLemmaOfAdjective)
neg_vocab["Text"] = neg_vocab["Text"].apply(getLemmaOfAdjective)
neg_train["Text"] = neg_train["Text"].apply(getLemmaOfAdjective)

pos_vocab.head()

Unnamed: 0,Text,1,2,3,4,5,6
0,"[bought, this, cheap, from, rental, remnant, l...",,,,,,
1,"[this, this, underrated, lost, nothing, power,...",,,,,,
2,"[ronald, colman, gives, terrific, performance,...",,,,,,
3,"[disneys, best, films, that, enjoy, watching, ...",,,,,,
4,"[samuel, full, hardly, americas, great, direct...",,,,,,


In [10]:
def getAdjectives(text):
    new_text = []

    for word in text:
        tag = nltk.pos_tag([word])[0][1][0].upper()
        if tag.startswith('J') or tag.startswith('R'):
            new_text.append(word)
        
    return new_text

In [11]:
pos_vocab["Text"] = pos_vocab["Text"].apply(getAdjectives)
pos_train["Text"] = pos_train["Text"].apply(getAdjectives)
neg_vocab["Text"] = neg_vocab["Text"].apply(getAdjectives)
neg_train["Text"] = neg_train["Text"].apply(getAdjectives)

neg_vocab.head()

Unnamed: 0,Text,1
0,"[most, ever, idle, high, only, able, finally, ...",
1,"[really, uninspired, just, only, good, only, t...",
2,"[really, other, really, instead, pretty, proba...",
3,"[very, natural, back, very, very, very, stupid...",
4,"[really, serious, good, never, anymore, next, ...",


In [13]:
def wordListToFreqDict(wordlist):
    wordfreq = [wordlist.count(p) for p in wordlist]
    return np.column_stack((wordlist, wordfreq))

In [14]:
wl = [wordListToFreqDict(text) for text in pos_vocab["Text"]]

pos_vocab = pd.DataFrame(np.vstack(wl), columns = ["Word", "Count pos"])
# pos_vocab["Label"] = 1
pos_vocab

Unnamed: 0,Word,Count pos
0,local,1
1,almost,1
2,never,1
3,clearly,2
4,else,1
...,...,...
130303,more,1
130304,pedestrian,1
130305,only,1
130306,sometimes,1


In [15]:
wl = [wordListToFreqDict(text) for text in neg_vocab["Text"]]

neg_vocab = pd.DataFrame(np.vstack(wl), columns = ["Word", "Count neg"])
# neg_vocab["Label"] = -1
neg_vocab

Unnamed: 0,Word,Count neg
0,most,2
1,ever,1
2,idle,1
3,high,1
4,only,2
...,...,...
128674,clearly,1
128675,wrong,1
128676,just,3
128677,much,1


In [16]:
pos_vocab["Count pos"] = pos_vocab["Count pos"].astype(int)
neg_vocab["Count neg"] = neg_vocab["Count neg"].astype(int)

In [17]:
df = pd.concat([pos_vocab, neg_vocab], join = "outer").fillna(0)
df

Unnamed: 0,Word,Count pos,Count neg
0,local,1.0,0.0
1,almost,1.0,0.0
2,never,1.0,0.0
3,clearly,2.0,0.0
4,else,1.0,0.0
...,...,...,...
128674,clearly,0.0,1.0
128675,wrong,0.0,1.0
128676,just,0.0,3.0
128677,much,0.0,1.0


In [18]:
vocab = df.groupby(
   "Word"
).agg(
    {
         'Count pos':sum,    # Sum duration per group
         'Count neg': sum
    }
)
vocab

Unnamed: 0_level_0,Count pos,Count neg
Word,Unnamed: 1_level_1,Unnamed: 2_level_1
+beautiful,1.0,0.0
aadmittedly,0.0,1.0
aatish,5.0,0.0
able,380.0,257.0
ably,12.0,3.0
...,...,...
zealous,2.0,1.0
bubble,16.0,0.0
cartoonish,1.0,0.0
even,0.0,1.0


In [19]:
column_list = list(vocab)

sums = vocab[column_list].sum(axis=1)
vocab["Count pos"] = vocab["Count pos"]/ sums
vocab["Count neg"] = vocab["Count neg"]/ sums
vocab

Unnamed: 0_level_0,Count pos,Count neg
Word,Unnamed: 1_level_1,Unnamed: 2_level_1
+beautiful,1.000000,0.000000
aadmittedly,0.000000,1.000000
aatish,1.000000,0.000000
able,0.596546,0.403454
ably,0.800000,0.200000
...,...,...
zealous,0.666667,0.333333
bubble,1.000000,0.000000
cartoonish,1.000000,0.000000
even,0.000000,1.000000


**There are some weird characters in the box above that I couldn’t remove in any way.**

In [20]:
vocab.to_csv("SentimentVocabulary.csv")

In [21]:
def getWords_Labels(data, vocab):
#     data = data.iloc[0:10]
    
    labels = data.Label
    i = 0
    
    ylab = []
    
    result = np.array((None, None))
    
    for text in data.Text:
        if (len(data.Text[i]) == 0):
            i += 1
            continue
        
        wl = wordListToFreqDict(text)
        temp_df = pd.DataFrame(np.vstack(wl), columns = ["Word", "Count"])
        
        temp_df["Count"] = temp_df["Count"].astype(int)
        
        temp_df = pd.DataFrame(temp_df.groupby("Word").sum().index, columns = ["Word"])
#         print(temp_df)
#         temp_df["Label"] = labels[i]
        
        

        common_words = []

        for word in temp_df.Word:
            if word in (vocab.index):
                common_words.append(word)

        temp_df = vocab[vocab.index.isin(common_words)]
#         temp_df["Label"] = labels[i]
        s = np.sum(temp_df["Count pos"]) + np.sum(temp_df["Count neg"])

        test_1_x = [np.sum(temp_df["Count pos"]) / s, np.sum(temp_df["Count neg"]) / s]
        test_1_x = np.array(test_1_x)
        
        result = np.vstack((result, test_1_x))
        ylab.append(labels[i])
    
        i += 1
        
    return result[1:], ylab

In [22]:
pos_train.reset_index(drop=True, inplace=True)
pos_train

Unnamed: 0,Text,1,2,3,4,5,6
0,"[there, also, well, much, back, classic, most,...",,,,,,
1,"[only, progressively, mysterious, black, only,...",,,,,,
2,"[much, german, more, effective, very, differen...",,,,,,
3,"[indian, other, several, typically, entirely, ...",,,,,,
4,"[always, huge, fanatic, almost, enough, finall...",,,,,,
...,...,...,...,...,...,...,...
4995,"[great, also, great, true, least, ready, emoti...",,,,,,
4996,"[most, bible, basically, here, psychological, ...",,,,,,
4997,"[well, long, very, long, actually, recently, t...",,,,,,
4998,"[early, biographic, more, also, early, young, ...",,,,,,


In [23]:
neg_train.reset_index(drop=True, inplace=True)
neg_train

Unnamed: 0,Text,1
0,"[just, beautifully, much, whole, same, bible, ...",
1,"[strangely, most, thankfully, highly, second, ...",
2,"[good, sadly, difficult, well, major, very, mo...",
3,"[rich, barely, probably, most, never, young, c...",
4,"[live, also, many, other, still, many, patheti...",
...,...,...
4995,"[responsible, inconsequential, extremely, poor...",
4996,"[beautifully, ably, generally, very, there, su...",
4997,"[central, fictional, satirical, other, also, s...",
4998,"[complete, good, once, same, again, never, eve...",


In [24]:
pos_train["Label"] = 1
neg_train["Label"] = -1

In [25]:
train_x_1, train_y_1 = getWords_Labels(pos_train, vocab)
train_x_2, train_y_2 = getWords_Labels(neg_train, vocab)

X_train = np.vstack((train_x_1, train_x_2))
Y_train = np.hstack((train_y_1, train_y_2))
# X_train, Y_train

In [26]:
test_dat

Unnamed: 0,Text,Label
0,I cannot understand why this 1971 Hollywood pr...,1
1,Obsessed!!!!! I have every season of Gilmore G...,1
2,"i must say that this movie had a great cast, l...",1
3,"As Roger Corman has said in an interview, low-...",1
4,There is a lot wrong with this film. I will no...,1
...,...,...
8567,Let me start by saying I don't recall laughing...,-1
8568,"First, they ruin it with the uniquely bad anim...",-1
8569,I've read the book 'Scarlett' and was expectin...,-1
8570,"Well, I set out with a few friends to see this...",-1


In [27]:
test_dat["Text"] = test_dat["Text"].apply(cleanText)
test_dat["Text"] = test_dat["Text"].str.lower()
test_dat["Text"] = [word_tokenize(text) for text in test_dat["Text"]]

test_dat["Text"] = test_dat["Text"].apply(getLemmaOfAdjective)

test_dat["Text"] = test_dat["Text"].apply(getAdjectives)

In [28]:
test_dat

Unnamed: 0,Text,Label
0,"[not, currently, only, available, australian, ...",1
1,"[much, average, there, just, dramatic, never, ...",1
2,"[great, great, very, very, uproarish, serious,...",1
3,"[well, hard, mightily, back, insatiable, unwar...",1
4,"[there, wrong, most, very, very, same, unrelat...",1
...,...,...
8567,"[once, feeble, neurotic, completely, unable, s...",-1
8568,"[first, uniquely, then, original, many, then, ...",-1
8569,"[good, first, disappointed, many, different, w...",-1
8570,"[well, good, probably, soon, complete, just, h...",-1


In [29]:
X_test, Y_test = getWords_Labels(test_dat, vocab)

# Modelling

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

clf = LogisticRegression(random_state=0).fit(X_train, Y_train)

In [31]:
from sklearn import metrics

metrics.f1_score(Y_test, clf.predict(X_test))

0.7868928654427145