In [1]:
from glob import glob
from random import sample, seed
import pandas as pd
import numpy as np

In [2]:
def loadTrainData(train_path, samp_size, rand_seed = 3):
    seed(rand_seed)
    
    path_vect = [train_path, "pos", "*.txt"]
    pos_path = "/".join(path_vect)
    pos_files = glob(pos_path)
    
    file_sample = sample(pos_files, samp_size)
    
    pos_train = pd.DataFrame()
    
    for i in range(len(file_sample)):
        d_temp = pd.read_csv(file_sample[i], sep = "\t", header = None)
        pos_train = pd.concat([pos_train, d_temp], ignore_index = True, axis = 0)
    pos_train.rename(columns = {0 : 'Text'}, inplace = True)
    
    path_vect = [train_path, "neg", "*.txt"]
    neg_path = "/".join(path_vect)
    neg_files = glob(neg_path)
    
    neg_train = pd.DataFrame()
    
    file_sample = sample(neg_files, samp_size)
    for i in range(len(file_sample)):
        d_temp = pd.read_csv(file_sample[i], sep = "\t", header = None)
        neg_train = pd.concat([neg_train, d_temp], ignore_index = True, axis = 0)

    neg_train.rename(columns = {0 : 'Text'}, inplace = True)
    
    x = int(round(samp_size/2, 0))
    
    return pos_train[:x], pos_train[x:], neg_train[:x], neg_train[x:]
    

def loadTestData(test_path, train_samp_size, test_ratio = 0.3, rand_seed = 4):
    seed(rand_seed)
    
    test_sample_size = round((train_samp_size * test_ratio) / (1-test_ratio), 0)
    test_sample_size = int(test_sample_size)
    
    path_vect = [test_path, "pos", "*.txt"]
    pos_path = "/".join(path_vect)
    pos_files = glob(pos_path)
    
    file_sample = sample(pos_files, test_sample_size)
    
    pos_test = pd.DataFrame()

    for i in range(len(file_sample)):
        d_temp = pd.read_csv(file_sample[i], sep = "\t", header = None)
        pos_test = pd.concat([pos_test, d_temp], ignore_index = True, axis = 0)

    pos_test.rename(columns = {0 : 'Text'}, inplace = True)
    pos_test['Label'] = 1
    
    path_vect = [test_path, "neg", "*.txt"]
    neg_path = "/".join(path_vect)
    neg_files = glob(neg_path)
    file_sample = sample(neg_files, test_sample_size)
    
    neg_test = pd.DataFrame()

    for i in range(len(file_sample)):
        d_temp = pd.read_csv(file_sample[i], sep = "\t", header = None)
        neg_test = pd.concat([neg_test, d_temp], ignore_index = True, axis = 0)

    neg_test.rename(columns = {0 : 'Text'}, inplace = True)
    neg_test['Label'] = -1
    
    return pd.concat([pos_test[['Text', 'Label']], neg_test[['Text', 'Label']]], ignore_index = True)

In [25]:
pos_vocab, pos_train, neg_vocab, neg_train = loadTrainData("train", 7000)
test_dat = loadTestData("test", 7000)
test_dat.tail()

Unnamed: 0,Text,Label
5995,I sat last night to see this film being played...,-1
5996,"Not having any idea what this film was about, ...",-1
5997,I can't remember when was the last time I have...,-1
5998,The dog can act...unfortunately nobody else in...,-1
5999,I didn't know what to expect when I started wa...,-1


In [26]:
import re
import string
from string import digits

# source: https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def cleanText(text):
    temp_text = REPLACE_NO_SPACE.sub("", text)
    temp_text = REPLACE_WITH_SPACE.sub(" ", temp_text)
    
    # removing all the numbers: 
    
    temp_text = re.sub("\[[digits:]]+", " ", temp_text)
    
    # removing digits glued to a word:
    
    temp_text = ''.join(i for i in temp_text if not i.isdigit())
    
    # Substituting multiple spaces with single space
    temp_text = re.sub(r'\s+', ' ', temp_text, flags=re.I)
    
    # Remove single characters from the start
    temp_text = re.sub(r'\^[a-zA-Z]\s+', ' ', temp_text)
    
    # remove all single characters
    temp_text = re.sub(r'\s+[a-zA-Z]\s+', ' ', temp_text)
    
    # finally, removing all the words < 3 letters 
    
    temp_text = re.sub(r'\b\w{1,3}\b', '', temp_text)
    
    return temp_text

In [27]:
pos_vocab["Text"] = pos_vocab["Text"].apply(cleanText)
pos_train["Text"] = pos_train["Text"].apply(cleanText)
neg_vocab["Text"] = neg_vocab["Text"].apply(cleanText)
neg_train["Text"] = neg_train["Text"].apply(cleanText)

pos_vocab.head()

Unnamed: 0,Text,1,2,3,4,5,6
0,bought this cheap from rental remnant loca...,,,,,,
1,This this underrated lost nothing power...,,,,,,
2,Ronald Colman gives terrific performance stag...,,,,,,
3,Disneys best films that enjoy watching ofte...,,,,,,
4,Samuel Fuller hardly Americas great directo...,,,,,,


In [28]:
pos_vocab["Text"] = pos_vocab["Text"].str.lower()
pos_train["Text"] = pos_train["Text"].str.lower()
neg_vocab["Text"] = neg_vocab["Text"].str.lower()
neg_train["Text"] = neg_train["Text"].str.lower()
pos_vocab.head()

Unnamed: 0,Text,1,2,3,4,5,6
0,bought this cheap from rental remnant loca...,,,,,,
1,this this underrated lost nothing power...,,,,,,
2,ronald colman gives terrific performance stag...,,,,,,
3,disneys best films that enjoy watching ofte...,,,,,,
4,samuel fuller hardly americas great directo...,,,,,,


In [29]:
from nltk.tokenize import word_tokenize

pos_vocab["Text"] = [word_tokenize(text) for text in pos_vocab["Text"]]
pos_train["Text"] = [word_tokenize(text) for text in pos_train["Text"]]
neg_vocab["Text"] = [word_tokenize(text) for text in neg_vocab["Text"]]
neg_train["Text"] = [word_tokenize(text) for text in neg_train["Text"]]

neg_train.head()

Unnamed: 0,Text,1,2,3,4,5,6,7
3500,"[picked, this, movie, hope, would, similar, hi...",,,,,,,
3501,"[kurosawa, really, blew, this, every, genius, ...",,,,,,,
3502,"[sorry, guys, already, written, opinion, this,...",,,,,,,
3503,"[this, movie, just, like, every, other, dutch,...",,,,,,,
3504,"[this, movie, should, watched, meant, flop, go...",,,,,,,


In [30]:
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def getLemmaOfAdjective(text):
    new_text = []
    for word in text:
        lemma = lemmatizer.lemmatize(word, pos ="a")
        new_text.append(lemma)
        
    return new_text

In [31]:
pos_vocab["Text"] = pos_vocab["Text"].apply(getLemmaOfAdjective)
pos_train["Text"] = pos_train["Text"].apply(getLemmaOfAdjective)
neg_vocab["Text"] = neg_vocab["Text"].apply(getLemmaOfAdjective)
neg_train["Text"] = neg_train["Text"].apply(getLemmaOfAdjective)

pos_vocab.head()

Unnamed: 0,Text,1,2,3,4,5,6
0,"[bought, this, cheap, from, rental, remnant, l...",,,,,,
1,"[this, this, underrated, lost, nothing, power,...",,,,,,
2,"[ronald, colman, gives, terrific, performance,...",,,,,,
3,"[disneys, best, films, that, enjoy, watching, ...",,,,,,
4,"[samuel, full, hardly, americas, great, direct...",,,,,,


In [39]:
from collections import Counter
from nltk.util import ngrams 

def nGrams(text, n = 2):
    return Counter(ngrams(text, n))

In [42]:
n = nGrams(pos_vocab.Text[0])

In [13]:
def getAdjectives(text):
    new_text = []

    for word in text:
        tag = nltk.pos_tag([word])[0][1][0].upper()
        if tag.startswith('J'): #or tag.startswith('R'):
            new_text.append(word)
        
    return new_text

In [14]:
pos_vocab["Text"] = pos_vocab["Text"].apply(getAdjectives)
pos_train["Text"] = pos_train["Text"].apply(getAdjectives)
neg_vocab["Text"] = neg_vocab["Text"].apply(getAdjectives)
neg_train["Text"] = neg_train["Text"].apply(getAdjectives)

neg_vocab.head()

Unnamed: 0,Text,1,2,3,4,5,6,7
0,"[other, cinematic, dead, alive, small, aggress...",,,,,,,
1,"[bad, bad, military, free, complex, same]",,,,,,,
2,"[other, disappointed, difficult, poor, good, c...",,,,,,,
3,"[many, basic, basic, vulnerable, original, hor...",,,,,,,
4,"[last, terrible, nuclear, true, other]",,,,,,,


In [15]:
def wordListToFreqDict(wordlist):
    wordfreq = [wordlist.count(p) for p in wordlist]
    return np.column_stack((wordlist, wordfreq))

In [16]:
wl = [wordListToFreqDict(text) for text in pos_vocab["Text"]]

pos_vocab = pd.DataFrame(np.vstack(wl), columns = ["Word", "Count pos"])
pos_vocab["Label"] = 1
pos_vocab

Unnamed: 0,Word,Count pos,Label
0,local,1,1
1,whole,1,1
2,complex,1,1
3,moral,1,1
4,enormous,1,1
...,...,...,...
44924,great,2,1
44925,fresh,1,1
44926,wrong,1,1
44927,great,2,1


In [17]:
pos_vocab = pos_vocab.groupby(["Word"]).aggregate(np.sum).sort_values(by = "Label", ascending = False)
pos_vocab = pd.DataFrame(np.column_stack((pos_vocab.index, pos_vocab)), columns = ["Word", "Count pos"])
pos_vocab

Unnamed: 0,Word,Count pos
0,good,2799
1,great,2015
2,most,1361
3,much,1320
4,other,1319
...,...,...
1843,mrfurious,1
1844,multilateral,1
1845,multinational,1
1846,municipal,1


In [20]:
pos_vocab["Count pos"] = pos_vocab["Count pos"] /np.sum(pos_vocab["Count pos"])

pos_vocab

Unnamed: 0,Word,Count pos
0,good,0.0622983
1,great,0.0448485
2,most,0.0302922
3,much,0.0293797
4,other,0.0293574
...,...,...
1843,mrfurious,2.22573e-05
1844,multilateral,2.22573e-05
1845,multinational,2.22573e-05
1846,municipal,2.22573e-05


In [21]:
wl = [wordListToFreqDict(text) for text in neg_vocab["Text"]]

neg_vocab = pd.DataFrame(np.vstack(wl), columns = ["Word", "Count pos"])
neg_vocab["Label"] = -1
neg_vocab

Unnamed: 0,Word,Count pos,Label
0,other,1,-1
1,cinematic,1,-1
2,dead,1,-1
3,alive,1,-1
4,small,1,-1
...,...,...,...
42415,bad,2,-1
42416,typical,1,-1
42417,tough,1,-1
42418,much,1,-1


In [22]:
neg_vocab = neg_vocab.groupby(["Word"]).aggregate(np.sum).sort_values(by = "Label", ascending = True)
neg_vocab = pd.DataFrame(np.column_stack((neg_vocab.index, neg_vocab)), columns = ["Word", "Count neg"])
neg_vocab

Unnamed: 0,Word,Count neg
0,good,-3066
1,much,-1488
2,other,-1175
3,most,-1071
4,bad,-1046
...,...,...
1788,loutish,-1
1789,argentinian,-1
1790,spielbergian,-1
1791,disagreeable,-1


In [24]:
from collections import Counter
from nltk.util import ngrams 

text = "the quick person did not realize his speed and the quick person bumped "
n_gram = 3
Counter(ngrams(text.split(), n_gram))

Counter({('the', 'quick', 'person'): 2,
         ('quick', 'person', 'did'): 1,
         ('person', 'did', 'not'): 1,
         ('did', 'not', 'realize'): 1,
         ('not', 'realize', 'his'): 1,
         ('realize', 'his', 'speed'): 1,
         ('his', 'speed', 'and'): 1,
         ('speed', 'and', 'the'): 1,
         ('and', 'the', 'quick'): 1,
         ('quick', 'person', 'bumped'): 1})

In [115]:
vocab = pd.concat([pos_vocab, neg_vocab], join = "outer")
vocab = vocab.fillna(0)
p = vocab['Count pos'].astype(int)
n = vocab['Count neg'].astype(int)
dif = p+n
sums = sum(abs(dif))

vocab['Dif'] = (dif/sums)*10 
vocab = vocab[["Word", "Dif"]]
vocab

Unnamed: 0,Word,Dif
0,great,0.250896
1,good,0.250896
2,other,0.155317
3,little,0.131422
4,full,0.119474
...,...,...
188,intrusive,-0.011947
189,intuitive,-0.011947
190,laughable,-0.011947
191,lillian,-0.011947


In [116]:
l = list(pos_train.Text)

def sumOfVocabValues(text, vocab):
    word_vec = []
    for word in text:
        if word in list(vocab.Word):
            word_vec.append(word)
    
    return sum(vocab.Dif[vocab.Word.isin(word_vec)])

sumOfVocabValues(l[0], vocab)

-0.04778972520908003

In [126]:
l = list(pos_train.Text)

sent_values = [sumOfVocabValues(text, vocab) for text in l]
pos_train = pd.DataFrame(sent_values, columns = ["Value"])

pos_train["Label"] = 1
pos_train

Unnamed: 0,Value,Label
0,-0.04779,1
1,-0.298686,1
2,-0.035842,1
3,0.179211,1
4,-0.083632,1
5,0.131422,1
6,0.167264,1
7,-0.035842,1
8,0.119474,1
9,-0.143369,1


In [127]:
l = list(neg_train.Text)

sent_values = [sumOfVocabValues(text, vocab) for text in l]
neg_train = pd.DataFrame(sent_values, columns = ["Value"])

neg_train["Label"] = -1
neg_train

Unnamed: 0,Value,Label
0,-0.107527,-1
1,-0.095579,-1
2,0.023895,-1
3,-0.04779,-1
4,-0.191159,-1
5,-0.107527,-1
6,-0.167264,-1
7,-0.035842,-1
8,-0.095579,-1
9,-0.083632,-1
