## Assignment Natural Language Understanding

Elisabeth Putri - 20306250

##### Sentiment analysis using logistic regression

Train a binary classifier for classifying movie reviews from IMDB database. The task is to classify them as positive or
negative.

In [96]:
# importing the libraries
import os, re
import numpy as np
import pandas as pd
import time
import random
import string
from pathlib import Path
from math import floor
from collections import Counter
from collections import defaultdict

After importing the libraries, we are going to access the files directory. First, I create list containing all the text path in imdb directory.

In [97]:
imdb = list(os.walk("imdb_dataset"))
data_dirs = []

for root, dirs, files in imdb:
    data_dirs.append([os.path.join(root, name) for name in files])
    
# data_dirs[1] is a list of negative sentiments
# data_dirs[2] is a list of positive sentiments

The datadirs is a combined list containing data from neg2 and pos2. Both of the class can be accessed by list index. Index 0 for data from neg2 and index 1 for data from pos2. The next step is giving an information about their class in each path of data.

In [98]:
negdata = data_dirs[1]
posdata = data_dirs[2]

def split(listabc, chunk_size):
    for i in range(0, len(listabc), chunk_size):
        yield listabc[i:i + chunk_size]

splittedneg = list(split(negdata, 1))
splittedpos = list(split(posdata, 1))

appendedneg = []
for n in splittedneg:
    appendedneg.append(n + [0])

#print(appendedneg)
    
appendedpos = []
for p in splittedpos:
    appendedpos.append(p + [1])

#print(appendedpos)

After each data has their own information about where their class is, now we are going to combine the data. This compilation is proposed to access each class of data by list index.

In [99]:
data_dirs = [appendedneg, appendedpos]
# data_dirs[0]
# data_dirs[1]

The data are now ready to be splitted into train and test dataset as the function below.

In [100]:
from math import floor

def train_test_split(): 
    #the following lists contain the file names in the 'neg' and 'pos' directories respectively  
    traineg = appendedneg[:floor(0.75*len(appendedneg))]
    testneg = appendedneg[floor(0.75*len(appendedneg)):]
    
    trainpos = appendedpos[:floor(0.75*len(appendedpos))]
    testpos = appendedpos[floor(0.75*len(appendedpos)):]
    
    tr = traineg + trainpos
    ts = testneg + testpos
    
    random.shuffle(tr)
    random.shuffle(ts)
    return tr, ts

tr, ts = train_test_split()
tr_size, ts_size = len(tr), len(ts)

Some negative stopwords are useful for sentiment analysis because it is able to change the meaning of the sentence. Regarding to that, the modified stopwords list are listed as below.

In [101]:
# Get rid of punctuation but keep the spaces, be careful with the stopwords
from nltk.corpus import stopwords
stopWords = sorted(list(stopwords.words('english')))

regex = re.compile(r'[a-z]*n\'*t*')
stopWords2 = [a for a in stopWords if not regex.match(a)]
stopWords2.append(' ')
#print(stopWords2)

Creating vocabulary from the training data. The defined function below is porposed to tokenize each text document (proc_text) and also build a vocab dictionary.

In [162]:
def proc_text(txt):
    tok_list = []
    a = txt.translate(str.maketrans('', '', string.punctuation))
    txt = re.split(r"\s+", a)
    for token in txt:
        if not token in stopWords2:
            tok_list.append(token.lower()) # to make analysis easier, all the letter are converged into lowercase
    return tok_list

In [186]:
def build_vocab(tr):
    # building a global vocabulary
    words = []
    for a in tr:
        i = a[0]
        with open(i, encoding='utf-8') as f:
            b = proc_text(f.read())
            words.append(b)
    
    vodoc = [] # vocabulary in each document
    for c in words:
        vodoc.append(set(c))
        
    voc = []   # merging all the subset for each documents into one list
    for sublist in vodoc:
        for item in sublist:
            voc.append(item)
    vocab = set(voc)
    
    # Creating a dictionary
    vocab_dict = {}
    for e in voc: # we use voc instead of vodoc because we are counting how many document contains each word
        vocab_dict[e] = vocab_dict.setdefault(e, 0) + 1

    return vocab_dict

After we got the vocab dictionary, we will build the tf-idf function. This function is proposed to produce unique number for each words in vocabulary.The function is used in either train and test dataset. This can be useful function to logistic regression.

In [411]:
# unique number for each words in vocabulary

def tf_idf_matrix(train, vocab_dict):
    nrow = len(train)
    ncol = len(vocab_dict)
    #update toke_dict 
    X = np.zeros((nrow, ncol), dtype=np.float32)
    y = np.zeros((nrow, 1), dtype=np.float32)
    
    # tokenizing each given doc text
    tok_list = []
    for a in train:
        i = a[0]
        with open(i, encoding='utf-8') as f:
            b = proc_text(f.read())
            tok_list.append(b)
               
    # calculating the term frequency
    term_frequency_final = []
    for idx in range( len(tok_list)):
        tok_list_per_idx = tok_list[idx]
        list_inner = [tok_list_per_idx]
        term_frequency = {}
        for o in list_inner:
            #print(o)
            aa = Counter(o)
            #print(aa)
            for k, v in aa.items():
                term_frequency[k] = v
            
    term_frequency_final.append(term_frequency)
    
    # storing term frequency
    X = pd.DataFrame(term_frequency_final)
    X = df.replace(np.nan,0)

    for indx in range(nrow):
        tok_list = []
        for token in tok_list:
            if vocab_dict[token] == 0:
                continue
    
    # computing idf-value
    vodoc = [] # vocabulary in each document
    for c in tok_list:
        vodoc.append(set(c))
        
    voc = []   # merging all the subset for each documents into one list
    for sublist in vodoc:
        for item in sublist:
            voc.append(item)
    
    bb_dict = {}
    bb = Counter(voc)
    for k, v in bb.items():
        bb_dict[k] = v

    updateidf = ([math.log(len(tr)/value) for value in bb_dict.values()])
    keys = list(bb_dict.keys())

    idf_dict = dict(zip(keys, updateidf))
    
    # computing tfidf
    tfidf_dict = {}
    for a in term_frequency_final:
        for b in a.keys():
            for c in idf_dict.keys():
                if b == c:
                    tfidf = a.get(b) * idf_dict.get(b)
                    tfidf_dict[b] = tfidf
    
    # Updating X entry
    X = pd.DataFrame(tfidf_dict)
    X = df.replace(np.nan, 0)
    
    # extracting the y vector value
    y = []
    for a in train:
        i = a[1]
        y.append(i)
        
    return X, y

After we know the tfidf of each word in the dataset given to that function, we will calculate the Logistic Regression using the function below.

In [423]:
# for training data to create the model

def apply_logit(X, yt):
    from sklearn.linear_model import LogisticRegression
    global tr
    a = build_vocab(tr)
    X, y = tf_idf_matrix(tr, a)
    ab = LogisticRegression(random_state=0, tol = 0.001, max_iter = 500).fit(X, y)
       
    ab.predict(yt)
    es = ab.get_params
    return es

In [428]:
test = ts[0:2]
aa = build_vocab(ts[0:3])
X, yt = tf_idf_matrix(test, aa)
param = apply_logit(X, yt)

ValueError: Found input variables with inconsistent numbers of samples: [3, 5364]

test the model, report accuracy, recall and precision


In [None]:
param = apply_logit(X, yt)

In [427]:
def test_model(X, y):
    tp, tn, fp, fn = 0, 0, 0, 0
    
    if yt == y :
        if y == 0:
            tp += 1
        elif y == 1:
            fp += 1
    else:
        if y == 0:
            fn += 1
        elif y == 1:
            tn += 1
                
    #use the object "param" above, this is the output after training
    acc = 0
    prec = 0
    recall = 0
    f1 = 0
    
#your code
    prec = tp/(tp+fp)
    rec = tp/(tp+fn)
    acc = (tp+tn)/(tp+tn+fp+fn)
    f1 = tp/(tp +(fp+fn)/2)
    return acc, prec, recall, f1
    