In [1]:
import tqdm
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
import pandas as pd
import numpy as np
import warnings
from argparse import Namespace
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


warnings.filterwarnings('ignore')

In [2]:
args = Namespace(
    train_file_path = './data/raw_data/labeledTrainData.tsv',
    test_file_path = './data/raw_data/testData.tsv'
)

In [3]:
## Load Data
train_df = pd.read_csv(args.train_file_path, delimiter='\t')
test_df = pd.read_csv(args.test_file_path, delimiter='\t')

In [4]:
## Split data into train and val data
X_train, X_val, y_train, y_val = train_test_split(train_df['review'].tolist(),train_df['sentiment'].tolist(), 
                                                  test_size = 0.3,
#                                                   stratify = train_df['sentiment'],
                                                  random_state = 121)

In [5]:
len(X_train), len(y_train), len(X_val), len(y_val)

(17500, 17500, 7500, 7500)

In [6]:
tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')
remove_words = string.punctuation + '0123456789'

In [7]:
def process_review(review):
    
    review = BeautifulSoup(review).get_text()    
    review = re.sub('^\w+','', review)
    tokens = tokenizer.tokenize(review)
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
    clean_tokens = [w for w in tokens if w not in stop_words and w not in remove_words]
    
    return clean_tokens

In [8]:
def build_freq(reviews, labels, threshold = 2):
    freqs = defaultdict(int)
    
    Dpos = 0
    Dneg = 0
    
    for review, label in tqdm.tqdm(zip(reviews, labels),total = len(reviews)):
        clean_review = process_review(review)
        for w in clean_review:
            freqs[(w,label)] += 1
            
        if label == 0:
            Dneg += 1
        if label == 1:
            Dpos += 1
            
    freqs = {w:v for w,v in freqs.items() if v>threshold}
                
    return freqs, Dpos, Dneg

In [9]:
## Build Vocab
freqs, Dpos, Dneg = build_freq(X_train, y_train)
len(freqs), Dpos, Dneg

100%|██████████| 17500/17500 [00:58<00:00, 301.63it/s]


(51004, 8702, 8798)

In [10]:
## Calculate Probability
probs_df = pd.DataFrame()
for k,v in tqdm.tqdm(freqs.items()):
    probs_df.loc[k[0],k[1]] = v
probs_df.fillna(0, inplace = True)

100%|██████████| 51004/51004 [03:47<00:00, 224.06it/s]


In [11]:
probs_df.rename(columns = {1:'pos_freq',0:'neg_freq'}, inplace = True)

In [12]:
## Normalize probability using laplacian smoothing
total_pos_freq = probs_df['pos_freq'].sum()
total_neg_freq = probs_df['neg_freq'].sum()
vocab_size = probs_df.shape[0]

In [13]:
probs_df['pos_prob'] = probs_df['pos_freq'].apply(lambda x: (x+1)/(vocab_size+total_pos_freq))
probs_df['neg_prob'] = probs_df['neg_freq'].apply(lambda x: (x+1)/(vocab_size+total_neg_freq))

In [14]:
probs_df['lambda'] = probs_df.pos_freq/probs_df.neg_prob
probs_df['log_lambda'] = np.log(probs_df['lambda'])

In [15]:
log_prior = np.log(Dpos/Dneg)

In [16]:
probs_df['log_lambda'] = probs_df['log_lambda'].replace(float('-inf'),-100000)

In [17]:
def calculate_prediction(reviews, log_prior, probs):
    
    preds = []
    reviewVal = 0
    for review in tqdm.tqdm(reviews):
        reviewVal = 0
        for w in process_review(review):
            if w in probs.index:
                reviewVal += probs.loc[w,'log_lambda']
        reviewVal += log_prior
        
        if reviewVal>0:
            preds.append(1)
        else:
            preds.append(0)
    return preds

In [18]:
def calculate_accuracy(pred, y_train):
    
    correct = sum(np.array(pred) == np.array(y_train))
    return correct/len(pred)

In [19]:
train_preds = calculate_prediction(X_train, log_prior, probs_df)
train_accuracy = calculate_accuracy(train_preds,y_train)

100%|██████████| 17500/17500 [01:18<00:00, 223.29it/s]


In [20]:
val_preds = calculate_prediction(X_val, log_prior, probs_df)
val_accuracy = calculate_accuracy(val_preds,y_val)

100%|██████████| 7500/7500 [00:33<00:00, 223.45it/s]


In [21]:
print(f"Train accuracy : {train_accuracy}")
print(f"Val accuracy : {val_accuracy}")

Train accuracy : 0.7246857142857143
Val accuracy : 0.5958666666666667
