In [14]:
import tqdm
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
import pandas as pd
import numpy as np
import warnings
from argparse import Namespace
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


warnings.filterwarnings('ignore')

In [15]:
args = Namespace(
    train_file_path = './data/raw_data/labeledTrainData.tsv',
    test_file_path = './data/raw_data/testData.tsv'
)

In [16]:
## Load Data
train_df = pd.read_csv(args.train_file_path, delimiter='\t')
test_df = pd.read_csv(args.test_file_path, delimiter='\t')

In [17]:
## Split data into train and val data
X_train, X_val, y_train, y_val = train_test_split(train_df['review'].tolist(),train_df['sentiment'].tolist(), test_size = 0.3)

In [18]:
len(X_train), len(y_train), len(X_val), len(y_val)

(17500, 17500, 7500, 7500)

In [19]:
tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')
remove_words = string.punctuation + '0123456789'

In [20]:
def process_review(review):
    
    review = BeautifulSoup(review).get_text()    
    review = re.sub('^\w+','', review)
    tokens = tokenizer.tokenize(review)
    clean_tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens if w not in stop_words and w not in remove_words if w.isalpha()]
    
    return clean_tokens

In [26]:
def build_freq(reviews, labels, threshold = 2):
    freqs = defaultdict(int)
    
    for review, label in tqdm.tqdm(zip(reviews, labels),total = len(reviews)):
        clean_review = process_review(review)
        for w in clean_review:
            freqs[(w,label)] += 1
            
    freqs = {w:v for w,v in freqs.items() if v>threshold}
                
    return freqs

In [27]:
## Build Vocab
freqs = build_freq(X_train, y_train)
len(freqs)

100%|██████████| 17500/17500 [01:40<00:00, 174.87it/s]


37914

In [28]:
## Calculate Probability
probs_df = pd.DataFrame()
for k,v in tqdm.tqdm(freqs.items()):
    probs_df.loc[k[0],k[1]] = v
probs_df.fillna(0, inplace = True)

100%|██████████| 37914/37914 [02:57<00:00, 214.12it/s]


In [38]:
probs_df.rename(columns = {0:'pos_freq',1:'neg_freq'}, inplace = True)

In [31]:
## Normalize probability using laplacian smoothing
total_pos_freq = probs_df['pos_freq'].sum()
total_neg_freq = probs_df['neg_freq'].sum()
vocab_size = probs_df.shape[0]

In [35]:
probs_df['pos_prob'] = probs_df[0].apply(lambda x: (x+1)/(vocab_size+total_pos_freq))
probs_df['neg_prob'] = probs_df[1].apply(lambda x: (x+1)/(vocab_size+total_neg_freq))

In [36]:
probs_df

Unnamed: 0,0,1
movie,0.017342,1.335375e-02
actually,0.001716,1.174750e-03
worse,0.000732,1.286584e-04
i,0.027639,2.275174e-02
ever,0.002141,1.710166e-03
suffered,0.000050,5.047366e-05
absolute,0.000115,1.009473e-04
it,0.005767,6.115230e-03
got,0.001418,1.138132e-03
terrible,0.000767,1.474623e-04


In [13]:
## Naive bayes
## log_likihood = logprior + log(pos/neg)
## if log_likihood > 0 then 1 else 0

In [None]:
## Train Features
X_train_features = list(map(lambda x: create_feature(x, freqs),X_train))
X_train_features = np.vstack(X_train_features)
y_train_features = np.array(y_train)

## Val fetaures
X_val_features = list(map(lambda x: create_feature(x, freqs),X_val))
X_val_features = np.vstack(X_val_features)
y_val_features = np.array(y_val)

In [None]:
model = LogisticRegression()
model.fit(X_train_features,y_train_features)

print(f"Train Accuracy : {model.score(X_train_features,y_train_features)}")
print(f"Val Accuracy : {model.score(X_val_features,y_val_features)}")

## Generate Test Prediction

In [None]:
## Test fetaures
X_test_features = list(map(lambda x: create_feature(x, freqs),test_df.review.tolist()))
X_test_features = np.vstack(X_test_features)

In [None]:
pred = model.predict(X_test_features)
pred