In [1]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
import pandas as pd
import numpy as np
import warnings
from argparse import Namespace
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


warnings.filterwarnings('ignore')

In [2]:
args = Namespace(
    train_file_path = './data/raw_data/labeledTrainData.tsv',
    test_file_path = './data/raw_data/testData.tsv'
)

In [3]:
## Load Data
train_df = pd.read_csv(args.train_file_path, delimiter='\t')
test_df = pd.read_csv(args.test_file_path, delimiter='\t')

In [4]:
## Split data into train and val data
X_train, X_val, y_train, y_val = train_test_split(train_df['review'].tolist(),train_df['sentiment'].tolist(), test_size = 0.3)

In [5]:
len(X_train), len(y_train), len(X_val), len(y_val)

(17500, 17500, 7500, 7500)

In [6]:
tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')
remove_words = string.punctuation + '0123456789'

In [7]:
def process_review(review):
    
    review = BeautifulSoup(review).get_text()    
    review = re.sub('^\w+','', review)
    tokens = tokenizer.tokenize(review)
    clean_tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens if w not in stop_words and w not in remove_words and w.isalpha()]
    
    return clean_tokens

In [8]:
def build_freq(reviews, labels, threshold = 2):
    freqs = defaultdict(int)
    
    for review, label in zip(reviews, labels):
        clean_review = process_review(review)
        for w in clean_review:
            freqs[(w,label)] += 1
            
    freqs = {w:v for w,v in freqs.items() if v>threshold}
                
    return freqs

In [9]:
def create_feature(review, freqs):
    
    # feature : [1, sum_of_pos_word_freq, sum_of_neg_word_freq]
    
    X = np.zeros(3)
    X[0] = 1
    
    for w in process_review(review):
        X[1] += freqs.get((w,1),0)
        X[2] += freqs.get((w,0),0)
    
    return X

In [10]:
## Bild Vocab
freqs = build_freq(X_train, y_train)

In [11]:
## Train Features
X_train_features = list(map(lambda x: create_feature(x, freqs),X_train))
X_train_features = np.vstack(X_train_features)
y_train_features = np.array(y_train)

## Val fetaures
X_val_features = list(map(lambda x: create_feature(x, freqs),X_val))
X_val_features = np.vstack(X_val_features)
y_val_features = np.array(y_val)

In [12]:
model = LogisticRegression()
model.fit(X_train_features,y_train_features)

print(f"Train Accuracy : {model.score(X_train_features,y_train_features)}")
print(f"Val Accuracy : {model.score(X_val_features,y_val_features)}")

Train Accuracy : 0.6746285714285715
Val Accuracy : 0.6748


## Generate Test Prediction

In [None]:
## Test fetaures
X_test_features = list(map(lambda x: create_feature(x, freqs),test_df.review.tolist()))
X_test_features = np.vstack(X_test_features)

In [None]:
pred = model.predict(X_test_features)
pred