In [1]:
import re
import tqdm
import string
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import pandas as pd
import numpy as np
import warnings
from argparse import Namespace
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

warnings.filterwarnings('ignore')

In [2]:
args = Namespace(
    train_file_path = './data/raw_data/labeledTrainData.tsv',
    test_file_path = './data/raw_data/testData.tsv'
)

In [3]:
## Load Data
train_df = pd.read_csv(args.train_file_path, delimiter='\t')
test_df = pd.read_csv(args.test_file_path, delimiter='\t')

In [4]:
## Split data into train and val data
X_train, X_val, y_train, y_val = train_test_split(train_df['review'].tolist(),train_df['sentiment'].tolist(), 
                                                  test_size = 0.3,
                                                 random_state = 121)

In [5]:
len(X_train), len(y_train), len(X_val), len(y_val)

(17500, 17500, 7500, 7500)

In [6]:
tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')
remove_words = string.punctuation + '0123456789'

In [7]:
def process_review(review):
    
    review = BeautifulSoup(review).get_text()    
    review = re.sub('^\w+','', review)
    review = re.sub('[.]','', review)
    tokens = tokenizer.tokenize(review)
    tokens = [lemmatizer.lemmatize(w.lower()) for w in tokens]
    clean_tokens = [w for w in tokens if w not in stop_words and w not in remove_words]
    
    return " ".join(clean_tokens)

In [8]:
clean_X_train = list(map(lambda x: process_review(x),X_train))
clean_X_val = list(map(lambda x: process_review(x),X_val))

In [9]:
## Create TF-IDF matrix on train data
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1,1))
X_train_features = vectorizer.fit_transform(clean_X_train)
len(vectorizer.vocabulary_)

20788

In [10]:
model = LogisticRegression()
model.fit(X_train_features,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
print(f"Train Accuracy : {model.score(X_train_features,y_train)}")

Train Accuracy : 0.9320571428571428


In [12]:
## Convert into features
X_val_features = vectorizer.transform(clean_X_val)
print(f"Val Accuracy : {model.score(X_val_features,y_val)}")

Val Accuracy : 0.8864


## 2- gram

In [13]:
## Create TF-IDF matrix on train data
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1,2))
X_train_features = vectorizer.fit_transform(clean_X_train)
len(vectorizer.vocabulary_)

66323

In [14]:
model = LogisticRegression()
model.fit(X_train_features,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
print(f"Train Accuracy : {model.score(X_train_features,y_train)}")

Train Accuracy : 0.9466857142857142


In [16]:
## Convert into features
X_val_features = vectorizer.transform(clean_X_val)
print(f"Val Accuracy : {model.score(X_val_features,y_val)}")

Val Accuracy : 0.8884
