## Read Data

In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
import re
from bs4 import BeautifulSoup
import contractions
from sklearn.metrics import classification_report
from textblob import TextBlob
from nltk.tokenize import word_tokenize

In [None]:
# Load only the review text and its star rating from the tab-separated dump.
df = pd.read_csv(
    "data.tsv",
    sep='\t',
    usecols=['review_body', 'star_rating'],
)

## Keep Reviews and Ratings

In [None]:
# Coerce ratings to numbers; values that cannot be parsed become NaN.
df['star_rating'] = pd.to_numeric(df['star_rating'], errors='coerce')
# Drop rows without a usable rating. The .copy() detaches the filtered frame so
# the assignments below modify it directly instead of raising
# chained-assignment warnings.
df = df.dropna(subset=['star_rating']).copy()
# Plain column assignment replaces the column, so the int dtype sticks.
# `df.loc[:, col] = ...` writes into the existing float column on recent
# pandas releases and would leave the ratings as floats.
df['star_rating'] = df['star_rating'].astype(int)
# NOTE: astype(str) turns a missing review body into the literal string 'nan'.
df['review_body'] = df['review_body'].astype(str)

## Randomly select 20000 reviews from each rating class



In [None]:
# Build the working sample: shuffle each star-rating class (1-5) and keep at
# most `samples_per_class` reviews from it.
samples_per_class = 20000
class_samples = [
    df[df['star_rating'] == rating]
    .sample(frac=1)
    .reset_index(drop=True)
    .iloc[:samples_per_class]
    for rating in range(1, 6)
]
# DataFrame.append was removed in pandas 2.0; pd.concat joins all class
# samples in a single pass instead of re-copying the frame on every iteration.
data_set = pd.concat(class_samples, ignore_index=True)
# Shuffle again so the classes are interleaved rather than grouped by rating.
data_set = data_set.sample(frac=1).reset_index(drop=True)
print(data_set.head())
# Progress counter incremented by spell_correction (declared `global` there).
count = 0
data_set.to_csv("review_sample.csv")

def get_avg_length(data):
    """Return the mean number of characters per review in ``data``.

    Args:
        data: Table exposing a ``review_body`` column of strings and whose
            ``len()`` is its number of rows (e.g. a pandas DataFrame).

    Returns:
        Total characters across ``review_body`` divided by ``len(data)``,
        or ``0.0`` when ``data`` has no rows.
    """
    row_count = len(data)
    if row_count == 0:
        # An empty sample has no meaningful average; avoid ZeroDivisionError.
        return 0.0
    # Named `total_chars` so it does not shadow the module-level `count`.
    total_chars = sum(len(review) for review in data['review_body'])
    return total_chars / row_count

In [None]:
# Preview the first 20 rows of the sampled, shuffled reviews.
print(data_set.head(20)) 

In [None]:
# Vocabulary and spell checker consumed by spell_correction.
from nltk.corpus import brown
nltk.download('brown')
from spellchecker import SpellChecker
spell = SpellChecker()
word_list = brown.words()
# spell_correction tests every token for membership in this set; tokens found
# here are kept as-is without asking the spell checker.
word_set = set(word_list)

def print_metrics(report, keys=('1', '2', '3', '4', '5', 'macro avg')):
    """Format precision, recall and F1 for selected rows of a report.

    Despite the name, nothing is printed here; callers print the result.

    Args:
        report: Mapping from row label to a mapping holding ``'precision'``,
            ``'recall'`` and ``'f1-score'`` entries (the shape passed in by
            this file via ``classification_report(..., output_dict=True)``).
        keys: Row labels to include, in output order. Defaults to the five
            star-rating classes followed by the macro average.

    Returns:
        A string with one ``"<key> - precision:… , recall:… , f1-score:…"``
        line per key, each terminated by a newline.

    Raises:
        KeyError: If a requested key or metric is missing from ``report``.
    """
    lines = []
    for key in keys:
        metrics = report[key]
        row = " , ".join([
            "precision:" + str(metrics['precision']),
            "recall:" + str(metrics['recall']),
            "f1-score:" + str(metrics['f1-score']),
        ])
        lines.append(key + " - " + row + "\n")
    # Join once instead of growing a string with += inside the loop.
    return "".join(lines)

def spell_correction(x):
    """Return ``x`` with tokens missing from ``word_set`` spell-corrected.

    Each call increments the module-level ``count`` and prints it on every
    1000th call as a progress indicator.

    Args:
        x: Review text; it is split into tokens with ``word_tokenize``.

    Returns:
        The tokens joined by single spaces. A token found in ``word_set`` is
        kept unchanged; any other token is replaced by ``spell.correction``'s
        result, falling back to the original token when that result is None.
    """
    global count
    count += 1
    if count % 1000 == 0:
        print(count)
    corrected_words = []
    for word in word_tokenize(x):
        if word in word_set:
            corrected_words.append(word)
            continue
        suggestion = spell.correction(word)
        # Keep the original token when the checker has no suggestion.
        corrected_words.append(word if suggestion is None else suggestion)
    return " ".join(corrected_words)
            

# Data Cleaning



# Pre-processing

In [None]:

import time

# Average review length (in characters) before cleaning.
print(get_avg_length(data_set))
#converting reviews to lower cases
data_set['review_body'] = data_set['review_body'].str.lower()
#extracting content in html tags
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns), so output can vary by machine.
data_set['review_body'] = data_set['review_body'].apply(lambda x : BeautifulSoup(x, "html.parser").get_text())
#removing all urls in the review
data_set['review_body'] = data_set['review_body'].apply(lambda x : re.sub(r'http\S+','',x).strip())
#expanding contraction words
#data_set['review_body'] = data_set['review_body'].apply(lambda x : contractions.fix(x))
#removing non alphabetical characters
# Each token is stripped of everything outside a-z; tokens that end up empty
# leave extra spaces behind, which the next step collapses.
data_set['review_body'] = data_set['review_body'].apply(lambda x : " ".join(re.sub('[^a-z]+','', word) for word in word_tokenize(x)))
#removing extra spaces between words
data_set['review_body'] = data_set['review_body'].apply(lambda x : re.sub(' +',' ',x).strip())
# Average review length after cleaning, for comparison with the value above.
print(get_avg_length(data_set))

    

## Remove the stop words

In [None]:
from nltk.corpus import stopwords
# Fetch the corpus first, like the other nltk.download calls in this file;
# stopwords.words raises LookupError when the corpus is not installed.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Stop-word filtering is currently disabled; uncomment the next line to apply it.
#data_set['review_body'] = data_set['review_body'].apply(lambda x : " ".join([token for token in x.split() if token not in stop_words])) 
print(get_avg_length(data_set))

## Perform lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import time
def pos_tagger(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    Args:
        nltk_tag: Tag string as produced by ``nltk.pos_tag``.

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``
        for tags starting with 'J', 'V', 'N' or 'R' respectively; ``None``
        for any other tag.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None
def lemmatize_words(sentence):
    """Lemmatize every token of ``sentence`` using its part of speech.

    Args:
        sentence: Text to process; it is tokenized with ``word_tokenize`` and
            tagged with ``nltk.pos_tag``.

    Returns:
        The tokens joined by single spaces. A token whose tag ``pos_tagger``
        maps to a WordNet POS is lemmatized with that POS; any other token is
        kept unchanged.
    """
    # Build the lemmatizer once per call rather than once per word.
    lemmatizer = WordNetLemmatizer()
    final_words = []
    for word, nltk_tag in nltk.pos_tag(word_tokenize(sentence)):
        tag = pos_tagger(nltk_tag)
        final_words.append(word if tag is None else lemmatizer.lemmatize(word, tag))
    return " ".join(final_words)
    

# A single lemmatizer is shared by all reviews instead of building one per token.
lemmatizer = WordNetLemmatizer()
# Start the clock before the work so the printed figure is the lemmatization
# time; previously the timer only wrapped a commented-out line.
start_time = time.time()
data_set['review_body'] = data_set['review_body'].apply(
    lambda x: " ".join(lemmatizer.lemmatize(token) for token in word_tokenize(x))
)
# POS-aware alternative (disabled):
#data_set['review_body'] = data_set['review_body'].apply(lambda x : lemmatize_words(x))
print(time.time() - start_time)


In [None]:
# Preview the first 20 processed reviews.
print(data_set.head(20))
# Spell correction is disabled; uncomment the next line to run it on every review.
#data_set['review_body'] = data_set['review_body'].apply(lambda x : spell_correction(x))
# Average review length (in characters) at this point in the pipeline.
print(get_avg_length(data_set))

# TF-IDF Feature Extraction

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Split the raw text first so the vocabulary, IDF weights and scaling
# statistics are learned from the training reviews only. Fitting them on the
# full data set before splitting leaks information from the test reviews.
x_train_text, x_test_text, y_train, y_test = train_test_split(
    data_set['review_body'], data_set['star_rating'], test_size = 0.2
)

# Unigram + bigram TF-IDF; terms appearing in fewer than 5 training reviews are dropped.
tf_idf_vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df = 5).fit(x_train_text)
# NOTE: tf_idf_transform now holds the TF-IDF matrix of the training reviews only.
tf_idf_transform = tf_idf_vectorizer.transform(x_train_text)

# with_mean=False scales by variance only (no centering).
scaler = StandardScaler(with_mean = False).fit(tf_idf_transform)
tf_idf_x_train = scaler.transform(tf_idf_transform)
# The test reviews reuse the vectorizer and scaler fitted on the training data.
tf_idf_x_test = scaler.transform(tf_idf_vectorizer.transform(x_test_text))
# Name kept for any later cell that refers to it; it is the scaled training matrix.
normalized_vector = tf_idf_x_train


In [None]:
from sklearn.preprocessing import StandardScaler


In [None]:
# Show the container type returned by the TF-IDF vectorizer's transform.
type(tf_idf_transform)

# Perceptron

In [None]:
from sklearn.linear_model import Perceptron
# Train a default-configured perceptron on the training features
# (fit returns the estimator itself), then score it on the held-out split.
perceptron_model = Perceptron().fit(tf_idf_x_train, y_train)
y_test_pred_perceptron = perceptron_model.predict(tf_idf_x_test)
report = classification_report(
    y_test,
    y_test_pred_perceptron,
    output_dict=True,
)
print(print_metrics(report))

# SVM

In [None]:
from sklearn.svm import LinearSVC
# Train a linear SVM with default settings and report per-class metrics
# on the held-out split.
svm_classifier = LinearSVC().fit(tf_idf_x_train, y_train)
y_test_pred_svm = svm_classifier.predict(tf_idf_x_test)
report = classification_report(
    y_test,
    y_test_pred_svm,
    output_dict=True,
)
print(print_metrics(report))

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Train logistic regression (iteration cap raised to 800) and report
# per-class metrics on the held-out split.
lr_classifier = LogisticRegression(max_iter=800).fit(tf_idf_x_train, y_train)
y_test_pred_lr = lr_classifier.predict(tf_idf_x_test)
report = classification_report(
    y_test,
    y_test_pred_lr,
    output_dict=True,
)
print(print_metrics(report))

# Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train multinomial naive Bayes with default settings and report
# per-class metrics on the held-out split.
nb_model = MultinomialNB().fit(tf_idf_x_train, y_train)
y_test_pred_nb = nb_model.predict(tf_idf_x_test)
report = classification_report(
    y_test,
    y_test_pred_nb,
    output_dict=True,
)
print(print_metrics(report))