#### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import pickle

#### Loading the Dataset

In [2]:
con = sqlite3.connect('database.sqlite')

filtered_data = pd.read_sql_query("""
SELECT * 
FROM Reviews
WHERE Score!=3""",con)

In [3]:
actual_score = filtered_data['Score']

def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'


filtered_data['Score']=filtered_data['Score'].map(partition)

#### Sorting the Data based on Time

In [4]:
sorted_data = filtered_data.sort_values("Time",axis=0,ascending=True)

final = filtered_data.drop_duplicates(subset={'UserId','Time','ProfileName','Text'},keep='first',inplace=False)

#### Duplicate datas have been discarded

final = final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]

#### Data Ceaning and Preprocessing

In [5]:
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))


stop.remove('not')


In [8]:
from nltk.stem.wordnet import WordNetLemmatizer

In [10]:
lemma =WordNetLemmatizer()

In [12]:
import re

def cleanhtml(sentence): #function to clean the word of any html-tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|%|!|+|*|@|&|^|`|~|\'|"|#|=]',r'',sentence)
    cleaned = re.sub(r'[:|;|.|)|(|,|\|/|_|-]',r' ',cleaned)
    cleaned = re.sub(r'\s+',r' ',cleaned)
    return cleaned

def cleannewline(sentance):
    cleaned = re.sub(r'\n',r' ',sentance)
    return cleaned

def cleannumbers(sentance):
    cleaned=re.sub(r'\s\d*',r' ',sentance)
    return cleaned

In [13]:
import time

In [15]:
t0 = time.clock()
i=0
str1=''
final_string = []
all_positive_words = []
all_negative_words = []
s = ''
for sent in final['Text'].values:
    filtered_sentence = []
    sent = cleanhtml(sent)
    sent=cleannumbers(sent)
    sent=cleannewline(sent)
    for w in sent.split(" "):
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha())&(len(cleaned_words)>2)):
                if(cleaned_words.lower() not in stop):
                    s=lemma.lemmatize(cleaned_words.lower(),'v')
                    filtered_sentence.append(s)
                    if (final['Score'].values)[i]=='positive':
                        all_positive_words.append(s)
                    if (final['Score'].values)[i]=='negative':
                        all_negative_words.append(s)
                else:
                    continue
        else:
            continue
    str1 = " ".join(filtered_sentence)
    final_string.append(str1)
    i+=1
print("="*120)
print('\n')
print("Time taken for text pre processing: ",time.clock()-t0)
print('\n')
print("="*120)



Time taken for text pre processing:  293.24802999999997




In [16]:
final_df = pd.DataFrame(final_string)

final_df['Time']=final['Time']

final_df['Score']=final['Score']

In [17]:
final_df.dropna(axis=0,inplace=True)

In [18]:
final_df['Score']=final_df['Score'].map({'positive':1,'negative':-1})

In [19]:
final_df.head(5)

Unnamed: 0,0,Time,Score
0,buy several vitality can dog food products fin...,1303862000.0,1
1,product arrive label jumbo salt peanuts peanut...,1346976000.0,-1
2,confection around centuries light pillowy citr...,1219018000.0,1
3,look secret ingredient robitussin believe find...,1307923000.0,-1
4,great taffy great price wide assortment yummy ...,1350778000.0,1


In [20]:
X=final_df[0].values
y=final_df["Score"].values

#### Splitting Processed Data into 70% Train set and 30% Test set (From top 100k Data)

In [21]:
X_1=X[0:70000]
y_1=y[0:70000]
X_test=X[70000:100000]
y_test=y[70000:100000]

#### Bag of Words Method

In [22]:
count_vect = CountVectorizer(min_df=10)
final_counts_X1 = count_vect.fit_transform(X_1)
#### Got Bag of Words for X-train data

In [23]:
final_counts_Xtest= count_vect.transform(X_test)
#### Got Bag of Words for X-train data

#### Saving bow train,test datasets

In [24]:
with open("count_vect.txt", "wb") as fp:
    pickle.dump(count_vect, fp)

In [25]:
from scipy import sparse

In [26]:
with open("final_counts_X1.txt", "wb") as fp:
    pickle.dump(final_counts_X1, fp)
with open("final_counts_Xtest.txt", "wb") as fp:
    pickle.dump(final_counts_Xtest, fp)


#### Tfidf Method

In [27]:
tfidf_vect = TfidfVectorizer(min_df=10)
final_tfidf_X1=tfidf_vect.fit_transform(X_1)

In [28]:
final_tfidf_Xtest=tfidf_vect.transform(X_test)

#### Saving bow train,test datasets

In [29]:
with open("tfidf_vect.txt", "wb") as fp:
    pickle.dump(tfidf_vect, fp)

In [30]:
sparse.save_npz('final_tfidf_X1.npz',final_tfidf_X1)
sparse.save_npz('final_tfidf_Xtest.npz',final_tfidf_Xtest)

In [31]:
with open("final_tfidf_X1.txt", "wb") as fp:
    pickle.dump(final_tfidf_X1, fp)
with open("final_tfidf_Xtest.txt", "wb") as fp:
    pickle.dump(final_tfidf_Xtest, fp)

#### Word2Vec

In [32]:
import gensim

In [74]:
i=0
list_of_sent_X1=[]
for sent in X_1:
    list_of_sent_X1.append(sent.split())

In [75]:
i=0
list_of_sent_Xtest=[]
for sent in X_test:
    list_of_sent_Xtest.append(sent.split())

In [76]:
# min_count = 5 considers only words that occured atleast 5 times
w2v_model=gensim.models.Word2Vec(list_of_sent_X1,min_count=5,size=50, workers=4)

In [40]:
w2v_words = list(w2v_model.wv.vocab)
print("number of words that occured minimum 5 times ",len(w2v_words))
print("sample words ", w2v_words[0:50])

number of words that occured minimum 5 times  15535
sample words  ['bought', 'several', 'vitality', 'canned', 'dog', 'food', 'products', 'found', 'good', 'quality', 'product', 'looks', 'like', 'stew', 'processed', 'meat', 'smells', 'better', 'labrador', 'finicky', 'appreciates', 'arrived', 'labeled', 'jumbo', 'salted', 'peanuts', 'actually', 'small', 'sized', 'unsalted', 'not', 'sure', 'error', 'vendor', 'intended', 'represent', 'confection', 'around', 'centuries', 'light', 'citrus', 'gelatin', 'nuts', 'case', 'cut', 'tiny', 'squares', 'liberally', 'coated', 'powdered']


#### Avg W2V, TFIDF-W2V

In [41]:
tfidf_feat = tfidf_vect.get_feature_names()

In [81]:
w2v_tfidf_X1 = []
row=0;
for sent in list_of_sent_X1: # for each review/sentence 
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            # obtain the tf_idfidf of a word in a sentence/review
            if word in tfidf_feat:
                tf_idf = final_tfidf_X1[row, tfidf_feat.index(word)]
                sent_vec += (vec * tf_idf)
                weight_sum += tf_idf
            else:
                sent_vec += np.zeros(50)
    if weight_sum != 0:
        sent_vec /= weight_sum
    w2v_tfidf_X1.append(sent_vec)
    row += 1

In [82]:
w2v_tfidf_Xtest = []
row=0;
for sent in list_of_sent_Xtest: # for each review/sentence 
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            # obtain the tf_idfidf of a word in a sentence/review
            if word in tfidf_feat:
                tf_idf = final_tfidf_X1[row, tfidf_feat.index(word)]
                sent_vec += (vec * tf_idf)
                weight_sum += tf_idf
            else:
                sent_vec += np.zeros(50)
    if weight_sum != 0:
        sent_vec /= weight_sum
    w2v_tfidf_Xtest.append(sent_vec)
    row += 1

In [83]:
len(w2v_tfidf_Xtest)

30000

In [44]:
w2v_avg_X1 = []; # the avg-w2v for each sentence/review is stored in this list
for sent in list_of_sent_X1: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    w2v_avg_X1.append(sent_vec)

In [45]:
w2v_avg_Xtest = []; # the avg-w2v for each sentence/review is stored in this list
for sent in list_of_sent_Xtest: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    w2v_avg_Xtest.append(sent_vec)

In [46]:
w2v_avg_X1[0]

array([ 5.86153142e-01,  1.32833720e-02,  2.99984367e-02, -5.22968030e-01,
        4.30802614e-01, -7.31527222e-02,  2.46449096e-01, -3.66886729e-01,
        9.95677706e-01,  1.11546915e-01, -5.34215889e-01, -1.91905060e-01,
       -2.86802006e-01, -7.60731974e-02, -9.19047531e-01, -7.69377659e-01,
        5.84753209e-01, -6.93941044e-01,  1.73531016e-02, -2.22409021e-01,
       -3.62268700e-01, -4.77278809e-01,  2.41755021e-01,  5.42519509e-01,
       -1.54795681e-01,  3.13964551e-04, -4.22534948e-01,  2.72421488e-01,
       -1.24373217e-01, -1.05407354e+00,  4.42179748e-02, -2.61425074e-01,
        5.56642671e-01,  1.28112638e-01, -1.02148023e+00,  3.41930488e-01,
       -6.47521118e-01, -2.21532780e-01,  7.46036242e-01,  2.52206292e-01,
        5.53603931e-01,  1.08071832e+00,  7.66530347e-01,  3.89471272e-01,
       -1.95611678e-01, -6.06795925e-01, -2.11925665e-01, -9.95652612e-02,
       -7.89889621e-01,  4.39070828e-01])

#### Saving avg_w2v and tfidf_w2v

In [47]:
import pickle

In [84]:
with open("w2v_tfidf_X1.txt", "wb") as fp:
    pickle.dump(w2v_tfidf_X1, fp)
with open("w2v_tfidf_Xtest.txt", "wb") as fp:
    pickle.dump(w2v_tfidf_Xtest, fp)
with open("w2v_avg_X1.txt", "wb") as fp:
    pickle.dump(w2v_avg_X1, fp)
with open("w2v_avg_Xtest.txt", "wb") as fp:
    pickle.dump(w2v_avg_Xtest, fp)

In [49]:
with open("w2v_model.txt", "wb") as fp:
    pickle.dump(w2v_model, fp)

In [50]:
'''
#Loading the w2v matrices

with open("w2v_tfidf_X1.txt", "rb") as fp:
    w2v_tfidf_X1 = pickle.load(fp)
with open("w2v_tfidf_Xtest.txt", "rb") as fp:
    w2v_tfidf_Xtest = pickle.load(fp)
with open("w2v_avg_X1.txt", "rb") as fp:
    w2v_avg_X1 = pickle.load(fp)
with open("w2v_avg_Xtest.txt", "rb") as fp:
    w2v_avg_Xtest = pickle.load(fp)


#loading bow matrices

final_counts_X1 = sparse.load_npz('final_counts_X1.npz')
final_counts_Xtest = sparse.load_npz('final_counts_Xtest.npz')

#loading tfidf matrices
final_tfidf_X1 = sparse.load_npz('final_tfidf_X1.npz')
final_tfidf_Xtest = sparse.load_npz('final_tfidf_Xtest.npz')

'''

'\n#Loading the w2v matrices\n\nwith open("w2v_tfidf_X1.txt", "rb") as fp:\n    w2v_tfidf_X1 = pickle.load(fp)\nwith open("w2v_tfidf_Xtest.txt", "rb") as fp:\n    w2v_tfidf_Xtest = pickle.load(fp)\nwith open("w2v_avg_X1.txt", "rb") as fp:\n    w2v_avg_X1 = pickle.load(fp)\nwith open("w2v_avg_Xtest.txt", "rb") as fp:\n    w2v_avg_Xtest = pickle.load(fp)\n\n\n#loading bow matrices\n\nfinal_counts_X1 = sparse.load_npz(\'final_counts_X1.npz\')\nfinal_counts_Xtest = sparse.load_npz(\'final_counts_Xtest.npz\')\n\n#loading tfidf matrices\nfinal_tfidf_X1 = sparse.load_npz(\'final_tfidf_X1.npz\')\nfinal_tfidf_Xtest = sparse.load_npz(\'final_tfidf_Xtest.npz\')\n\n'

#### Saving target

In [51]:
with open("y_1.txt", "wb") as fp:
    pickle.dump(y_1, fp)
with open("y_test.txt", "wb") as fp:
    pickle.dump(y_test, fp)

In [None]:
#Loading the target vectors
'''

with open("y_1.txt", "rb") as fp:
    y_1 = pickle.load(fp)
with open("y_test.txt", "rb") as fp:
    y_test = pickle.load(fp)
    
'''

In [72]:
np.sum(y_test)

20608

In [52]:
from sklearn.tree import DecisionTreeClassifier


In [53]:
DT=DecisionTreeClassifier()
DT.fit(final_counts_X1,y_1)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [55]:
from sklearn import metrics

In [60]:
metrics.recall_score(y_test,DT.predict(final_counts_Xtest))

0.8443724312361682