<a href="https://colab.research.google.com/github/danielka223/ds_workshop/blob/master/dsWorkshopAmazon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1 align="left">Helpfulness Prediction of Amazon Product Reviews</h1> 
<h2 align="left">DS Workshop - Fall 18/19 - Tel Aviv University</h2> 
<h3 align="left">By Daniel K.A, Ido Salomon, Itamar Mutzafi and Sagi Aharoni</h3> 




# Notebook Initialization
This section should only run once on a new runtime

In [17]:
#@title Install Required Packages

!pip install wordcloud
!pip install nltk



In [0]:
#@title Import Python Libraries {display-mode: "form"}

import os
import glob
import pandas as pd
import json
import datetime

from google.colab import drive

from contextlib import contextmanager
from os.path import getsize, basename
from tqdm import tqdm

import requests
import re

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

import gzip

In [19]:
#@title Connect to Drive {display-mode: "form"}

drive.mount('/content/drive/')
            

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
#@title Infra Definitions

@contextmanager
def pbopen(filename, mode='r'):
    """Open ``filename`` and yield a line iterator that drives a tqdm progress bar.

    Usage::

        with pbopen(path) as lines:
            for line in lines:
                ...

    Args:
        filename: path of the file to read.
        mode: mode passed to ``open`` (default ``'r'``).

    Yields:
        A generator over the file's lines. Progress is reported in steps of
        roughly 1 MB, and whatever is left is flushed when iteration stops.

    The bar is closed when the lines are exhausted and, at the latest, when the
    ``with`` block exits -- including after an early ``break``, an exception,
    or when the iterator was never consumed.

    NOTE: the bar total is the file size in bytes, while ``len(line)`` counts
    characters in text mode, so the bar under-reports for multi-byte text.
    """
    total = getsize(filename)
    pb = tqdm(total=total, unit="B", unit_scale=True,
              desc=basename(filename), miniters=1,
              ncols=80, ascii=True)

    def wrapped_line_iterator(fd):
        processed_bytes = 0
        try:
            for line in fd:
                processed_bytes += len(line)
                # update progress every MB.
                if processed_bytes >= 1024 * 1024:
                    pb.update(processed_bytes)
                    processed_bytes = 0

                yield line
        finally:
            # Runs on exhaustion, on an error while reading, and when the
            # generator is closed early: flush the remainder, finish the bar.
            pb.update(processed_bytes)
            pb.close()

    try:
        with open(filename, mode) as fd:
            lines = wrapped_line_iterator(fd)
            try:
                yield lines
            finally:
                # Closing a suspended generator triggers its ``finally`` block;
                # closing an exhausted or never-started one is a no-op.
                lines.close()
    finally:
        # Covers the paths where the generator never ran (e.g. ``open`` failed
        # or the caller never iterated). Closing the bar twice is harmless.
        pb.close()
        
#decompress input folder to output folder
def ungzip(source_dir, dest_dir):
  """Decompress every ``*.gz`` file directly inside ``source_dir`` into ``dest_dir``.

  Each output file keeps the archive's base name with the trailing ``.gz``
  removed (``foo.tsv.gz`` -> ``foo.tsv``). Files without a ``.gz`` suffix are
  ignored and sub-directories are not searched.

  Args:
      source_dir: directory containing the ``.gz`` archives.
      dest_dir: directory that receives the decompressed files; it is created
          when it does not exist yet.
  """
  import shutil  # local import: shutil is not part of the notebook's import cell

  if dest_dir:
      os.makedirs(dest_dir, exist_ok=True)

  for src_name in glob.glob(os.path.join(source_dir, '*.gz')):
      base = os.path.basename(src_name)
      dest_name = os.path.join(dest_dir, base[:-3])  # strip the ".gz" suffix
      with gzip.open(src_name, 'rb') as infile:
          with open(dest_name, 'wb') as outfile:
              # Fixed-size chunked copy: iterating a binary stream "by line"
              # would pull an entire newline-free file into memory at once.
              shutil.copyfileobj(infile, outfile)

In [0]:
#@title Data Fetching

# the following script downloads the Amazon Review Dataset into google drive

# Data directory and the sub-directory that receives the downloaded .gz archives.
# Both are relative paths, so they are resolved against the runtime's current
# working directory.
root = "drive/Team Drives/DS Workshop/data/"
compressed_path = root + "compressed/"

# URLs of the per-category archives to download. Every entry is commented out,
# so the list is empty and the download loop below does nothing; uncomment the
# categories you need. Keep the trailing comma on each entry: without it Python
# silently concatenates adjacent string literals into a single bogus URL.
review_paths = [
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Wireless_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Watches_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Video_Games_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Video_DVD_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Video_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Toys_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Tools_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Sports_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Software_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Shoes_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Pet_Products_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Personal_Care_Appliances_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_PC_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Outdoors_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Music_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Mobile_Electronics_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Mobile_Apps_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Major_Appliances_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Luggage_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Lawn_and_Garden_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Jewelry_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Home_Improvement_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Home_Entertainment_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Home_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Health_Personal_Care_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Grocery_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Gift_Card_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Furniture_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Electronics_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Video_Download_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Music_Purchase_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Ebook_Purchase_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Camera_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_02.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_01.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Baby_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Automotive_v1_00.tsv.gz",
#                 "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Apparel_v1_00.tsv.gz",
               ]

# Download the archive of every selected category into compressed_path (on Drive).
# Each response is streamed to disk in 1 MB chunks, so the multi-GB archives are
# never held in memory, and a failed request raises instead of being saved as if
# it were data.
os.makedirs(compressed_path, exist_ok=True)
used_names = set()
for url in review_paths:
  print("starded downloading " + url)
  # e.g. ".../amazon_reviews_us_Books_v1_02.tsv.gz" -> ("Books", "v1_02")
  match = re.search(r'us_(.+?)_(v1[^./]*)', url)
  if match is None:
    raise ValueError("cannot derive a category name from " + url)
  category, version = match.groups()
  # Some categories are split over several archives (e.g. Books v1_00..v1_02).
  # The first archive keeps the plain category name; later parts get the version
  # appended so they no longer overwrite the earlier download.
  name = category
  if name in used_names:
    name = category + "_" + version
  used_names.add(name)
  file_path = compressed_path + name + ".tsv.gz"
  with requests.get(url, allow_redirects=True, stream=True) as r:
    r.raise_for_status()
    with open(file_path, 'wb') as archive:
      for chunk in r.iter_content(chunk_size=1024 * 1024):
        archive.write(chunk)
  print("finished downloading " + url)

#unzip datasets  
#ungzip(compressed_path, root)

# Data Collection and Integration


In [0]:
#@title Data Filters

# Colab form parameters: the "#@param" annotations render as form widgets, so
# these values are meant to be edited through the form.
include_all_categories = False #@param {type:"boolean"}
correct_bias = True #@param {type:"boolean"}
pick_category = "Digital_Ebook_Purchase"  #@param ['Apparel', 'Automotive', 'Baby', 'Beauty', 'Books', 'Books_v1_00', 'Books_v1_01', 'Camera', 'Digital_Ebook_Purchase', 'Digital_Ebook_Purchase1', 'Digital_Music_Purchase', 'Digital_Software', 'Digital_Video_Download', 'Digital_Video_Games', 'Electronics', 'Furniture', 'Gift_Card', 'Grocery', 'Health_Personal_Care', 'Home', 'Home_Entertainment', 'Home_Improvement', 'Jewelry', 'Kitchen', 'Lawn_and_Garden', 'Luggage', 'Major_Appliances', 'Mobile_Apps', 'Mobile_Electronics', 'Music', 'Musical_Instruments', 'Office_Products', 'Outdoors', 'PC', 'Personal_Care_Appliances', 'Pet_Products', 'Shoes', 'Software', 'Sports', 'Tools', 'Toys', 'Video', 'Video_DVD', 'Video_Games', 'Watches', 'Wireless']
entry_limit = 413000  #@param {type: "slider", min: 1000, max: 5000000, step:1000}
#@markdown ---
#@markdown ###Review Filters
filter_reviews_older_than = '1995-01-01'  #@param {type: "date"}

# Dataset locations: ds_path is the single file for pick_category, all_paths is
# every extracted .tsv file found directly under root.
root = "drive/Team Drives/DS Workshop/data/"
file_ext = ".tsv"
ds_path = root + pick_category + file_ext

# glob.glob (public API) replaces the undocumented glob.glob1 helper. root is
# escaped so it is matched literally, and the result is sorted so the load /
# concatenation order does not depend on directory listing order.
all_paths = sorted(glob.glob(glob.escape(root) + "*" + file_ext))
file_names = [os.path.basename(path) for path in all_paths]

In [23]:
def _read_reviews(path):
  """Read at most ``entry_limit`` rows of one tab-separated review file.

  Malformed lines are reported and skipped instead of aborting the load.

  NOTE(review): ``warn_bad_lines`` / ``error_bad_lines`` were superseded by
  ``on_bad_lines='warn'`` in pandas 1.3 and removed in pandas 2.0; switch to
  that argument if the runtime ships a newer pandas.
  """
  return pd.read_csv(path,
                     delimiter='\t',
                     warn_bad_lines=True,
                     error_bad_lines=False,
                     nrows=entry_limit)

if include_all_categories:
  if not all_paths:
    raise FileNotFoundError("no " + file_ext + " files found under " + root)
  ds_list = []
  for path in all_paths:
    print('loading ' + path)
    ds_list.append(_read_reviews(path))
  # Concatenate once after the loop: concatenating inside it re-copies every
  # frame loaded so far on each iteration.
  reviews = pd.concat(ds_list, axis = 0, ignore_index = True)

  ds_list = []  # drop the per-category frames so their memory can be reclaimed
else:
  reviews = _read_reviews(ds_path)

# reviews = reviews.apply(lambda row: isLaterThan(row['review_date'], filter_reviews_older_than))

# Keep reviews dated after the cut-off. This is a plain string comparison of
# the review_date values against the cut-off string.
reviews = reviews[reviews['review_date'] > filter_reviews_older_than]

# reviews['unhelpful_votes'] = reviews['total_votes'] - reviews['helpful_votes']

# add helpful column. A review is considered helpful if it was upvoted at least once
# reviews['helpful'] = reviews['helpful_votes'] > 0

# thumbs = pd.DataFrame(reviews.groupby('customer_id').mean())
# thumbs = thumbs[['helpful_votes','unhelpful_votes']]
# thumbs.columns = ['helpful_votes_avg','helpful_votes_avg']

# commentCount = number of reviews (in the loaded sample) written by the same customer.
commentCount = reviews.groupby('customer_id').size().rename('commentCount')
reviews = reviews.join(commentCount, on='customer_id')

print('Description: \n')
print(reviews.describe())
print('Data types: \n')
print(reviews.dtypes)
reviews.head()

# reviews.count()

# reviews.count()

Description: 

        customer_id  product_parent    star_rating  helpful_votes  \
count  4.129850e+05    4.129850e+05  412985.000000  412985.000000   
mean   2.867195e+07    4.946859e+08       4.304604       0.998329   
std    1.481085e+07    2.877486e+08       1.047085       4.573339   
min    1.034700e+04    3.474000e+03       1.000000       0.000000   
25%    1.507327e+07    2.416769e+08       4.000000       0.000000   
50%    2.755814e+07    4.920764e+08       5.000000       0.000000   
75%    4.283653e+07    7.456701e+08       5.000000       1.000000   
max    5.309646e+07    9.999985e+08       5.000000     648.000000   

         total_votes   commentCount  
count  412985.000000  412985.000000  
mean        1.390787       4.960386  
std         5.533420      11.363703  
min         0.000000       1.000000  
25%         0.000000       1.000000  
50%         0.000000       2.000000  
75%         1.000000       4.000000  
max       742.000000     241.000000  
Data types: 

marketp

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,commentCount
0,US,33605939,RGYFDX8QXKEIR,B007KO2MLO,328837464,Big Maria,Digital_Ebook_Purchase,4.0,0.0,0.0,N,N,Quirky,Elmore Leonard meets the cast of Sierra Madre....,2013-09-09,1
1,US,34058393,R13CBGTMNV9R8Z,B005FLODDE,764276359,The Woman Who Wasn't There: The True Story of ...,Digital_Ebook_Purchase,4.0,1.0,2.0,N,Y,The Woman Who Wasn't There,This book was very interesting. It is a true s...,2013-09-09,2
2,US,39601147,R7DRFHC0F71O0,B00EA3L35O,535606445,Mary had A Sleepy Sheep,Digital_Ebook_Purchase,5.0,0.0,0.0,N,N,This Sleepy Sheep rocks!,I had the opportunity to review Mary had a Sle...,2013-09-09,2
3,US,17351407,R27LUKEXU3KBXQ,B00BL3JV50,240053004,Starstruck,Digital_Ebook_Purchase,5.0,1.0,1.0,N,Y,Steamy and suspenseful!!!!!,What a great read! I really couldn't put this...,2013-09-09,3
4,US,10463387,R1VXTPUYMNU687,B00CXU7U80,931529805,The Complete Conan Saga,Digital_Ebook_Purchase,5.0,1.0,2.0,N,N,Barbarians,Barbarians need love too ! Short stories work...,2013-09-09,8


# Data Preparation and Cleaning

# Data Visualization and Analysis

In [0]:
# #wordcloud
# text = reviews['review_body']
# wordcloud = WordCloud(
#     width = 1500,
#     height = 1000,
#     background_color = 'white',
#     ).generate(str(text))
# fig = plt.figure(
#     figsize = (18, 12),
#     facecolor = 'w',
#     edgecolor = 'w')
# plt.imshow(wordcloud, interpolation = 'bilinear')
# plt.axis('off')
# plt.tight_layout(pad=0)
# plt.show()

# wordcloud = []

# Feature Selection and Engineering

In [25]:
def add_meta_features(dataset):
  """Add the derived ``helpful`` label column to ``dataset``.

  The frame is modified in place and nothing is returned.
  """
  helpfulness = ApplyHelpfulnessVector(dataset)
  dataset['helpful'] = helpfulness
  
def ApplyHelpfulnessVector(reviews):
  """Return the helpfulness label of every row of ``reviews`` as a Series.

  Vectorised equivalent of calling ``isHelpful(total_votes, helpful_votes)``
  row by row:

  *  0 -- fewer than 4 total votes (not enough votes to decide)
  *  1 -- at least 4 votes and more helpful than unhelpful votes
  * -1 -- everything else (including rows with missing vote counts)

  Args:
      reviews: DataFrame with ``total_votes`` and ``helpful_votes`` columns.

  Returns:
      Integer Series aligned with ``reviews.index``. An empty frame yields an
      empty Series.
  """
  import numpy as np  # local import: numpy is only imported in a later cell

  total = reviews['total_votes']
  helpful = reviews['helpful_votes']
  # np.select takes the first matching condition, mirroring isHelpful's
  # branches: "< 4 votes" wins, then "helpful majority", else -1.
  labels = np.select([total < 4, helpful > total - helpful], [0, 1], default=-1)
  return pd.Series(labels, index=reviews.index)
  
def isHelpful(total_votes, helpful_votes):
  """Classify one review from its vote counts.

  Returns:
      0  when the review has fewer than 4 votes in total,
      1  when it has at least 4 votes and helpful votes outnumber the rest,
      -1 in every other case.
  """
  if total_votes < 4:
      return 0
  unhelpful_votes = total_votes - helpful_votes
  return 1 if helpful_votes > unhelpful_votes else -1

# Label every review: adds the 'helpful' column (1, 0 or -1, see isHelpful)
# to `reviews` in place.
add_meta_features(reviews)

# reviews[reviews['helpful'] > 0].head(10) # debug
# Last expression of the cell, so the summary statistics (now including the
# 'helpful' column) are rendered as the cell output.
reviews.describe() #debug

Unnamed: 0,customer_id,product_parent,star_rating,helpful_votes,total_votes,commentCount,helpful
count,412985.0,412985.0,412985.0,412985.0,412985.0,412985.0,412985.0
mean,28671950.0,494685900.0,4.304604,0.998329,1.390787,4.960386,0.041864
std,14810850.0,287748600.0,1.047085,4.573339,5.53342,11.363703,0.289311
min,10347.0,3474.0,1.0,0.0,0.0,1.0,-1.0
25%,15073270.0,241676900.0,4.0,0.0,0.0,1.0,0.0
50%,27558140.0,492076400.0,5.0,0.0,0.0,2.0,0.0
75%,42836530.0,745670100.0,5.0,1.0,1.0,4.0,0.0
max,53096460.0,999998500.0,5.0,648.0,742.0,241.0,1.0


## Bias Correction

In [26]:
print(len(reviews))

# Keep only reviews with a decided label; rows labelled 0 (fewer than 4 votes)
# are dropped from here on.
df1 = reviews[reviews['helpful'] == -1]  # unhelpful reviews
df2 = reviews[reviews['helpful'] == 1]   # helpful reviews
if correct_bias:
  # Balance the classes by down-sampling whichever one is larger. Sampling a
  # class beyond its own size raises, so the target is the smaller class size.
  # The fixed seed keeps the selected sample identical across re-runs.
  class_size = min(len(df1), len(df2))
  if len(df1) > class_size:
    df1 = df1.sample(n=class_size, random_state=1)
  if len(df2) > class_size:
    df2 = df2.sample(n=class_size, random_state=1)
reviews = pd.concat([df1,df2])

# Class sizes after balancing: helpful, undecided (always 0 here), unhelpful.
print((reviews['helpful'] == 1).sum())
print((reviews['helpful'] == 0).sum())
print((reviews['helpful'] == -1).sum())

412985
9001
0
9001


# Model Training

In [0]:
# # CNN for the IMDB problem
# import numpy as np
# from keras.preprocessing.text import Tokenizer
# from keras.models import Sequential
# from keras.layers import Dense
# from keras.layers import LSTM, Convolution1D, Flatten, Dropout
# from keras.layers.convolutional import Conv1D
# from keras.layers.convolutional import MaxPooling1D
# from keras.layers.embeddings import Embedding
# from keras.preprocessing.sequence import pad_sequences
# import re
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# import pickle
# from keras.utils import np_utils
# import pandas as pd
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import KFold
# from keras.wrappers.scikit_learn import KerasClassifier


# data = reviews
# data = data.loc[:, ['review_body','helpful']]


# #Convert reviews text to lowercase
# data['text'] = data['review_body'].apply(lambda x: str(x).lower())
# #Remove all charactares appart from a-zA-z0-9 (for instance: punctuation)
# data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

# #overallDistribution = data.overall.value_counts().sort_index()
# #print(overallDistribution)

# #Vocabulary size - Most 2000 common words
# #Convert input text to integer sequences
# max_features = 5000
# tokenizer = Tokenizer(num_words=max_features, split=' ')
# tokenizer.fit_on_texts(data['text'].values)
# X = tokenizer.texts_to_sequences(data['text'].values)
# #cap the maximum review length at 500 words,
# #truncating reviews longer than that and
# # padding reviews shorter than that with 0 values.
# max_review_length = 500
# X = pad_sequences(X,  maxlen=max_review_length)

# # Using embedding from Keras
# embedding_vector_length = 32
# def baseline_model():
#     # create the model - Machine learning mastery
#     model = Sequential()
#     model.add(Embedding(max_features, embedding_vector_length, input_length=max_review_length))
#     model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
#     model.add(MaxPooling1D(pool_size=2))
#     model.add(Flatten())
#     # model.add(Dropout(0.1))
#     model.add(Dense(250, activation='relu'))
#     # model.add(Dense(1, activation='sigmoid'))
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     print(model.summary())
#     return model

# Y = data['helpful']
# # Fit the model
# epochs_num = 2
# batchSize = 128

# model = KerasClassifier(build_fn=baseline_model, epochs=epochs_num, batch_size=batchSize, verbose=0)

# nsplits = 3
# kfold = KFold(n_splits=nsplits, shuffle=True, random_state=1)
# results = cross_val_score(model, X, Y, cv=kfold)
# print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

# Non-DL Models

In [0]:
from __future__ import print_function
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import pickle
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pandas as pd
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import nltk
import string
from sklearn.model_selection import cross_validate

# Fetch the NLTK corpora used by the tokenizers defined below.
for corpus_name in ('stopwords', 'wordnet'):
  nltk.download(corpus_name)

# Keep only the review text and its label.
data = reviews.loc[:, ['review_body','helpful']]

X = data['review_body'].astype('U')  # review text, cast to unicode strings
Y = data['helpful']                  # label column

# Sanity check: the two sizes must match.
for column in (X, Y):
  print(column.size)
print(stopwords.words('english'))

def stemTokenizer(sentence):
    """Split ``sentence`` into word tokens, drop English stop words, stem the rest.

    Tokens are runs of ASCII letters, digits and apostrophes. The stop-word
    test is an exact match against NLTK's English list.

    Returns:
        list of Snowball stems, in order of appearance.
    """
    word_tokenizer = RegexpTokenizer('[a-zA-Z0-9\']+')
    stop_words = set(stopwords.words('english'))
    words = word_tokenizer.tokenize(sentence)
    stemmer = SnowballStemmer('english', ignore_stopwords=True)

    stems = []
    for word in words:
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))
    return stems

def lemmaTokenizer(sentence):
    """Split ``sentence`` into word tokens and lemmatize each one with WordNet.

    Tokens are runs of ASCII letters, digits and apostrophes. Stop words are
    kept on purpose (the filtering variant is left commented out below), so the
    stop-word list is no longer loaded on every call.

    Returns:
        list of lemmas, in order of appearance.
    """
    ret = RegexpTokenizer('[a-zA-Z0-9\']+')
    tokens = ret.tokenize(sentence)
    wnl = WordNetLemmatizer()

    # Stop-word filtering variant:
    # sw = set(stopwords.words('english'))
    # return [wnl.lemmatize(t) for t in tokens if t not in sw]
    return [wnl.lemmatize(t) for t in tokens]

  
class TextStats(BaseEstimator, TransformerMixin):
    """Stateless transformer producing one feature dict per document.

    The output is a list of ``{'length': <number of characters>}`` dicts, the
    input format expected by ``DictVectorizer``.
    """

    def fit(self, x, y=None):
        # Nothing to learn.
        return self

    def transform(self, posts):
        stats = []
        for text in posts:
            stats.append({'length': len(text)})
        return stats

class Debug(BaseEstimator, TransformerMixin):
    """Pass-through transformer that prints the data flowing through a pipeline."""

    def fit(self, x, y=None):
        # Nothing to learn.
        return self

    def transform(self, posts):
        print(posts)
        # Hand the data on unchanged. Returning ``self`` here (as before) fed
        # the transformer object itself to the next pipeline step.
        return posts

# Branch 1: binary uni/bi-gram bag-of-words of the review body, TF-IDF weighted.
bow_branch = Pipeline([
    ('vect', CountVectorizer(tokenizer = lemmaTokenizer,
                             ngram_range=(1, 2),
                             binary = True, min_df=2, max_df=0.8)),
    ('tfidf', TfidfTransformer()),
])

# Branch 2: ad hoc statistics pulled from the review body.
stats_branch = Pipeline([
    ('stats', TextStats()),  # returns a list of dicts
    ('vect', DictVectorizer()),  # list of dicts -> feature matrix
])

# Concatenate both feature sets, weighting each branch.
feature_union = FeatureUnion(
    transformer_list=[
        ('body_bow', bow_branch),
        ('body_stats', stats_branch),
    ],
    transformer_weights={
        'body_bow': 0.85,
        'body_stats': 0.15,
    },
)

pipeline = Pipeline([
    ('union', feature_union),
    # Keep the 500 features with the highest chi-squared score.
    ('chi2', SelectKBest(chi2, k=500)), #5000
    # Classifier on the selected features. Alternatives that were tried:
    #('clf', SGDClassifier( penalty='elasticnet')),
    # ('clf', LogisticRegression()),
    #('clf', RandomForestClassifier(n_estimators=100))
    ('clf', MultinomialNB(fit_prior=False)),
])

#pipeline.fit(X, Y)

from sklearn.metrics.scorer import make_scorer
from sklearn.metrics import confusion_matrix

# Label order used for every confusion matrix below: row/column 0 is label -1
# (unhelpful), row/column 1 is label 1 (helpful). Pinning the labels keeps the
# matrix 2x2 even when a fold happens to contain a single class; otherwise the
# matrix would be 1x1 and the indexing below would raise IndexError.
_CM_LABELS = [-1, 1]

def _cm(y_true, y_pred):
    """2x2 confusion matrix (rows = true label, columns = predicted label)."""
    return confusion_matrix(y_true, y_pred, labels=_CM_LABELS)

# NOTE: these four counters treat label -1 (unhelpful) as the "positive" class:
# tp = true -1 predicted -1, tn = true 1 predicted 1, and so on. The string
# scorers 'f1', 'precision' and 'recall' below use scikit-learn's default
# positive label (1), so the two groups of metrics use opposite conventions.
def tp(y_true, y_pred): return _cm(y_true, y_pred)[0, 0]
def tn(y_true, y_pred): return _cm(y_true, y_pred)[1, 1]
def fp(y_true, y_pred): return _cm(y_true, y_pred)[1, 0]
def fn(y_true, y_pred): return _cm(y_true, y_pred)[0, 1]

# Metrics collected for every cross-validation fold.
scoring = {'acc':'accuracy',
           'f1':'f1', 
           'prec':'precision', 
           'rec':'recall',
           'roc': 'roc_auc',
           'tp' : make_scorer(tp),
           'tn' : make_scorer(tn),
           'fp' : make_scorer(fp),
           'fn' : make_scorer(fn)
          }

# 3-fold cross-validation; folds are shuffled with a fixed seed so the split is
# the same on every run.
nsplits = 3
kfold = KFold(n_splits=nsplits, shuffle=True, random_state=1)
# cross_validate returns a dict; each metric in `scoring` appears under the key
# 'test_<name>' with one value per fold.
results = cross_validate(pipeline, X, Y, cv=kfold, scoring=scoring)
print(results)
#print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
18002
18002
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on'

## Flow Understanding


In [0]:


# Re-run the bag-of-words branch of the pipeline step by step to inspect it.
cnt_vectorizer = CountVectorizer(tokenizer = lemmaTokenizer,
                                 ngram_range=(1, 2),
                                 binary = True, min_df=2, max_df=0.8)

X1 = cnt_vectorizer.fit_transform(X)
# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; support both.
if hasattr(cnt_vectorizer, 'get_feature_names_out'):
  feats = np.asarray(cnt_vectorizer.get_feature_names_out())
else:
  feats = np.array(cnt_vectorizer.get_feature_names())
# Preview only a small corner: densifying the whole documents x n-grams matrix
# just to print it can exhaust the runtime's memory.
print(X1[:5, :20].toarray())
print(X1.shape)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X1)

# The 5 n-grams with the highest chi-squared score against the label.
kbest = SelectKBest(chi2, k=5)
kbest.fit(X_train_tfidf, Y)
selected = kbest.get_support()
print(feats[np.array(selected)])

# print(X_train_tfidf)
# print(X_train_tfidf.shape)
# print(X_train_tfidf[0])

# dict={}
# for doc in X_train_tfidf:
#   x=3
# dict[44]=33
# # print(dict)
# print(X_train_tfidf[0,:][0,:])

## debug

In [0]:
from sklearn.model_selection import cross_val_predict

# Sum the per-fold counts reported by cross_validate and arrange them as
# [[tn, fp], [fn, tp]].
fold_totals = {key: sum(results['test_' + key]) for key in ('tn', 'fp', 'fn', 'tp')}
conf_mat = np.array([[fold_totals['tn'], fold_totals['fp']],
                     [fold_totals['fn'], fold_totals['tp']]])

# Independent check: out-of-fold predictions for every sample.
y_pred = cross_val_predict(pipeline, X, Y, cv=kfold)
# cnf_matrix = confusion_matrix(Y, y_pred)

ones = Y[y_pred == 1]    # true labels of the samples predicted as 1
minus = Y[y_pred == -1]  # true labels of the samples predicted as -1

print(ones[ones == 1].size) # tp
# print(ones[ones == -1].size) # fp

print(minus[minus == -1].size) # tn
# print(minus[minus == 1].size) # fn


print(conf_mat)

# Model Evaluation

## Results

## Plot Confusion Matrix

In [0]:
import itertools
from matplotlib import pyplot as plt
import numpy as np

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Args:
        cm: 2-D numpy array of counts; rows are labelled as true labels and
            columns as predicted labels.
        classes: tick labels, used for both axes.
        normalize: when true, each row is divided by its sum before display.
        title: plot title.
        cmap: matplotlib colormap for the heat map.

    Draws on the current matplotlib figure; the caller creates and shows it.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()



np.set_printoptions(precision=2)

class_names = ['helpful','unhelpful']

# One figure per variant: raw counts first, then the row-normalized matrix.
for use_normalization, figure_title in (
        (False, 'Confusion matrix, without normalization'),
        (True, 'Normalized confusion matrix')):
    plt.figure()
    plot_confusion_matrix(conf_mat, classes=class_names,
                          normalize=use_normalization,
                          title=figure_title)

plt.show()