In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import re
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
%matplotlib inline




In [2]:
# Load the review data from a CSV file in the current working directory.
# NOTE(review): later cells read 'Title', 'Review' and 'Rating' columns from this
# frame; the file name suggests a 5% sample of a larger data set — confirm.
amazon = pd.read_csv('amazon_5_percent.csv')

## Processing the text once more

In [7]:
# Coerce both text columns to str so the string operations below never see a
# float. Missing values are replaced by '' first: astype('str') on its own
# turns NaN into the literal word 'nan', which would then be counted and
# tokenised like real text by every downstream feature.
amazon['Title'] = amazon['Title'].fillna('').astype('str')
amazon['Review'] = amazon['Review'].fillna('').astype('str')

# Character counts of the raw text.
amazon['Title_length'] = amazon['Title'].str.len()
amazon['Review_length'] = amazon['Review'].str.len()

In [8]:
# Running collections of every token seen in the titles / reviews.
title_word = []
review_word = []


def all_word(word, list_basket):
    """Strip non-letters from ``word``, lower-case it and collect its tokens.

    Every character outside ``a-z`` / ``A-Z`` is replaced by a space, the text
    is lower-cased and split on whitespace.

    Parameters
    ----------
    word : str
        Raw text to clean.
    list_basket : list
        Receives the resulting tokens (extended in place).

    Returns
    -------
    str
        The tokens joined by single spaces.
    """
    tokens = re.sub("[^a-zA-Z]", " ", word).lower().split()
    list_basket.extend(tokens)
    return " ".join(tokens)

# Clean every title / review; the tokens are accumulated in title_word /
# review_word as a side effect of all_word.
amazon['Title_no_punc'] = amazon['Title'].map(lambda text: all_word(text, title_word))
amazon['Review_no_punc'] = amazon['Review'].map(lambda text: all_word(text, review_word))

In [9]:
# Number of words in the punctuation-free text. The count is taken from the
# *_no_punc columns (as the target column names say) rather than from the raw
# text, where stand-alone punctuation and digits would be counted as words.
amazon['Title_sentence_wo_punc'] = amazon['Title_no_punc'].str.split().str.len()
amazon['Review_sentence_wo_punc'] = amazon['Review_no_punc'].str.split().str.len()

In [26]:
# Kept as a list because a later cell appends domain-specific words to it.
stops = list(stopwords.words("english"))

# Reset the token baskets so they only collect the stop-word-filtered tokens.
title_word = []
review_word = []


def meaningful_word_specific(word, list_basket):
    """Remove stop words from a whitespace-separated string.

    Parameters
    ----------
    word : str
        Text whose tokens are separated by whitespace.
    list_basket : list
        Receives the kept tokens (extended in place).

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    """
    # Build a set from the current contents of `stops`: membership tests are
    # O(1) instead of scanning the whole list once per token.
    stop_set = set(stops)
    meaningful_words = [w for w in word.split() if w not in stop_set]
    list_basket.extend(meaningful_words)
    return " ".join(meaningful_words)

In [27]:
# Drop stop words from the cleaned text; the kept tokens are accumulated in
# title_word / review_word as a side effect.
amazon['Title_meaningful'] = amazon['Title_no_punc'].map(
    lambda text: meaningful_word_specific(text, title_word))
amazon['Review_meaningful'] = amazon['Review_no_punc'].map(
    lambda text: meaningful_word_specific(text, review_word))

In [57]:
# Extra words treated as stop words in addition to the NLTK list. Each one is
# added only if missing, so re-running this cell does not keep growing `stops`
# with duplicates.
for extra_stop in ['book', 'product', 'movie', 'music', 'album', 'cd']:
    if extra_stop not in stops:
        stops.append(extra_stop)


def meaningful_word_specific(word, list_basket=None):
    """Remove stop words (including the extra ones above) from a string.

    Parameters
    ----------
    word : str
        Text whose tokens are separated by whitespace.
    list_basket : list, optional
        When given, receives the kept tokens (extended in place). Optional so
        that both the one-argument calls in this cell and the earlier
        two-argument calls keep working after this redefinition.

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    """
    # Set membership is O(1) per token; scanning the list was O(len(stops)).
    stop_set = set(stops)
    meaningful_words = [w for w in word.split() if w not in stop_set]
    if list_basket is not None:
        list_basket.extend(meaningful_words)
    return " ".join(meaningful_words)


# Re-filter the already filtered columns with the extended stop-word list.
amazon['Title_meaningful'] = amazon['Title_meaningful'].apply(meaningful_word_specific)
amazon['Review_meaningful'] = amazon['Review_meaningful'].apply(meaningful_word_specific)

In [58]:
# Title and review joined by one space, for each preprocessing level.
# Vectorised string concatenation gives the same values as the former
# row-by-row DataFrame.apply(axis=1) without a Python call per row.
amazon['Combination_meaningful'] = amazon['Title_meaningful'] + ' ' + amazon['Review_meaningful']
amazon['Combination_no_punc'] = amazon['Title_no_punc'] + ' ' + amazon['Review_no_punc']
amazon['Combination_all'] = amazon['Title'] + ' ' + amazon['Review']

In [59]:
def sentiment(rating):
    """Translate a rating into a sentiment label.

    Ratings equal to 1 or 2 give ``'Negative'``, a rating equal to 3 gives
    ``'Neutral'`` and every other value gives ``'Positive'``.
    """
    if rating in (1, 2):
        return 'Negative'
    return 'Neutral' if rating == 3 else 'Positive'

In [60]:
# Derive the three-class sentiment label from the rating.
amazon['Sentiment'] = amazon['Rating'].map(sentiment)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=182498, style=ProgressStyle(description_wi…




For the machine learning analysis, some models need the class labels to be of integer type.

In [72]:
def change_sentiment(word):
    """Encode a sentiment label as an integer class.

    ``'Negative'`` -> 0, ``'Neutral'`` -> 1, ``'Positive'`` -> 2.

    A value that already is one of the integer codes (0, 1, 2) is returned
    unchanged, so encoding an already encoded column is a no-op. Previously a
    second run of the encoding cell silently collapsed every label to 0.
    Any other value falls back to 0, as before.

    Parameters
    ----------
    word : str or int
        Sentiment label, or an already encoded class.

    Returns
    -------
    int
        The class code.
    """
    codes = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
    if isinstance(word, str):
        return codes.get(word, 0)
    # bool is excluded because True == 1 would otherwise pass for a class code.
    if not isinstance(word, bool) and word in (0, 1, 2):
        return int(word)
    return 0
    
# Replace the sentiment column by its encoded form (see change_sentiment).
amazon['Sentiment'] = amazon['Sentiment'].map(change_sentiment)

## Predict multi class (Positive, Neutral, Negative)

## Bag-of-Words model, classified by various models

In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, log_loss
from mord import LogisticAT

In [74]:
# Train/test splits (80/20) of the stop-word-filtered text.
# A fixed seed makes the results reproducible. Using the same seed for all
# three calls also puts the same rows into the test set for every feature
# (all columns come from the same frame), so the title / review / combination
# scores are measured on the same reviews.
SPLIT_SEED = 42
X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(
    amazon['Title_meaningful'], amazon['Sentiment'],
    test_size=0.2, random_state=SPLIT_SEED)
X_train_review, X_test_review, y_train_review, y_test_review = train_test_split(
    amazon['Review_meaningful'], amazon['Sentiment'],
    test_size=0.2, random_state=SPLIT_SEED)
X_train_combination, X_test_combination, y_train_combination, y_test_combination = train_test_split(
    amazon['Combination_meaningful'], amazon['Sentiment'],
    test_size=0.2, random_state=SPLIT_SEED)

## Testing the multiclass classifier model

In [69]:
# Candidate classifiers. The stochastic estimators get a fixed seed so that
# repeated runs of the notebook produce the same scores.
MODEL_SEED = 42

# One-vs-rest and multinomial logistic regression with balanced class weights.
model_1vR = LogisticRegression(multi_class='ovr', class_weight='balanced',
                               random_state=MODEL_SEED)
model_multi = LogisticRegression(multi_class='multinomial', solver='lbfgs',
                                 class_weight='balanced', random_state=MODEL_SEED)
# Ordinal logistic regression from `mord`, with regularisation strength alpha=0.
model_ordinal = LogisticAT(alpha=0)
model_rfc = RandomForestClassifier(n_estimators=100, random_state=MODEL_SEED)
model_NB = MultinomialNB()

# Order matters: the result lists produced later follow this order.
model_list = [model_1vR, model_multi, model_ordinal, model_rfc, model_NB]

In [70]:
def _calc_error(X, y, model):
    """Score an already fitted ``model`` on ``(X, y)``.

    Returns
    -------
    dict
        ``'report'`` (classification report text), ``'f1'`` (macro-averaged F1),
        ``'accuracy'``, ``'confusion'`` (confusion matrix) and ``'logloss'``.
    """
    predictions = model.predict(X)
    predict_proba = model.predict_proba(X)
    return {
        'report': classification_report(y, predictions),
        'f1': f1_score(y, predictions, average='macro'),
        'accuracy': accuracy_score(y, predictions),
        'confusion': confusion_matrix(y, predictions),
        # Hand over the classes the model was fitted on, so the probability
        # columns are matched to the right labels even if `y` lacks a class.
        # Falls back to log_loss's own inference when the model has no classes_.
        'logloss': log_loss(y, predict_proba, labels=getattr(model, 'classes_', None)),
    }


def calc_train_error(X_train, y_train, model):
    """Return the metric dict of a fitted ``model`` on the training data."""
    return _calc_error(X_train, y_train, model)


def calc_validation_error(X_test, y_test, model):
    """Return the metric dict of a fitted ``model`` on the held-out data."""
    return _calc_error(X_test, y_test, model)


def calc_metrics(X_train, y_train, X_test, y_test, model):
    """Fit ``model`` on the training data and score it on both data sets.

    Note that ``model`` is fitted in place.

    Returns
    -------
    tuple of (dict, dict)
        ``(train_error, validation_error)`` as produced by
        ``calc_train_error`` and ``calc_validation_error``.
    """
    model.fit(X_train, y_train)
    train_error = calc_train_error(X_train, y_train, model)
    validation_error = calc_validation_error(X_test, y_test, model)
    return train_error, validation_error

## Testing various n-gram scenarios with various features

In [75]:
gram_no_tfidf_title = []
gram_tfidf_title = []


def pipeline_no_tfidf(X_train, X_test, y_train, y_test, max_features=10000, ngram_range=(1, 1)):
    """Vectorise text as n-gram counts and evaluate every model in ``model_list``.

    The vectorizer is fitted on ``X_train`` only and then applied to ``X_test``.

    Returns
    -------
    dict
        ``{'train_error': [...], 'test_error': [...]}`` holding one
        ``calc_metrics`` result per model, in ``model_list`` order. Both lists
        are empty when ``model_list`` is empty.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    X_train = bow_transformer.fit_transform(X_train)
    X_test = bow_transformer.transform(X_test)
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


def pipeline_tfidf(X_train, X_test, y_train, y_test, max_features=10000, ngram_range=(1, 1)):
    """Same as ``pipeline_no_tfidf`` but with TF-IDF weighting of the counts.

    Both the count vectorizer and the TF-IDF transformer are fitted on
    ``X_train`` only.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(bow_transformer.fit_transform(X_train))
    X_test = tfidf_transformer.transform(bow_transformer.transform(X_test))
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


# Title text: n-gram ranges (1, 1), (1, 2) and (1, 3); one result per range.
for num in range(1, 4):
    gram_no_tfidf_title.append(pipeline_no_tfidf(
        X_train_title, X_test_title, y_train_title, y_test_title, ngram_range=(1, num)))

for num in range(1, 4):
    gram_tfidf_title.append(pipeline_tfidf(
        X_train_title, X_test_title, y_train_title, y_test_title, ngram_range=(1, num)))




In [76]:
gram_no_tfidf_review = []
gram_tfidf_review = []


def pipeline_no_tfidf(X_train, X_test, y_train, y_test, max_features=10000, ngram_range=(1, 1)):
    """Vectorise text as n-gram counts and evaluate every model in ``model_list``.

    The vectorizer is fitted on ``X_train`` only and then applied to ``X_test``.

    Returns
    -------
    dict
        ``{'train_error': [...], 'test_error': [...]}`` holding one
        ``calc_metrics`` result per model, in ``model_list`` order. Both lists
        are empty when ``model_list`` is empty.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    X_train = bow_transformer.fit_transform(X_train)
    X_test = bow_transformer.transform(X_test)
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


def pipeline_tfidf(X_train, X_test, y_train, y_test, max_features=10000, ngram_range=(1, 1)):
    """Same as ``pipeline_no_tfidf`` but with TF-IDF weighting of the counts.

    Both the count vectorizer and the TF-IDF transformer are fitted on
    ``X_train`` only.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(bow_transformer.fit_transform(X_train))
    X_test = tfidf_transformer.transform(bow_transformer.transform(X_test))
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


# Review text: n-gram ranges (1, 1), (1, 2) and (1, 3); one result per range.
for num in range(1, 4):
    gram_no_tfidf_review.append(pipeline_no_tfidf(
        X_train_review, X_test_review, y_train_review, y_test_review, ngram_range=(1, num)))

for num in range(1, 4):
    gram_tfidf_review.append(pipeline_tfidf(
        X_train_review, X_test_review, y_train_review, y_test_review, ngram_range=(1, num)))




In [77]:
gram_no_tfidf_combination = []
gram_tfidf_combination = []


def pipeline_no_tfidf(X_train, X_test, y_train, y_test, max_features=10000, ngram_range=(1, 1)):
    """Vectorise text as n-gram counts and evaluate every model in ``model_list``.

    The vectorizer is fitted on ``X_train`` only and then applied to ``X_test``.

    Returns
    -------
    dict
        ``{'train_error': [...], 'test_error': [...]}`` holding one
        ``calc_metrics`` result per model, in ``model_list`` order. Both lists
        are empty when ``model_list`` is empty.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    X_train = bow_transformer.fit_transform(X_train)
    X_test = bow_transformer.transform(X_test)
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


def pipeline_tfidf(X_train, X_test, y_train, y_test, max_features=10000, ngram_range=(1, 1)):
    """Same as ``pipeline_no_tfidf`` but with TF-IDF weighting of the counts.

    Both the count vectorizer and the TF-IDF transformer are fitted on
    ``X_train`` only.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(bow_transformer.fit_transform(X_train))
    X_test = tfidf_transformer.transform(bow_transformer.transform(X_test))
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


# Title + review text: n-gram ranges (1, 1), (1, 2) and (1, 3); one result per range.
for num in range(1, 4):
    gram_no_tfidf_combination.append(pipeline_no_tfidf(
        X_train_combination, X_test_combination, y_train_combination, y_test_combination,
        ngram_range=(1, num)))

for num in range(1, 4):
    gram_tfidf_combination.append(pipeline_tfidf(
        X_train_combination, X_test_combination, y_train_combination, y_test_combination,
        ngram_range=(1, num)))
    




In [78]:
import pickle

# Persist the n-gram experiment results. Each file is written inside a `with`
# block so the handle is flushed and closed deterministically; the previous
# bare open(...) calls never closed their files.
for filename, results in [
    ('gram_no_tfidf_review_sentiment_multi_normal_5.sav', gram_no_tfidf_review),
    ('gram_tfidf_review_sentiment_multi_normal_5.sav', gram_tfidf_review),
    ('gram_no_tfidf_title_sentiment_multi_normal_5.sav', gram_no_tfidf_title),
    ('gram_tfidf_title_sentiment_multi_normal_5.sav', gram_tfidf_title),
    ('gram_no_tfidf_combination_sentiment_multi_normal_5.sav', gram_no_tfidf_combination),
    ('gram_tfidf_combination_sentiment_multi_normal_5.sav', gram_tfidf_combination),
]:
    with open(filename, 'wb') as handle:
        pickle.dump(results, handle)

## Testing various max-feature scenarios

In [79]:
feature_no_tfidf_title = []
feature_tfidf_title = []


def pipeline_no_tfidf(X_train, X_test, y_train, y_test, max_features=10000, ngram_range=(1, 2)):
    """Vectorise text as n-gram counts and evaluate every model in ``model_list``.

    The vectorizer is fitted on ``X_train`` only and then applied to ``X_test``.

    Returns
    -------
    dict
        ``{'train_error': [...], 'test_error': [...]}`` holding one
        ``calc_metrics`` result per model, in ``model_list`` order. Both lists
        are empty when ``model_list`` is empty.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    X_train = bow_transformer.fit_transform(X_train)
    X_test = bow_transformer.transform(X_test)
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


def pipeline_tfidf(X_train, X_test, y_train, y_test, max_features=10000, ngram_range=(1, 2)):
    """Same as ``pipeline_no_tfidf`` but with TF-IDF weighting of the counts.

    Both the count vectorizer and the TF-IDF transformer are fitted on
    ``X_train`` only.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(bow_transformer.fit_transform(X_train))
    X_test = tfidf_transformer.transform(bow_transformer.transform(X_test))
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


# Title text with uni- and bigrams: one result per vocabulary size.
for num in [10000, 15000, 20000, 25000, 30000]:
    feature_no_tfidf_title.append(pipeline_no_tfidf(
        X_train_title, X_test_title, y_train_title, y_test_title, max_features=num))

for num in [10000, 15000, 20000, 25000, 30000]:
    feature_tfidf_title.append(pipeline_tfidf(
        X_train_title, X_test_title, y_train_title, y_test_title, max_features=num))




In [80]:
feature_no_tfidf_review = []
feature_tfidf_review = []


def pipeline_no_tfidf(X_train, X_test, y_train, y_test, max_features=10000, ngram_range=(1, 2)):
    """Vectorise text as n-gram counts and evaluate every model in ``model_list``.

    The vectorizer is fitted on ``X_train`` only and then applied to ``X_test``.

    Returns
    -------
    dict
        ``{'train_error': [...], 'test_error': [...]}`` holding one
        ``calc_metrics`` result per model, in ``model_list`` order. Both lists
        are empty when ``model_list`` is empty.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    X_train = bow_transformer.fit_transform(X_train)
    X_test = bow_transformer.transform(X_test)
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


def pipeline_tfidf(X_train, X_test, y_train, y_test, max_features=10000, ngram_range=(1, 2)):
    """Same as ``pipeline_no_tfidf`` but with TF-IDF weighting of the counts.

    Both the count vectorizer and the TF-IDF transformer are fitted on
    ``X_train`` only.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(bow_transformer.fit_transform(X_train))
    X_test = tfidf_transformer.transform(bow_transformer.transform(X_test))
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


# Review text with uni- and bigrams: one result per vocabulary size.
for num in [10000, 15000, 20000, 25000, 30000]:
    feature_no_tfidf_review.append(pipeline_no_tfidf(
        X_train_review, X_test_review, y_train_review, y_test_review, max_features=num))

for num in [10000, 15000, 20000, 25000, 30000]:
    feature_tfidf_review.append(pipeline_tfidf(
        X_train_review, X_test_review, y_train_review, y_test_review, max_features=num))




In [81]:
feature_no_tfidf_combination = []
feature_tfidf_combination = []


def pipeline_no_tfidf(X_train, X_test, y_train, y_test, max_features=10000, ngram_range=(1, 2)):
    """Vectorise text as n-gram counts and evaluate every model in ``model_list``.

    The vectorizer is fitted on ``X_train`` only and then applied to ``X_test``.

    Returns
    -------
    dict
        ``{'train_error': [...], 'test_error': [...]}`` holding one
        ``calc_metrics`` result per model, in ``model_list`` order. Both lists
        are empty when ``model_list`` is empty.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    X_train = bow_transformer.fit_transform(X_train)
    X_test = bow_transformer.transform(X_test)
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


def pipeline_tfidf(X_train, X_test, y_train, y_test, max_features=10000, ngram_range=(1, 2)):
    """Same as ``pipeline_no_tfidf`` but with TF-IDF weighting of the counts.

    Both the count vectorizer and the TF-IDF transformer are fitted on
    ``X_train`` only.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(bow_transformer.fit_transform(X_train))
    X_test = tfidf_transformer.transform(bow_transformer.transform(X_test))
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


# Title + review text with uni- and bigrams: one result per vocabulary size.
for num in [10000, 15000, 20000, 25000, 30000]:
    feature_no_tfidf_combination.append(pipeline_no_tfidf(
        X_train_combination, X_test_combination, y_train_combination, y_test_combination,
        max_features=num))

for num in [10000, 15000, 20000, 25000, 30000]:
    feature_tfidf_combination.append(pipeline_tfidf(
        X_train_combination, X_test_combination, y_train_combination, y_test_combination,
        max_features=num))




In [82]:
import pickle

# Persist the max-features experiment results. Each file is written inside a
# `with` block so the handle is flushed and closed deterministically; the
# previous bare open(...) calls never closed their files.
for filename, results in [
    ('feature_no_tfidf_review_sentiment_multi_normal_5.sav', feature_no_tfidf_review),
    ('feature_tfidf_review_sentiment_multi_normal_5.sav', feature_tfidf_review),
    ('feature_no_tfidf_title_sentiment_multi_normal_5.sav', feature_no_tfidf_title),
    ('feature_tfidf_title_sentiment_multi_normal_5.sav', feature_tfidf_title),
    ('feature_no_tfidf_combination_sentiment_multi_normal_5.sav', feature_no_tfidf_combination),
    ('feature_tfidf_combination_sentiment_multi_normal_5.sav', feature_tfidf_combination),
]:
    with open(filename, 'wb') as handle:
        pickle.dump(results, handle)

In [83]:
# Train/test splits (80/20) of the punctuation-free text (stop words kept).
# A fixed seed makes the results reproducible. Using the same seed for all
# three calls also puts the same rows into the test set for every feature
# (all columns come from the same frame), so the title / review / combination
# scores are measured on the same reviews.
SPLIT_SEED = 42
X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(
    amazon['Title_no_punc'], amazon['Sentiment'],
    test_size=0.2, random_state=SPLIT_SEED)
X_train_review, X_test_review, y_train_review, y_test_review = train_test_split(
    amazon['Review_no_punc'], amazon['Sentiment'],
    test_size=0.2, random_state=SPLIT_SEED)
X_train_combination, X_test_combination, y_train_combination, y_test_combination = train_test_split(
    amazon['Combination_no_punc'], amazon['Sentiment'],
    test_size=0.2, random_state=SPLIT_SEED)

In [84]:
# NOTE: the gram_* names are reused here; they now hold the results for the
# current train/test split with fixed vectorizer settings.
gram_no_tfidf_title = []
gram_tfidf_title = []


def pipeline_no_tfidf(X_train, X_test, y_train, y_test, max_features=15000, ngram_range=(1, 2)):
    """Vectorise text as n-gram counts and evaluate every model in ``model_list``.

    The vectorizer is fitted on ``X_train`` only and then applied to ``X_test``.
    The defaults reproduce the settings this cell used before they became
    parameters.

    Returns
    -------
    dict
        ``{'train_error': [...], 'test_error': [...]}`` holding one
        ``calc_metrics`` result per model, in ``model_list`` order. Both lists
        are empty when ``model_list`` is empty.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    X_train = bow_transformer.fit_transform(X_train)
    X_test = bow_transformer.transform(X_test)
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


def pipeline_tfidf(X_train, X_test, y_train, y_test, max_features=15000, ngram_range=(1, 2)):
    """Same as ``pipeline_no_tfidf`` but with TF-IDF weighting of the counts.

    Both the count vectorizer and the TF-IDF transformer are fitted on
    ``X_train`` only.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(bow_transformer.fit_transform(X_train))
    X_test = tfidf_transformer.transform(bow_transformer.transform(X_test))
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


gram_no_tfidf_title.append(pipeline_no_tfidf(X_train_title, X_test_title, y_train_title, y_test_title))
gram_tfidf_title.append(pipeline_tfidf(X_train_title, X_test_title, y_train_title, y_test_title))




In [85]:
# NOTE: the gram_* names are reused here; they now hold the results for the
# current train/test split with fixed vectorizer settings.
gram_no_tfidf_review = []
gram_tfidf_review = []


def pipeline_no_tfidf(X_train, X_test, y_train, y_test, max_features=15000, ngram_range=(1, 2)):
    """Vectorise text as n-gram counts and evaluate every model in ``model_list``.

    The vectorizer is fitted on ``X_train`` only and then applied to ``X_test``.
    The defaults reproduce the settings this cell used before they became
    parameters.

    Returns
    -------
    dict
        ``{'train_error': [...], 'test_error': [...]}`` holding one
        ``calc_metrics`` result per model, in ``model_list`` order. Both lists
        are empty when ``model_list`` is empty.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    X_train = bow_transformer.fit_transform(X_train)
    X_test = bow_transformer.transform(X_test)
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


def pipeline_tfidf(X_train, X_test, y_train, y_test, max_features=15000, ngram_range=(1, 2)):
    """Same as ``pipeline_no_tfidf`` but with TF-IDF weighting of the counts.

    Both the count vectorizer and the TF-IDF transformer are fitted on
    ``X_train`` only.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(bow_transformer.fit_transform(X_train))
    X_test = tfidf_transformer.transform(bow_transformer.transform(X_test))
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


gram_no_tfidf_review.append(pipeline_no_tfidf(X_train_review, X_test_review, y_train_review, y_test_review))
gram_tfidf_review.append(pipeline_tfidf(X_train_review, X_test_review, y_train_review, y_test_review))




In [86]:
# NOTE: the gram_* names are reused here; they now hold the results for the
# current train/test split with fixed vectorizer settings.
gram_no_tfidf_combination = []
gram_tfidf_combination = []


def pipeline_no_tfidf(X_train, X_test, y_train, y_test, max_features=15000, ngram_range=(1, 2)):
    """Vectorise text as n-gram counts and evaluate every model in ``model_list``.

    The vectorizer is fitted on ``X_train`` only and then applied to ``X_test``.
    The defaults reproduce the settings this cell used before they became
    parameters.

    Returns
    -------
    dict
        ``{'train_error': [...], 'test_error': [...]}`` holding one
        ``calc_metrics`` result per model, in ``model_list`` order. Both lists
        are empty when ``model_list`` is empty.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    X_train = bow_transformer.fit_transform(X_train)
    X_test = bow_transformer.transform(X_test)
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


def pipeline_tfidf(X_train, X_test, y_train, y_test, max_features=15000, ngram_range=(1, 2)):
    """Same as ``pipeline_no_tfidf`` but with TF-IDF weighting of the counts.

    Both the count vectorizer and the TF-IDF transformer are fitted on
    ``X_train`` only.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(bow_transformer.fit_transform(X_train))
    X_test = tfidf_transformer.transform(bow_transformer.transform(X_test))
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


gram_no_tfidf_combination.append(pipeline_no_tfidf(
    X_train_combination, X_test_combination, y_train_combination, y_test_combination))
gram_tfidf_combination.append(pipeline_tfidf(
    X_train_combination, X_test_combination, y_train_combination, y_test_combination))




In [87]:
import pickle

# Persist the results obtained on the punctuation-free text. Each file is
# written inside a `with` block so the handle is flushed and closed
# deterministically; the previous bare open(...) calls never closed their files.
for filename, results in [
    ('no_punc_no_tfidf_review_sentiment_multi_normal_5.sav', gram_no_tfidf_review),
    ('no_punc_tfidf_review_sentiment_multi_normal_5.sav', gram_tfidf_review),
    ('no_punc_no_tfidf_title_sentiment_multi_normal_5.sav', gram_no_tfidf_title),
    ('no_punc_tfidf_title_sentiment_multi_normal_5.sav', gram_tfidf_title),
    ('no_punc_no_tfidf_combination_sentiment_multi_normal_5.sav', gram_no_tfidf_combination),
    ('no_punc_tfidf_combination_sentiment_multi_normal_5.sav', gram_tfidf_combination),
]:
    with open(filename, 'wb') as handle:
        pickle.dump(results, handle)

In [88]:
# Train/test splits (80/20) of the raw, unprocessed text.
# A fixed seed makes the results reproducible. Using the same seed for all
# three calls also puts the same rows into the test set for every feature
# (all columns come from the same frame), so the title / review / combination
# scores are measured on the same reviews.
SPLIT_SEED = 42
X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(
    amazon['Title'], amazon['Sentiment'],
    test_size=0.2, random_state=SPLIT_SEED)
X_train_review, X_test_review, y_train_review, y_test_review = train_test_split(
    amazon['Review'], amazon['Sentiment'],
    test_size=0.2, random_state=SPLIT_SEED)
X_train_combination, X_test_combination, y_train_combination, y_test_combination = train_test_split(
    amazon['Combination_all'], amazon['Sentiment'],
    test_size=0.2, random_state=SPLIT_SEED)

In [89]:
# NOTE: the gram_* names are reused here; they now hold the results for the
# current train/test split with fixed vectorizer settings.
gram_no_tfidf_title = []
gram_tfidf_title = []


def pipeline_no_tfidf(X_train, X_test, y_train, y_test, max_features=15000, ngram_range=(1, 2)):
    """Vectorise text as n-gram counts and evaluate every model in ``model_list``.

    The vectorizer is fitted on ``X_train`` only and then applied to ``X_test``.
    The defaults reproduce the settings this cell used before they became
    parameters.

    Returns
    -------
    dict
        ``{'train_error': [...], 'test_error': [...]}`` holding one
        ``calc_metrics`` result per model, in ``model_list`` order. Both lists
        are empty when ``model_list`` is empty.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    X_train = bow_transformer.fit_transform(X_train)
    X_test = bow_transformer.transform(X_test)
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


def pipeline_tfidf(X_train, X_test, y_train, y_test, max_features=15000, ngram_range=(1, 2)):
    """Same as ``pipeline_no_tfidf`` but with TF-IDF weighting of the counts.

    Both the count vectorizer and the TF-IDF transformer are fitted on
    ``X_train`` only.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(bow_transformer.fit_transform(X_train))
    X_test = tfidf_transformer.transform(bow_transformer.transform(X_test))
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


gram_no_tfidf_title.append(pipeline_no_tfidf(X_train_title, X_test_title, y_train_title, y_test_title))
gram_tfidf_title.append(pipeline_tfidf(X_train_title, X_test_title, y_train_title, y_test_title))




In [90]:
# NOTE: the gram_* names are reused here; they now hold the results for the
# current train/test split with fixed vectorizer settings.
gram_no_tfidf_review = []
gram_tfidf_review = []


def pipeline_no_tfidf(X_train, X_test, y_train, y_test, max_features=15000, ngram_range=(1, 2)):
    """Vectorise text as n-gram counts and evaluate every model in ``model_list``.

    The vectorizer is fitted on ``X_train`` only and then applied to ``X_test``.
    The defaults reproduce the settings this cell used before they became
    parameters.

    Returns
    -------
    dict
        ``{'train_error': [...], 'test_error': [...]}`` holding one
        ``calc_metrics`` result per model, in ``model_list`` order. Both lists
        are empty when ``model_list`` is empty.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    X_train = bow_transformer.fit_transform(X_train)
    X_test = bow_transformer.transform(X_test)
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


def pipeline_tfidf(X_train, X_test, y_train, y_test, max_features=15000, ngram_range=(1, 2)):
    """Same as ``pipeline_no_tfidf`` but with TF-IDF weighting of the counts.

    Both the count vectorizer and the TF-IDF transformer are fitted on
    ``X_train`` only.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(bow_transformer.fit_transform(X_train))
    X_test = tfidf_transformer.transform(bow_transformer.transform(X_test))
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


gram_no_tfidf_review.append(pipeline_no_tfidf(X_train_review, X_test_review, y_train_review, y_test_review))
gram_tfidf_review.append(pipeline_tfidf(X_train_review, X_test_review, y_train_review, y_test_review))




In [91]:
# NOTE: the gram_* names are reused here; they now hold the results for the
# current train/test split with fixed vectorizer settings.
gram_no_tfidf_combination = []
gram_tfidf_combination = []


def pipeline_no_tfidf(X_train, X_test, y_train, y_test, max_features=15000, ngram_range=(1, 2)):
    """Vectorise text as n-gram counts and evaluate every model in ``model_list``.

    The vectorizer is fitted on ``X_train`` only and then applied to ``X_test``.
    The defaults reproduce the settings this cell used before they became
    parameters.

    Returns
    -------
    dict
        ``{'train_error': [...], 'test_error': [...]}`` holding one
        ``calc_metrics`` result per model, in ``model_list`` order. Both lists
        are empty when ``model_list`` is empty.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    X_train = bow_transformer.fit_transform(X_train)
    X_test = bow_transformer.transform(X_test)
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


def pipeline_tfidf(X_train, X_test, y_train, y_test, max_features=15000, ngram_range=(1, 2)):
    """Same as ``pipeline_no_tfidf`` but with TF-IDF weighting of the counts.

    Both the count vectorizer and the TF-IDF transformer are fitted on
    ``X_train`` only.
    """
    bow_transformer = CountVectorizer(analyzer='word', max_features=max_features,
                                      ngram_range=ngram_range)
    tfidf_transformer = TfidfTransformer()
    X_train = tfidf_transformer.fit_transform(bow_transformer.fit_transform(X_train))
    X_test = tfidf_transformer.transform(bow_transformer.transform(X_test))
    results = [calc_metrics(X_train, y_train, X_test, y_test, model) for model in model_list]
    return {'train_error': [train for train, _ in results],
            'test_error': [test for _, test in results]}


gram_no_tfidf_combination.append(pipeline_no_tfidf(
    X_train_combination, X_test_combination, y_train_combination, y_test_combination))
gram_tfidf_combination.append(pipeline_tfidf(
    X_train_combination, X_test_combination, y_train_combination, y_test_combination))




In [92]:
import pickle

# Persist the results obtained on the raw text. Each file is written inside a
# `with` block so the handle is flushed and closed deterministically; the
# previous bare open(...) calls never closed their files.
for filename, results in [
    ('all_no_tfidf_review_sentiment_multi_normal_5.sav', gram_no_tfidf_review),
    ('all_tfidf_review_sentiment_multi_normal_5.sav', gram_tfidf_review),
    ('all_no_tfidf_title_sentiment_multi_normal_5.sav', gram_no_tfidf_title),
    ('all_tfidf_title_sentiment_multi_normal_5.sav', gram_tfidf_title),
    ('all_no_tfidf_combination_sentiment_multi_normal_5.sav', gram_no_tfidf_combination),
    ('all_tfidf_combination_sentiment_multi_normal_5.sav', gram_tfidf_combination),
]:
    with open(filename, 'wb') as handle:
        pickle.dump(results, handle)