# Project Objective

The main objective is to perform Sentiment Analysis on customer reviews on a Brazilian E-Commerce platform using Natural Language Processing ('NLP'), to determine if the reviews are positive or negative overall.

# Import Libraries

In [3]:
# Import essential libraries
import os
import time
import json
import random
import pandas as pd
import numpy as np
import re
import pickle
from datetime import datetime

# For visualization
# import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Import nltk libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download(['punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'])

# Import SQLite libraries
from sqlalchemy import create_engine

# Import sklearn libraries
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neural_network import MLPClassifier

# Import PyTorch libraries
import torch, torchvision
print(torch.__version__) # Check PyTorch version

# PyTorch to use CPU instead of GPU
torch.cuda.is_available = lambda : False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

ImportError: cannot import name 'ft2font' from partially initialized module 'matplotlib' (most likely due to a circular import) (C:\Users\fucheng.liew\Anaconda3\lib\site-packages\matplotlib\__init__.py)

# Settings

In [None]:
# To view all generated results
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Widen the Jupyter Notebook as much as possible
from IPython.core.display import display, HTML
display(HTML('<style>.container { width:100% !important; }</style>'))

# Import Data

***The Brazilian E-Commerce Public Dataset by [Olist](https://olist.com/pt-br/)***: https://www.kaggle.com/olistbr/brazilian-ecommerce<br>
The dataset contains information of 100k orders from 2016 to 2018 such as order status, price, payment and freight performance to customer location, product attributes, etc.<br><br>
The focal point here would be the **reviews written by the customers**.<br>
Once the customer receives the product, or when the estimated delivery date is due (whether customer receives the product or not), the customer gets a satisfaction survey by email to describe the purchase experience and write down some comments.

In [None]:
# Setting the directories
cd = os.getcwd()
data_dir = cd + '\\data'

# Import data
review_df = pd.read_csv(data_dir + '\\olist_order_reviews_dataset.csv')
review_df.head()

In [None]:
# View dataset info
review_df.info()

In [None]:
# View any missing values
review_df.isnull().sum()

In [None]:
# Import other key features such as product category and location to discover more about the reviews
order_df = pd.read_csv(data_dir + '\\olist_orders_dataset.csv') # Includes data on all the orders made
order_item_df = pd.read_csv(data_dir + '\\olist_order_items_dataset.csv') # Includes data about the items purchased within each order
prod_df = pd.read_csv(data_dir + '\\olist_products_dataset.csv') # Includes data about the products sold by Olist
prod_trans_df = pd.read_csv(data_dir + '\\product_category_name_translation.csv') # Translates the productcategoryname to english.
cust_df = pd.read_csv(data_dir + '\\olist_customers_dataset.csv') # Includes data about the customer and its location

# Show samples, dataset info and missing values info
df_list = [order_df, order_item_df, prod_df, prod_trans_df, cust_df]
for df_ in df_list:
    # Show samples
    print(df_.head())
    print('\n')
    
    # Show dataset info
    print(df_.info())
    print('\n')
    
    # Show missing values info
    print(df_.isnull().sum())
    print('\n')

# Data Cleaning

In [None]:
# Keep key columns
df = review_df[['review_id', 'order_id', 'review_score', 'review_comment_title', 'review_comment_message']]

# Remove missing reviews from review dataset
df = df[~df['review_comment_message'].isnull()]
len(df) # Around 41k reviews remain

In [None]:
# Check for any duplicates
len(df[df.duplicated()])

In [None]:
# Check for any duplicates based on review_id
len(df[df.duplicated(subset='review_id')])

In [None]:
# View samples of duplicated review_id
df[df.duplicated(subset='review_id')].head(10)

In [None]:
# View dataset of first 2 samples
df[df['review_id'].isin(['3242cc306a9218d0377831e175d62fbf', '308316408775d1600dad81bd3184556d'])]

# The review_id has duplicates due to the same review for multiple products ordered

In [None]:
# Check for any duplicates based on review_id and review_comment_message
len(df[df.duplicated(subset=['review_id', 'review_comment_message'])])

# This duplicate count here is same as that based on review_id, which needs to be removed

In [None]:
# Remove duplicates
df.drop_duplicates(subset=['review_id', 'review_comment_message'], inplace=True)
len(df)

In [None]:
# View data
df.head()

In [None]:
# Filter key columns
order_df = order_df[['order_id', 'customer_id', 'order_delivered_customer_date', 'order_estimated_delivery_date']]
order_item_df = order_item_df[['order_id', 'product_id']]
prod_df = prod_df[['product_id', 'product_category_name']]
cust_df = cust_df[['customer_id', 'customer_city', 'customer_state']]

# List each dataset and its key ID for data merging
df_list_updated = [order_df, order_item_df, prod_df, prod_trans_df, cust_df]
df_key_id_list = ['order_id', 'order_id', 'product_id', 'product_category_name', 'customer_id']

# Remove duplicates and merge with other datasets to extract key features
print(len(df)) # Get dataset row count before merging to check for any duplicates

i = 0 # Set counter to loop through the list
for df_ in df_list_updated:
    
    # Get respective key id from df_key_id_list
    id_ = df_key_id_list[i]
    print(id_)
    
    # Remove duplicates before merging
    print('Merge with df using ' + id_ + ':')
    print('\n')
    df_.drop_duplicates(subset=[id_], inplace=True)
    df = df.merge(df_, on=id_, how='left')
    
    i += 1 # Update counter
    
print(len(df)) # The dataset row count remain unchanged after merging, dataset shape remained intact

# View data
df.head()

In [None]:
# View any missing values
df.isnull().sum()

# Data Exploration

In [None]:
# Determine if the delivery was late or never arrived (missing order_delivered_customer_date is assumed to never arrive)
df[['order_estimated_delivery_date', 'order_delivered_customer_date']] = df[['order_estimated_delivery_date', 'order_delivered_customer_date']].astype(np.datetime64)
df['delivery_late_ind'] = 0
df.loc[(df['order_delivered_customer_date'].isnull()) | 
       (df['order_delivered_customer_date'] > df['order_estimated_delivery_date'] + pd.Timedelta(days=1)), 'delivery_late_ind'] = 1

# View distribution
print(df['delivery_late_ind'].value_counts().sort_index())

# View data samples
df[df['delivery_late_ind'] > 0].head() 

# First review message is all capitalized, expressing dissatisfaction
# However, only a small % of reviews are due to late deliveries, may not be a good feature in distinguishing positive / negative sentiments

In [None]:
# Determine if entire review_comment_message is capitalized
df['capitalize_ind'] = 0
df.loc[df['review_comment_message'] == df['review_comment_message'].str.upper(), 'capitalize_ind'] = 1

# View distribution
print(df['capitalize_ind'].value_counts().sort_index())

# View data samples
df[df['capitalize_ind'] > 0].head()

# Only a small % of reviews are fully capitalized, may not be a good feature in distinguishing positive / negative sentiments

In [None]:
# Determine length of review_comment_message
df['review_message_length'] = df['review_comment_message'].apply(lambda x: len(x.split()))

# View distribution
print(df['review_message_length'].value_counts().sort_index())

# View data samples
df[df['review_message_length'] <= 3].head()

View breakdown of attributes and the review score given.

In [None]:
attribute_list = ['product_category_name_english', 'customer_city', 'customer_state', 
                  'delivery_late_ind', 'review_message_length', 'capitalize_ind']

for att in attribute_list:
    # Generate the average score
    avg_score_table = pd.pivot_table(df, index=att, values='review_score', aggfunc=np.mean, fill_value=0).reset_index()
    
    # Get the distribution of scores,
    score_dist_table = pd.pivot_table(df, index=att, columns='review_score', values='review_id', aggfunc='count', fill_value=0).reset_index()
    score_count_total = pd.pivot_table(df, index=att, values='review_id', aggfunc='count', fill_value=0).reset_index()
    score_dist_table = score_dist_table.merge( score_count_total, on=att, how='left')
    score_dist_table = score_dist_table.set_index(att)
    score_dist_perc_table = score_dist_table.div(score_dist_table.iloc[:,-1], axis=0)
    
    # Merge the data together
    combined_table = score_dist_perc_table.merge(avg_score_table, on=att, how='left')
    
    # Show table
    print(att)
    print(combined_table)
    
# From a quick glance, the review score tends to be higher if:
# a. Review message is shorter
# b. Delivery is on time or earlier

# Given that the focus is on the web app, only input would be the review text message itself
# Thus, the city and product category will not be focused
# However, these features will help enhance our understanding of the data

# Data Preparation

This section is focused on preparing the dataset based on the content of the comments itself, along with other relevant attributes disclosed above.

In [None]:
# Create the tokenizer function to extract tokens for Tfidf transformer after this
def tokenizer(text):
    # Remove punctuations 
    detected_punctuations = re.findall('[^a-zA-Z0-9]', text)
    for punctuation in detected_punctuations:
        text = text.replace(punctuation, ' ')
        
    # Remove words with single letters
    text = ' '.join([w for w in text.split() if len(w) > 1])
        
    # Tokenize the words
    tokens = word_tokenize(text)    
    
    # Lemmanitizer to reduce words to its stems
    lemmatizer = WordNetLemmatizer()

    # List of clean tokens
    clean_tokens = [lemmatizer.lemmatize(w).lower().strip() for w in tokens]
    
    # Remove stopwords in Portugese
    por_stopwords = stopwords.words('portuguese')
    for st in por_stopwords:
        if st in clean_tokens:
            clean_tokens.remove(st)
            
    return clean_tokens

Creating the response variable of whether the review is positive or negative based on review score with:<br>
positive_review_ind = 1 if score is 4 or 5; else = 0<br><br>
This assumption is necessary for supervised learning to enable the model to learn if it's a positive review or not.<br>
Due to time constraint, each review_comment_message is not manually assessed to label it as positive or not.<br>
However, this assumption is reasonable as customers would tend to write positive review given a high score (4 or 5).

In [None]:
# Creating the response variable
df['positive_review_ind'] = 0
df.loc[df['review_score'] >= 4, 'positive_review_ind'] = 1

# View distribution
print(df['review_score'].value_counts().sort_index())
df['positive_review_ind'].value_counts().sort_index()

# Model Training

In [None]:
# Create the predictor and response datasets
X = df['review_comment_message']#, 'product_category_name_english', 'customer_city', 'delivery_late_ind', 'review_message_length', 'capitalize_ind']]
y = df['positive_review_ind']

# Separate into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

In [None]:
# Based on attributes observed earlier, longer comments tend to associate with negative sentiments (lower review score)
class GetReviewLength(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_len = pd.Series(X).apply(lambda x: len(x.split()))
        return pd.DataFrame(X_len)

In [None]:
# Create ML pipeline - Random Forest
pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenizer)),
                ('tfidf', TfidfTransformer())
            ])),

            ('review_length', GetReviewLength())
        ])),

        ('clf', RandomForestClassifier(n_jobs=-1, verbose=2)) # Use all processors
    ])

# Train the model using grid search
pipeline.fit(X_train, y_train)

In [26]:
# Create ML pipeline - AdaBoost
pipeline2 = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenizer)),
                ('tfidf', TfidfTransformer())
            ])),

            ('review_length', GetReviewLength())
        ])),

        ('clf', AdaBoostClassifier())
    ])

# Train the model using grid search
pipeline2.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('text_pipeline',
                                                 Pipeline(steps=[('vect',
                                                                  CountVectorizer(tokenizer=<function tokenizer at 0x0000021351B8D8B0>)),
                                                                 ('tfidf',
                                                                  TfidfTransformer())])),
                                                ('review_length',
                                                 GetReviewLength())])),
                ('clf', AdaBoostClassifier())])

In [28]:
# Create ML pipeline - Gradient Boosting
pipeline3 = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenizer)),
                ('tfidf', TfidfTransformer())
            ])),

            ('review_length', GetReviewLength())
        ])),

        ('clf', GradientBoostingClassifier(verbose=2))
    ])

# Train the model using grid search
pipeline3.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           1.2512           52.10s
         2           1.2102           51.36s
         3           1.1782           46.57s
         4           1.1496           44.63s
         5           1.1250           42.66s
         6           1.1051           41.51s
         7           1.0866           40.15s
         8           1.0714           39.10s
         9           1.0554           38.32s
        10           1.0410           38.05s
        11           1.0289           37.21s
        12           1.0157           36.56s
        13           1.0062           35.83s
        14           0.9963           35.24s
        15           0.9851           34.73s
        16           0.9753           34.00s
        17           0.9670           33.50s
        18           0.9598           33.03s
        19           0.9521           32.53s
        20           0.9459           32.01s
        21           0.9395           31.45s
        2

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('text_pipeline',
                                                 Pipeline(steps=[('vect',
                                                                  CountVectorizer(tokenizer=<function tokenizer at 0x0000021351B8D8B0>)),
                                                                 ('tfidf',
                                                                  TfidfTransformer())])),
                                                ('review_length',
                                                 GetReviewLength())])),
                ('clf', GradientBoostingClassifier(verbose=2))])

In [33]:
# Create ML pipeline - Neural Network
pipeline4 = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenizer)),
                ('tfidf', TfidfTransformer())
            ])),

            ('review_length', GetReviewLength())
        ])),

        ('clf', MLPClassifier(verbose=True))
    ])

# Train the model using grid search
pipeline4.fit(X_train, y_train)

Iteration 1, loss = 0.46827144
Iteration 2, loss = 0.29535110
Iteration 3, loss = 0.25586749
Iteration 4, loss = 0.23336080
Iteration 5, loss = 0.21872221
Iteration 6, loss = 0.20711621
Iteration 7, loss = 0.19723907
Iteration 8, loss = 0.18996318
Iteration 9, loss = 0.18353546
Iteration 10, loss = 0.17786924
Iteration 11, loss = 0.17286173
Iteration 12, loss = 0.17016895
Iteration 13, loss = 0.16646619
Iteration 14, loss = 0.16284918
Iteration 15, loss = 0.15871599
Iteration 16, loss = 0.15720155
Iteration 17, loss = 0.15458850
Iteration 18, loss = 0.15201395
Iteration 19, loss = 0.14917835
Iteration 20, loss = 0.14775034
Iteration 21, loss = 0.14512442
Iteration 22, loss = 0.14341034
Iteration 23, loss = 0.14214968
Iteration 24, loss = 0.13979895
Iteration 25, loss = 0.13792400
Iteration 26, loss = 0.13625243
Iteration 27, loss = 0.13441570
Iteration 28, loss = 0.13277934
Iteration 29, loss = 0.12986178
Iteration 30, loss = 0.12823776
Iteration 31, loss = 0.12804123
Iteration 32, los



Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('text_pipeline',
                                                 Pipeline(steps=[('vect',
                                                                  CountVectorizer(tokenizer=<function tokenizer at 0x0000021351B8D8B0>)),
                                                                 ('tfidf',
                                                                  TfidfTransformer())])),
                                                ('review_length',
                                                 GetReviewLength())])),
                ('clf', MLPClassifier(verbose=True))])

In [34]:
# For stacking ensemble classifier
estimators = [
    ('rf', RandomForestClassifier()), # Using the bagging technique
    ('ab', AdaBoostClassifier()) # Using the boosting technique
]

# Create ML pipeline
pipeline5 = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenizer)),
        ('tfidf', TfidfTransformer()),
        ('clf', StackingClassifier(estimators=estimators, 
                                   final_estimator=LogisticRegression(),
                                   verbose=2))
    ])

# Train pipeline
pipeline5.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   56.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished


Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenizer at 0x0000021351B8D8B0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 StackingClassifier(estimators=[('rf',
                                                 RandomForestClassifier()),
                                                ('ab', AdaBoostClassifier())],
                                    final_estimator=LogisticRegression(),
                                    verbose=2))])

In [35]:
# Create ML pipeline - Random Forest (Grid Search)
parameters = {
        'clf__n_estimators': [100, 200],
        'clf__min_samples_split': [2, 4]
    }

ml_pipeline = GridSearchCV(pipeline, param_grid=parameters, cv=4)

# Train the model using grid search
ml_pipeline.fit(X_train, y_train)

# Output best selection of parameters
print(ml_pipeline.best_params_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   22.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]:

building tree 1 of 200building tree 2 of 200
building tree 3 of 200
building tree 4 of 200

building tree 5 of 200
building tree 6 of 200building tree 7 of 200

building tree 8 of 200
building tree 9 of 200
building tree 10 of 200
building tree 11 of 200
building tree 12 of 200
building tree 13 of 200
building tree 14 of 200
building tree 15 of 200
building tree 16 of 200
building tree 17 of 200
building tree 18 of 200
building tree 19 of 200
building tree 20 of 200
building tree 21 of 200
building tree 22 of 200
building tree 23 of 200
building tree 24 of 200
building tree 25 of 200
building tree 26 of 200
building tree 27 of 200
building tree 28 of 200
building tree 29 of 200
building tree 30 of 200
building tree 31 of 200
building tree 32 of 200


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.4s


building tree 33 of 200building tree 34 of 200

building tree 35 of 200
building tree 36 of 200
building tree 37 of 200
building tree 38 of 200
building tree 39 of 200
building tree 40 of 200
building tree 41 of 200
building tree 42 of 200
building tree 43 of 200
building tree 44 of 200
building tree 45 of 200
building tree 46 of 200
building tree 47 of 200
building tree 48 of 200
building tree 49 of 200
building tree 50 of 200
building tree 51 of 200
building tree 52 of 200
building tree 53 of 200
building tree 54 of 200
building tree 55 of 200
building tree 56 of 200
building tree 57 of 200
building tree 58 of 200
building tree 59 of 200
building tree 60 of 200
building tree 61 of 200
building tree 62 of 200
building tree 63 of 200
building tree 64 of 200
building tree 65 of 200
building tree 66 of 200
building tree 67 of 200
building tree 68 of 200
building tree 69 of 200
building tree 70 of 200
building tree 71 of 200
building tree 72 of 200
building tree 73 of 200
building tree 74

[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   13.7s


building tree 156 of 200
building tree 157 of 200
building tree 158 of 200
building tree 159 of 200
building tree 160 of 200
building tree 161 of 200
building tree 162 of 200
building tree 163 of 200
building tree 164 of 200
building tree 165 of 200
building tree 166 of 200
building tree 167 of 200
building tree 168 of 200
building tree 169 of 200
building tree 170 of 200
building tree 171 of 200
building tree 172 of 200
building tree 173 of 200
building tree 174 of 200
building tree 175 of 200
building tree 176 of 200
building tree 177 of 200
building tree 178 of 200
building tree 179 of 200
building tree 180 of 200
building tree 181 of 200
building tree 182 of 200
building tree 183 of 200
building tree 184 of 200
building tree 185 of 200
building tree 186 of 200
building tree 187 of 200
building tree 188 of 200
building tree 189 of 200
building tree 190 of 200
building tree 191 of 200
building tree 192 of 200
building tree 193 of 200
building tree 194 of 200
building tree 195 of 200


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   19.0s finished


# Model Evaluation

In [36]:
# Predict on test data - Random Forest
y_pred = pipeline.predict(X_test)

# show metrics
print(f'Overall accuracy: {np.round(100 * (y_pred == y_test).mean().mean(), 2)} %')
print(classification_report(y_test.values, y_pred))

Overall accuracy: 87.48 %
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      3524
           1       0.91      0.90      0.90      6643

    accuracy                           0.87     10167
   macro avg       0.86      0.86      0.86     10167
weighted avg       0.88      0.87      0.87     10167



[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [37]:
# Predict on test data - AdaBoost
y_pred2 = pipeline2.predict(X_test)

# show metrics
print(f'Overall accuracy: {np.round(100 * (y_pred2 == y_test).mean().mean(), 2)} %')
print(classification_report(y_test.values, y_pred2))

Overall accuracy: 83.64 %
              precision    recall  f1-score   support

           0       0.80      0.70      0.75      3524
           1       0.85      0.91      0.88      6643

    accuracy                           0.84     10167
   macro avg       0.83      0.80      0.81     10167
weighted avg       0.83      0.84      0.83     10167



In [38]:
# Predict on test data - Gradient Boosting
y_pred3 = pipeline3.predict(X_test)

# show metrics
print(f'Overall accuracy: {np.round(100 * (y_pred3 == y_test).mean().mean(), 2)} %')
print(classification_report(y_test.values, y_pred3))

Overall accuracy: 83.97 %
              precision    recall  f1-score   support

           0       0.80      0.72      0.76      3524
           1       0.86      0.90      0.88      6643

    accuracy                           0.84     10167
   macro avg       0.83      0.81      0.82     10167
weighted avg       0.84      0.84      0.84     10167



In [39]:
# Predict on test data - Neural Network
y_pred4 = pipeline4.predict(X_test)

# show metrics
print(f'Overall accuracy: {np.round(100 * (y_pred4 == y_test).mean().mean(), 2)} %')
print(classification_report(y_test.values, y_pred4))

Overall accuracy: 86.41 %
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      3524
           1       0.89      0.90      0.90      6643

    accuracy                           0.86     10167
   macro avg       0.85      0.85      0.85     10167
weighted avg       0.86      0.86      0.86     10167



In [40]:
# Predict on test data - Ensemble Stacking
y_pred5 = pipeline5.predict(X_test)

# show metrics
print(f'Overall accuracy: {np.round(100 * (y_pred5 == y_test).mean().mean(), 2)} %')
print(classification_report(y_test.values, y_pred5))

Overall accuracy: 87.25 %
              precision    recall  f1-score   support

           0       0.82      0.81      0.81      3524
           1       0.90      0.91      0.90      6643

    accuracy                           0.87     10167
   macro avg       0.86      0.86      0.86     10167
weighted avg       0.87      0.87      0.87     10167



In [41]:
# Predict on test data - Random Forest (Grid Search)
y_pred_final = ml_pipeline.predict(X_test)

# show metrics
print(f'Overall accuracy: {np.round(100 * (y_pred_final == y_test).mean().mean(), 2)} %')
print(classification_report(y_test.values, y_pred_final))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s


Overall accuracy: 87.48 %
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      3524
           1       0.91      0.90      0.90      6643

    accuracy                           0.87     10167
   macro avg       0.86      0.86      0.86     10167
weighted avg       0.88      0.87      0.87     10167



[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.2s finished


In summary:
- AdaBoost & Gradient Boosting takes a short time, but poor accuracy
- Ensemble Stacking & Neural Network achieves good accuracy levels, but take a long time
- Random Forest not only achieves good accuracy levels on par with more complex model, it achieved in a short time as AdaBoost & Gradient Boosting. Therefore, it is chosen along with GridSearch for further model improvements.

# Output Model

In [42]:
pickle.dump(ml_pipeline, open(cd + '\\sentiment_classifier.pkl', 'wb'))

In [2]:
pickle.dump(pipeline, open(cd + '\\sentiment_classifier.pkl', 'wb'))

NameError: name 'pipeline' is not defined