For introduction and problem statement, please refer to notebook 1

## Content 

**Notebook 1: 1_cellphones_reviews_data_cleaning_and_eda**
- Data Import and Cleaning
- Exploratory Data Analysis
- Text Data Pre-processing

**Notebook 2: 2_cellphones_reviews_topic modelling**
- Data Import
- Topic Modelling with Gensim

**Notebook 3: 3_cellphones_reviews_topic_analysis_and_visualizations**
- Findings and Analysis of Topic Modelling

**Notebook 4: 4_features_extractions_and_sentiment_analysis**
- [Data Import](#Data-Import)
- [Sentiment Analysis with VADER](#Sentiment-Analysis-with-VADER)
- [entiment Analysis with Logistic Regression(Multi-Class Classification)](#Sentiment-Analysis-with-Logistic-Regression-Classifier)
- [Evaluation of Sentiment Analysis with BERT(Multi-Class Classification)](#Evaluation-of-Sentiment-Analysis-with-BERT)   
Please refer to notebook 5 for the fine-tuning process of pre-trained BERT model
- Comparison of the 3 Methods 
- Recommendation and Conclusion 
- Future Steps

**Notebook 5: fine_tuning_of_BERT_model**   
The reason why this notebook is separated from notebook 4 which contains the evaluation of BERT model is because the fine-tuning of BERT model requires GPU. Hence, the model was fine-tuned on Google Colaboratory and loaded back into notebook 4 for evaluation


## Data Import

In [20]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import spacy
from nltk import tokenize
from nltk.corpus import stopwords 
import re
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
import pickle

In [46]:
new_reviews  = pickle.load(open('../data/reviews_with_feature_sentiments.pkl', 'rb'))

## Comparing VADER vs Logistics Regression vs BERT sentiment analysis

In [22]:
new_reviews.columns

Index(['asin', 'name', 'rating', 'date', 'verified', 'review_title', 'body',
       'helpfulVotes', 'brand', 'item_title', 'url', 'image', 'reviewUrl',
       'totalReviews', 'price', 'originalPrice', 'reviews', 'word_count',
       'cleaned_reviews', 'multi_class_sentiment', 'tokens', 'summary',
       'sentences_with_keywords', 'features_and_sentiments', 'filter summary',
       'logreg_pred', 'data_type', 'bert_analysis', 'vader_analysis'],
      dtype='object')

In [47]:
type(new_reviews['logreg_pred'][0])

list

In [24]:
all_features=set()

for idx in new_reviews.index:
    for feature in new_reviews.loc[idx,'features_and_sentiments']:
        all_features.add(feature[1])

In [25]:
all_features #unique features

{'battery',
 'camera',
 'charger',
 'fingerprint',
 'ringtones',
 'screen',
 'simcard',
 'touchscreen'}

In [27]:
new_reviews[['vader_analysis','logreg_pred','bert_analysis']]

Unnamed: 0,vader_analysis,logreg_pred,bert_analysis
0,"[(5, battery), (5, screen)]","[(5, battery), (5, screen)]","[(5.0, battery), (5.0, screen)]"
1,"[(3, ringtones)]","[(5, ringtones)]","[(1.0, ringtones), (5.0, ringtones)]"
2,"[(5, battery), (5, charger)]","[(5, battery), (5, charger)]","[(5.0, battery), (5.0, charger)]"
3,"[(5, ringtones)]","[(5, ringtones)]","[(5.0, ringtones)]"
4,"[(3, ringtones)]","[(5, ringtones)]","[(3.0, ringtones)]"
...,...,...,...
22036,"[(3, camera), (5, screen), (3, battery), (5, c...","[(5, screen), (1, screen), (5, camera), (5, fi...","[(5.0, screen), (5.0, camera), (5.0, fingerpri..."
22037,"[(1, ringtones), (5, screen), (1, fingerprint)...","[(1, ringtones), (5, fingerprint), (5, screen)...","[(3.0, ringtones), (5.0, screen), (5.0, camera..."
22038,"[(3, screen), (5, camera), (3, camera)]","[(1, screen), (5, camera)]","[(3.0, screen), (5.0, camera)]"
22039,"[(5, screen), (5, camera)]","[(5, screen), (5, camera)]","[(5.0, screen), (5.0, camera)]"


In [67]:
dummy = {'battery':[]}
for feature in new_reviews['logreg_pred'][0]:
    if feature[1] == 'battery':
        dummy['battery'].append(feature[0])

dummy

{'battery': [5]}

In [62]:
def mean_ratings (feature_ratings):
    all_features = {}
    for feature in feature_ratings:
        if feature[1]  == 'camera':
            all_features['camera'] = all_features['camera'].append(feature[0])
        elif feature[1] == 'battery':
            all_features['battery'] = all_features['battery'].append(feature[0])
        elif feature[1] == 'fingerprint':
            all_features ['fingerprint'] = all_features ['fingerprint'].append(feature[0])  
        elif feature[1] == 'screen':
            all_features ['screen'] = all_features ['screen'].append(feature[0])   
        elif feature[1]  == 'charger':
            all_features ['charger'] = all_features ['charger'].append(feature[0])
        elif feature[1] == 'touchscreen':
            all_features ['touchscreen'] = all_features ['touchscreen'].append(feature[0])
        elif feature[1] == 'simcard':
            all_features ['simcard'] = all_features ['simcard'].append(feature[0])
        elif feature[1] == 'ringtones':
            all_features ['ringtones'] = all_features ['ringtones'].append(feature[0])        
    try:
        all_features_mean = {key:np.mean(value) for key,value in all_features.items()}
    except:
        all_features_mean = {key:np.nan for key,value in all_features.items()}
    
    return all_features_mean

In [63]:
mean_ratings(new_reviews['logreg_pred'][2])

KeyError: 'battery'

In [52]:
new_reviews['logreg_pred'] = new_reviews['logreg_pred'].map(mean_ratings)

KeyError: 'battery'

In [36]:
new_reviews['logreg_pred'][22036]

[(5, 'screen'),
 (1, 'screen'),
 (5, 'camera'),
 (5, 'fingerprint'),
 (5, 'battery')]

In [34]:
new_reviews['logreg_pred'][22036]

{'camera': 5.0,
 'battery': 5.0,
 'fingerprint': 5.0,
 'screen': 3.0,
 'charger': nan,
 'touchscreen': nan,
 'simcard': nan,
 'ringtones': nan}

In [None]:
all_products = {}
unique_asins = new_reviews['asin'].unique()
for product in unique_asins:
    all_products[product] = {'camera':[],'battery':[],'fingerprint':[],'screen':[],'charger':[],
                                 'touchscreen':[],'simcard':[],'ringtones':[]}
    for features_dict in new_reviews['logreg_pred']:

        for key,value in features_dict.items():
            if key == 'camera':
                all_products[product]['camera'].append(value)
            elif key =='battery':
                all_products[product]['battery'].append(value)
            elif key == 'fingerprint':
                all_products[product]['fingerprint'].append(value)  
            elif key == 'screen':
                all_products[product]['screen'].append(value)   
            elif key  == 'charger':
                all_products[product]['charger'].append(value)
            elif key == 'touchscreen':
                all_products[product]['touchscreen'].append(value)
            elif key == 'simcard':
                all_products[product]['simcard'].append(value)
            elif key == 'ringtones':
                all_products[product]['ringtones'].append(value)      

## Mean ratings by features of each unique product

In [None]:
new_reviews.reset_index(inplace=True,drop=True)

In [None]:
unique_asins = new_reviews['asin'].unique()

In [None]:
new_reviews.loc[1,'features_and_sentiments']

In [None]:
all_products = {}

#for cell in new_review['features_and_sentiments']: 
all_features=set()

for product in unique_asins:
    all_products[product] = {'camera':[],'battery':[],'fingerprint':[],'screen':[],'charger':[]}
    for idx in new_reviews.index:
        if new_reviews.loc[idx,'asin'] == product:
            for feature in new_reviews.loc[idx,'features_and_sentiments']:
                all_features.add(feature[1])
                if feature[1] =='battery':
                    all_products[product]['battery'].append(feature[0])
                elif feature[1]  == 'camera':
                    all_products[product]['camera'].append(feature[0])
                elif feature[1]  == 'charger':
                    all_products[product]['charger'].append(feature[0])
                elif feature[1] == 'screen':
                    all_products[product]['screen'].append(feature[0])
                elif feature[1] == 'fingerprint':
                    all_products[product]['fingerprint'].append(feature[0])
        

In [None]:
for key_1,value_1 in all_products.items():
    for key_2,value_2 in all_products[key_1].items():
        try:
            all_products[key_1][key_2] = round(np.mean(all_products[key_1][key_2]),1)
        except:
            all_products[key_1][key_2] = np.nan

In [None]:
mean_ratings = pd.DataFrame(all_products).T

In [None]:
mean_ratings.reset_index(inplace=True)
mean_ratings

In [None]:
mean_ratings.rename(columns={'index':'asin'},inplace=True)

In [None]:
updated_mean_ratings = pd.merge(mean_ratings,new_reviews[['asin','item_title']],on='asin',how='inner')
updated_mean_ratings.drop_duplicates(subset=['asin'],keep='first',inplace=True)

In [None]:
updated_mean_ratings.reset_index(inplace=True,drop=True)

In [None]:
updated_mean_ratings.tail(20)

In [None]:
## Mean ratings by features of each unique product

new_reviews.reset_index(inplace=True,drop=True)

unique_asins = new_reviews['asin'].unique()

new_reviews.loc[1,'features_and_sentiments']


all_products = {}

#for cell in new_review['features_and_sentiments']: 
all_features=set()

for product in unique_asins:
    all_products[product] = {'camera':[],'battery':[],'fingerprint':[],'screen':[],'charger':[]}
    for idx in new_reviews.index:
        if new_reviews.loc[idx,'asin'] == product:
            for feature in new_reviews.loc[idx,'features_and_sentiments']:
                all_features.add(feature[1])
                if feature[1] =='battery':
                    all_products[product]['battery'].append(feature[0])
                elif feature[1]  == 'camera':
                    all_products[product]['camera'].append(feature[0])
                elif feature[1]  == 'charger':
                    all_products[product]['charger'].append(feature[0])
                elif feature[1] == 'screen':
                    all_products[product]['screen'].append(feature[0])
                elif feature[1] == 'fingerprint':
                    all_products[product]['fingerprint'].append(feature[0])
        

for key_1,value_1 in all_products.items():
    for key_2,value_2 in all_products[key_1].items():
        try:
            all_products[key_1][key_2] = round(np.mean(all_products[key_1][key_2]),1)
        except:
            all_products[key_1][key_2] = np.nan

mean_ratings = pd.DataFrame(all_products).T

mean_ratings.reset_index(inplace=True)
mean_ratings

mean_ratings.rename(columns={'index':'asin'},inplace=True)

updated_mean_ratings = pd.merge(mean_ratings,new_reviews[['asin','item_title']],on='asin',how='inner')
updated_mean_ratings.drop_duplicates(subset=['asin'],keep='first',inplace=True)

updated_mean_ratings.reset_index(inplace=True,drop=True)

updated_mean_ratings.tail(20)