For introduction and problem statement, please refer to notebook 1

## Content 

**Notebook 1: 1_cellphones_reviews_data_cleaning_and_eda**
- Data Import and Cleaning
- Exploratory Data Analysis
- Text Data Pre-processing

**Notebook 2: 2_cellphones_reviews_topic modelling**
- Data Import
- Topic Modelling with Gensim

**Notebook 3: 3_cellphones_reviews_topic_analysis_and_visualizations**
- Findings and Analysis of Topic Modelling

**Notebook 4: 4_features_extractions_and_sentiment_analysis**
- [Data Import](#Data-Import)
- [Sentiment Analysis with VADER](#Sentiment-Analysis-with-VADER)
- [Sentiment Analysis with Logistic Regression (Multi-Class Classification)](#Sentiment-Analysis-with-Logistic-Regression-Classifier)
- [Evaluation of Sentiment Analysis with BERT(Multi-Class Classification)](#Evaluation-of-Sentiment-Analysis-with-BERT)   
Please refer to notebook 5 for the fine-tuning process of pre-trained BERT model
- Comparison of the 3 Methods 
- Recommendation and Conclusion 
- Future Steps

**Notebook 5: fine_tuning_of_BERT_model**   
This notebook is kept separate from notebook 4 (which contains the evaluation of the BERT model) because fine-tuning the BERT model requires a GPU. The model was therefore fine-tuned on Google Colaboratory and loaded back into notebook 4 for evaluation.


## Data Import

In [37]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import spacy
from nltk import tokenize
from nltk.corpus import stopwords 
import re
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer

In [None]:
new_reviews = pd.read_csv('../data/cleaned_combined_data.csv',na_filter=False)

## Sentiment Analysis with VADER

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [None]:
# Custom lexicon entries: the word 'new' is given a score of 3.0.
new_words = {
    'new': 3.0
}

# Merge the custom entries into the analyser's lexicon so that later
# polarity_scores() calls use them.
analyser.lexicon.update(new_words)

In [None]:
stop_words = stopwords.words('english')

In [None]:
len(stop_words)

In [None]:
# Negation words that must NOT be treated as stop words, so they stay in the
# cleaned sentences.
negation_words = ['ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', 
"hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 
'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 
'wouldn', "wouldn't","not","no",'don',"don't"]

# Filter instead of calling list.remove(): remove() raises ValueError when a
# word is absent, which happens as soon as this cell is executed a second time.
negation_lookup = set(negation_words)
stop_words = [word for word in stop_words if word not in negation_lookup]

len(stop_words)

In [None]:
def sentences_with_keywords(reviews):
    """Return the unique lower-cased sentences of a review that mention a feature.

    The review text is split into sentences with ``tokenize.sent_tokenize``;
    a sentence is kept when any feature keyword occurs in it as a substring.

    Parameters
    ----------
    reviews : str
        Raw text of a single review.

    Returns
    -------
    list of str
        De-duplicated matching sentences (built from a set, so the order is
        not meaningful).
    """
    feature_terms = ('camera','screen','battery','simcard','touchscreen','fingerprint','fingerprints',
                     'ringtones','charger')

    lowered_sentences = (raw.lower() for raw in tokenize.sent_tokenize(reviews))
    matching = {
        candidate
        for candidate in lowered_sentences
        if any(term in candidate for term in feature_terms)
    }
    return list(matching)

In [None]:
def summarise_reviews(reviews):
    """Return the cleaned, de-duplicated feature sentences of one review.

    The review is split into sentences with ``tokenize.sent_tokenize``. Every
    sentence that contains a feature keyword (substring match) is lower-cased,
    stripped of HTML, reduced to alphabetic tokens, filtered against the
    module-level ``stop_words`` and lemmatized.

    Parameters
    ----------
    reviews : str
        Raw text of a single review.

    Returns
    -------
    list of str
        Unique cleaned sentences (built from a set, so the order is not
        meaningful). Empty when no sentence mentions a keyword.
    """
    list_of_keywords = ('camera','screen','battery','simcard','touchscreen','fingerprint','fingerprints',
                        'ringtones','charger')

    # Built once per call instead of once per matching sentence/keyword.
    stops = set(stop_words)
    lemmatizer = WordNetLemmatizer()
    # `A-Za-z` rather than `A-z`: the latter also matches the punctuation that
    # sits between 'Z' and 'a' in ASCII ([ \ ] ^ _ `). \u2019 is the
    # typographic apostrophe, which had been stored as mis-decoded bytes.
    token_pattern = re.compile(r"[A-Za-z\u2019]+")

    summary = set()
    for sentence in tokenize.sent_tokenize(reviews):
        sentence = sentence.lower()

        # One cleaning pass per sentence is enough, however many keywords match.
        if not any(word in sentence for word in list_of_keywords):
            continue

        # Remove HTML. The parser is named explicitly so the result does not
        # depend on which optional parsers happen to be installed.
        post_text = BeautifulSoup(sentence, "html.parser").get_text()

        # Keep letters only, lower-cased (HTML entities may decode to capitals).
        words = [token.lower() for token in token_pattern.findall(post_text)]

        # Remove stopwords, then lemmatize what is left.
        meaningful_words = [lemmatizer.lemmatize(w) for w in words if w not in stops]

        summary.add(" ".join(meaningful_words))

    return list(summary)

In [None]:
def features_and_sentiments(summarised_reviews):
    """Pair every feature mentioned in the cleaned sentences with a 1-5 score.

    Each sentence is scored with the module-level VADER ``analyser`` and its
    compound score is bucketed as follows:

        compound >= 0.075            -> 5
        0.05 <= compound < 0.075     -> 4
        -0.05 < compound < 0.05      -> 3
        -0.075 < compound <= -0.05   -> 2
        compound <= -0.075           -> 1

    Keywords are matched as substrings, so a sentence containing
    'touchscreen' also yields an entry for 'screen'.

    Parameters
    ----------
    summarised_reviews : iterable of str
        Cleaned sentences of one review.

    Returns
    -------
    list of (int, str)
        Unique ``(sentiment_score, keyword)`` tuples (built from a set, so the
        order is not meaningful).
    """
    list_of_keywords = ('camera','screen','battery','simcard','touchscreen','fingerprint','fingerprints',
                        'ringtones','charger')
    summary = set()

    for cleaned_sentence in summarised_reviews:
        matched = [word for word in list_of_keywords if word in cleaned_sentence]
        if not matched:
            continue

        # The score depends only on the sentence, so compute it once per
        # sentence rather than once per matching keyword.
        compound = analyser.polarity_scores(cleaned_sentence)['compound']

        if compound >= 0.075:
            sentiment_score = 5
        elif compound >= 0.05:
            sentiment_score = 4
        elif compound <= -0.075:
            sentiment_score = 1
        elif compound <= -0.05:
            sentiment_score = 2
        else:
            sentiment_score = 3

        summary.update((sentiment_score, word) for word in matched)

    return list(summary)

In [None]:
new_reviews['summary'] = new_reviews['reviews'].apply(summarise_reviews)

In [None]:
new_reviews['sentences_with_keywords'] = new_reviews['reviews'].apply(sentences_with_keywords)

In [None]:
new_reviews['features_and_sentiments'] = new_reviews['summary'].apply(features_and_sentiments)

In [None]:
pd.set_option('display.max_colwidth',None)
new_reviews[['reviews','summary']].sample(2)

In [None]:
new_reviews["filter summary"] = new_reviews['summary'].apply(lambda x: x != [])

In [None]:
new_reviews = new_reviews[new_reviews["filter summary"] == True]

In [None]:
new_reviews.shape

In [None]:
new_reviews.reset_index(inplace=True,drop=True)

In [None]:
new_reviews.head(2)

In [None]:
#new_reviews.to_csv('../data/cleaned_combined_data_with_keywords.csv',index=False)

In [None]:
#iphone_xs = new_reviews[new_reviews['asin'] == 'B07RT1X4FJ']

In [None]:
#pd.set_option('display.max_colwidth',None)
#iphone_xs['reviews']

In [None]:
#iphone_xs['summary'][63325]

In [None]:
#pd.set_option('display.max_colwidth',None)
#iphone_xs['features_and_sentiments']

In [None]:
#iphone_xs.loc[63202,'features_and_sentiments']

## Sentiment Analysis with Logistic Regression Classifier

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
new_reviews['multi_class_sentiment'].unique()

In [None]:
# Create the feature and target variable
X = new_reviews['cleaned_reviews']
y = new_reviews['multi_class_sentiment']

In [None]:
# Create train_test_split.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.25,
                                                    random_state = 42,
                                                    stratify= y)

In [None]:
y_train.value_counts(normalize=True)

In [None]:
# Baseline = accuracy obtained by always predicting the most frequent class.
baseline_model = y_train.value_counts(normalize=True)
# Take the largest class share instead of hard-coding label 2, so the baseline
# stays correct if the class balance or the label encoding changes.
baseline_accuracy = round(baseline_model.max(),3)

print(f"Baseline accuracy: {baseline_accuracy}")

In [None]:
#Instantiate the pipeline
lr_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr',LogisticRegression(random_state=42,solver='liblinear', max_iter=10000))
])

#create hyperparameters for gridsearch
lr_cvec_params = {
    'cvec__max_features': [3000,4000,5000],
    'cvec__min_df':[2,3],
    'cvec__max_df':[0.9,0.95],
    'cvec__ngram_range': [(1,1), (1,2)],
    'lr__C':[0.01,0.1,1],
    'lr__penalty': ['l1', 'l2']
}

# Instantiate GridSearchCV.
lr_cvec_gs = GridSearchCV(lr_cvec_pipe, # what object are we optimizing?
                  param_grid=lr_cvec_params , # what parameters values are we searching?
                  cv=5,
                 n_jobs=-1,verbose=1) 

#fit the model
lr_cvec_gs.fit(X_train,y_train)

In [None]:
training_accuracy_score = round(lr_cvec_gs.score(X_train,y_train),3)
testing_accuracy_score = round(lr_cvec_gs.score(X_test,y_test),3)

print(f"Logistic Regression CVEC Train Accuracy Score: {training_accuracy_score}")
print(f"Logistic Regression CVEC Test Accuracy Score: {testing_accuracy_score}")

In [None]:
import pickle
filename= '../data/logreg_3classes.pkl'
# Persist the fitted grid search. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open(filename,'wb') as model_file:
    pickle.dump(lr_cvec_gs,model_file)

### Predictions on Feature Level with Logistic Regression

In [None]:
def logreg_classification(reviews):
    """Pair every feature mentioned in the sentences with its predicted class.

    All sentences are classified in one ``logreg_model.predict`` call; each
    keyword found in a sentence (substring match) is then paired with that
    sentence's prediction.

    NOTE(review): ``logreg_model`` is read from module scope but no assignment
    to it is visible in this notebook - presumably the fitted/pickled grid
    search; confirm it is defined before this function is called.

    Parameters
    ----------
    reviews : sequence of str
        Cleaned sentences of one review.

    Returns
    -------
    set of (prediction, str)
        Unique ``(predicted_label, keyword)`` tuples.
    """
    feature_terms = ('camera','screen','battery','simcard','touchscreen','fingerprint','fingerprints',
                     'ringtones','charger')
    labels = logreg_model.predict(reviews)

    return {
        (labels[position], term)
        for position, text in enumerate(reviews)
        for term in feature_terms
        if term in text
    }

In [None]:
new_reviews['logreg_pred'] = new_reviews['summary'].apply(logreg_classification)

## Evaluation of Sentiment Analysis with BERT

In [None]:
from transformers import BertForSequenceClassification
import torch
from tqdm.notebook import tqdm

In [None]:
import random

# Seed Python's, NumPy's and PyTorch's (CPU and all CUDA devices) random
# number generators with the same value.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
len(new_reviews['multi_class_sentiment'].unique())

In [None]:
X_train, X_val, y_train, y_val = train_test_split(new_reviews.index.values,
                                                  new_reviews.multi_class_sentiment.values,
                                                  test_size = 0.25,
                                                  random_state= 42,
                                                  stratify=new_reviews.multi_class_sentiment.values)

In [None]:
y_train.shape

In [None]:
y_val.shape

In [None]:
new_reviews['data_type'] = ['not_set']*new_reviews.shape[0]

In [None]:
new_reviews.loc[X_train, 'data_type'] = 'train'
new_reviews.loc[X_val, 'data_type'] = 'val'

In [None]:
new_reviews.head(2)

In [None]:
new_reviews.groupby(['rating', 'data_type']).count()

## Loading Tokenizer

In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [None]:
# Prepare the data in the format BERT expects.
# No fine-tuning happens in this notebook, so only the validation rows are
# encoded here for evaluation. The full fine-tuning process, including the
# tokenization and data loader of the training set, is in notebook 5.
#
# NOTE(review): `pad_to_max_length` is deprecated in recent transformers
# releases in favour of `padding='max_length'` together with `truncation=True`
# - confirm against the installed version before changing.

encoded_data_val = tokenizer.batch_encode_plus(
    new_reviews[new_reviews.data_type=='val'].reviews.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

# Split the encoding into the tensors the model consumes, and build the label
# tensor from the same validation rows so the three stay aligned.
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val= torch.tensor(new_reviews[new_reviews.data_type=='val'].multi_class_sentiment.values)

In [None]:
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
len(dataset_val)

## Setting up BERT Pretrained Model

In [None]:
from transformers import BertForSequenceClassification

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=3,
                                                      output_attentions=False,
                                                      output_hidden_states=False)


## Creating Data Loaders

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [None]:
# Number of examples per batch; also read by bert_sentiments() further down.
batch_size = 32


# Sequential sampling keeps the validation examples in dataset order.
dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

## Defining our Performance Metrics

In [None]:
def accuracy_per_class(preds, labels):
    """Print the hit count of every class, then the overall accuracy.

    Parameters
    ----------
    preds : numpy.ndarray
        Per-class scores with the classes along axis 1; the predicted class
        of a row is the index of its largest value.
    labels : numpy.ndarray
        True class ids (flattened before use).

    Returns
    -------
    None
        The results are only printed.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    hits_overall = 0
    seen_overall = 0
    for label in np.unique(actual):
        # Rows whose true class is `label`.
        belongs = actual == label
        hits = int(np.count_nonzero(predicted[belongs] == label))
        seen = int(np.count_nonzero(belongs))

        print(f'Class: {label}')
        print(f'Accuracy: {hits}/{seen}\n')

        hits_overall += hits
        seen_overall += seen

    print(f'Total Accuracy:{hits_overall/seen_overall}' )

In [None]:
def evaluate(dataloader_val):
    """Run the module-level ``model`` over a labelled data loader.

    Every batch is expected to hold (input_ids, attention_mask, labels) in
    that order; the tensors are moved to the module-level ``device`` before
    the forward pass, which runs without gradient tracking.

    Parameters
    ----------
    dataloader_val : iterable of tensor tuples
        Batches to evaluate; must support ``len()``.

    Returns
    -------
    tuple
        ``(average_loss, logits, true_labels)`` where the loss is averaged
        over the batches and the two arrays are NumPy concatenations of the
        per-batch results.
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in tqdm(dataloader_val):
        on_device = tuple(tensor.to(device) for tensor in batch)

        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1],
                            labels=on_device[2])

        # With labels supplied, the loss comes first and the logits second.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(on_device[2].cpu().numpy())

    mean_loss = running_loss/len(dataloader_val)
    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    return mean_loss, all_logits, all_labels

In [None]:
# `device` is used here and inside evaluate()/bert_sentiments() but is not
# assigned anywhere else in this notebook, so define it before first use:
# the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Same architecture as the fine-tuned checkpoint (3 output classes); the
# fine-tuned weights are loaded into it in the next cell.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=3,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

In [None]:
model.load_state_dict(torch.load('../data/finetuned_BERT_epoch_2_3classes.model', map_location=torch.device('cpu')))

In [None]:
_, predictions, true_vals = evaluate(dataloader_validation)

In [None]:
accuracy_per_class(predictions, true_vals)

## Predictions on Feature Level with BERT

### Loading Tokenizer and Encoding Data by Sentences

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [None]:
def bert_sentiments(summarised_reviews):
    """Pair every feature mentioned in the sentences with a BERT-based rating.

    The sentences are tokenized, run through the module-level ``model`` in
    batches of ``batch_size`` and the predicted class of each sentence is
    mapped to a rating: class 2 -> 5, class 1 -> 3, any other class -> 1.
    Every keyword found in a sentence (substring match) is then paired with
    that sentence's rating.

    Relies on the module-level ``tokenizer``, ``model``, ``device`` and
    ``batch_size``.

    Parameters
    ----------
    summarised_reviews : sequence of str
        Sentences of one review.

    Returns
    -------
    set of (float, str)
        Unique ``(rating, keyword)`` tuples; empty when there are no
        sentences or no keyword is found.
    """
    list_of_keywords = ('camera','screen','battery','simcard','touchscreen','fingerprint','fingerprints',
                        'ringtones','charger')
    class_to_rating = {2: 5, 1: 3}   # every other class maps to 1

    summary = set()
    sentences = list(summarised_reviews)
    if not sentences:
        # Nothing to encode or predict.
        return summary

    # NOTE(review): `pad_to_max_length` is deprecated in recent transformers
    # releases in favour of `padding='max_length'` / `truncation=True` -
    # confirm against the installed version before changing.
    encoded_data_features = tokenizer.batch_encode_plus(
        sentences,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=256,
        return_tensors='pt'
    )

    dataset_features = TensorDataset(encoded_data_features['input_ids'],
                                     encoded_data_features['attention_mask'])
    dataloader_features = DataLoader(dataset_features,
                                     sampler=SequentialSampler(dataset_features),
                                     batch_size=batch_size)

    model.eval()

    # The forward pass must run inside the batch loop: running it after the
    # loop would score only the final batch and leave every earlier sentence
    # without a prediction.
    predicted_classes = []
    for batch in dataloader_features:
        input_ids, attention_mask = (b.to(device) for b in batch)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Without labels the logits are the first output.
        predicted_classes.extend(torch.argmax(outputs[0], dim=1).tolist())

    # The sequential sampler keeps predictions aligned with the sentences.
    for cleaned_sentence, predicted_class in zip(sentences, predicted_classes):
        rating = float(class_to_rating.get(predicted_class, 1))
        for word in list_of_keywords:
            if word in cleaned_sentence:
                summary.add((rating, word))

    return summary



bert_sentiments(new_reviews.sentences_with_keywords.values[5]) 

In [None]:
tqdm.pandas()

In [None]:
new_reviews['bert_analysis'] = new_reviews['sentences_with_keywords'].progress_apply(bert_sentiments)

## Mean ratings by features of each unique product

In [None]:
new_reviews.reset_index(inplace=True,drop=True)

In [None]:
unique_asins = new_reviews['asin'].unique()

In [None]:
new_reviews.loc[1,'features_and_sentiments']

In [None]:
# Collect, for every product (asin), the sentiment scores recorded for each
# tracked feature.
tracked_features = ('camera','battery','fingerprint','screen','charger')

all_products = {
    product: {feature: [] for feature in tracked_features}
    for product in unique_asins
}

# Every feature name seen in the data, tracked or not.
all_features=set()

# One pass over the rows instead of re-scanning the whole frame for every
# product; rows are still visited in index order, so each score list keeps
# the same order as before.
for product, pairs in zip(new_reviews['asin'], new_reviews['features_and_sentiments']):
    for score, feature in pairs:
        all_features.add(feature)
        if feature in tracked_features:
            all_products[product][feature].append(score)
        

In [None]:
# Replace every list of scores with its mean, rounded to one decimal place.
for feature_scores in all_products.values():
    for feature, scores in feature_scores.items():
        # np.mean([]) does not raise - it emits a RuntimeWarning and returns
        # nan - so test for "no scores" explicitly instead of relying on a
        # bare except. Only values of existing keys are reassigned, which is
        # safe while iterating.
        feature_scores[feature] = round(np.mean(scores),1) if np.size(scores) else np.nan

In [None]:
mean_ratings = pd.DataFrame(all_products).T

In [None]:
mean_ratings.reset_index(inplace=True)
mean_ratings

In [None]:
mean_ratings.rename(columns={'index':'asin'},inplace=True)

In [None]:
# Attach one title per product. De-duplicating the titles BEFORE the merge
# gives the same rows as merging against every review and dropping duplicates
# afterwards, without building the large intermediate frame.
product_titles = new_reviews[['asin','item_title']].drop_duplicates(subset=['asin'],keep='first')
updated_mean_ratings = pd.merge(mean_ratings,product_titles,on='asin',how='inner')

In [None]:
updated_mean_ratings.reset_index(inplace=True,drop=True)

In [None]:
updated_mean_ratings.tail(20)

In [None]:
## Mean ratings by features of each unique product
# NOTE(review): this cell repeats the logic of the cells directly above it in
# a single cell.

new_reviews.reset_index(inplace=True,drop=True)

unique_asins = new_reviews['asin'].unique()

# Features whose scores are aggregated per product.
tracked_features = ('camera','battery','fingerprint','screen','charger')

all_products = {
    product: {feature: [] for feature in tracked_features}
    for product in unique_asins
}

# Every feature name seen in the data, tracked or not.
all_features=set()

# One pass over the rows instead of re-scanning the whole frame for every
# product; rows are visited in index order, so each score list keeps the
# same order as a per-product scan would produce.
for product, pairs in zip(new_reviews['asin'], new_reviews['features_and_sentiments']):
    for score, feature in pairs:
        all_features.add(feature)
        if feature in tracked_features:
            all_products[product][feature].append(score)

# Replace every list of scores with its mean (one decimal place). np.mean([])
# only warns and returns nan, so "no scores" is tested explicitly rather than
# through a bare except.
for feature_scores in all_products.values():
    for feature, scores in feature_scores.items():
        feature_scores[feature] = round(np.mean(scores),1) if np.size(scores) else np.nan

# One row per product, one column per feature, with the asin as a column.
mean_ratings = pd.DataFrame(all_products).T
mean_ratings.reset_index(inplace=True)
mean_ratings.rename(columns={'index':'asin'},inplace=True)

# Attach one title per product; titles are de-duplicated before the merge so
# no large intermediate frame is built.
product_titles = new_reviews[['asin','item_title']].drop_duplicates(subset=['asin'],keep='first')
updated_mean_ratings = pd.merge(mean_ratings,product_titles,on='asin',how='inner')

updated_mean_ratings.reset_index(inplace=True,drop=True)

updated_mean_ratings.tail(20)