For the introduction and problem statement, please refer to notebook 1.

## Content 

**Notebook 1: 1_cellphones_reviews_data_cleaning_and_eda**
- Data Import and Cleaning
- Exploratory Data Analysis
- Text Data Pre-processing

**Notebook 2: 2_cellphones_reviews_topic modelling**
- Data Import
- Topic Modelling with Gensim

**Notebook 3: 3_cellphones_reviews_topic_analysis_and_visualizations**
- Findings and Analysis of Topic Modelling

**Notebook 4: 4_features_extractions_and_sentiment_analysis**
- [Data Import](#Data-Import)
- [Sentiment Analysis with VADER](#Sentiment-Analysis-with-VADER)
- [Sentiment Analysis with Logistic Regression (Multi-Class Classification)](#Sentiment-Analysis-with-Logistic-Regression-Classifier)
- [Evaluation of Sentiment Analysis with BERT(Multi-Class Classification)](#Evaluation-of-Sentiment-Analysis-with-BERT)   
Please refer to notebook 5 for the fine-tuning process of pre-trained BERT model
- Comparison of the 3 Methods 
- Recommendation and Conclusion 
- Future Steps

**Notebook 5: fine_tuning_of_BERT_model**   
This notebook is kept separate from notebook 4 (which contains the evaluation of the BERT model) because fine-tuning the BERT model requires a GPU. The model was therefore fine-tuned on Google Colaboratory and then loaded back into notebook 4 for evaluation.


## Data Import

In [11]:

# Load the pickled reviews object; the context manager closes the file
# handle as soon as the object has been read.
with open('../data/reviews_with_feature_sentiments.pkl', 'rb') as reviews_file:
    new_reviews = pickle.load(reviews_file)

In [None]:
#new_reviews.shape

In [None]:
#import random

#seed_val = 42
#random.seed(seed_val)
#np.random.seed(seed_val)
#torch.manual_seed(seed_val)
#torch.cuda.manual_seed_all(seed_val)

In [None]:
#from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [1]:
import pandas as pd 
import numpy as np
from nltk import tokenize
from transformers import BertForSequenceClassification
import torch
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, SequentialSampler
#from tqdm.notebook import tqdm
import pickle

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the base BERT classifier with a 3-label head; attention weights and
# hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# Replace the weights with the saved fine-tuned state dict. The checkpoint is
# deserialised onto the CPU before being copied into the model.
model.load_state_dict(
    torch.load(
        '../data/finetuned_BERT_epoch_2_3classes.model',
        map_location=torch.device('cpu'),
    )
)

## Predictions on Feature Level with BERT

### Loading Tokenizer and Encoding Data by Sentences

# Number of sentences sent through the model per forward pass.
batch_size = 32

# Tokenizer for the 'bert-base-uncased' vocabulary; input text is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

def sentences_with_keywords (reviews):
    """Return the lower-cased sentences of `reviews` that mention a tracked feature.

    The text is split into sentences with NLTK's `sent_tokenize`. A sentence is
    kept when any keyword occurs in it as a substring, so e.g. 'touchscreen'
    also matches the keyword 'screen'. Sentences are de-duplicated through a
    set, which means the order of the returned list is not the order in which
    the sentences appear in the text.
    """
    feature_keywords = ('camera', 'screen', 'battery', 'simcard', 'touchscreen',
                        'fingerprint', 'fingerprints', 'ringtones', 'charger')

    lowered_sentences = (sentence.lower() for sentence in tokenize.sent_tokenize(reviews))

    matching_sentences = {
        sentence
        for sentence in lowered_sentences
        if any(keyword in sentence for keyword in feature_keywords)
    }

    return list(matching_sentences)

def bert_sentiments(reviews):
    """Predict a 1/3/5 rating for every tracked feature mentioned in a review.

    The review text is reduced to the sentences that mention a tracked keyword
    (via `sentences_with_keywords`), each sentence is classified with the
    module-level `model`, and the predicted class index is mapped to a rating:
    2 -> 5, 1 -> 3, anything else -> 1.

    Parameters
    ----------
    reviews : str
        Raw text of a single review.

    Returns
    -------
    set
        A set of `(rating, keyword)` tuples, with `rating` a float, for every
        keyword that occurs (as a substring) in a classified sentence. The set
        is empty when no sentence mentions a tracked keyword.
    """
    list_of_keywords = ['camera','screen','battery','simcard','touchscreen','fingerprint','fingerprints',
                        'ringtones','charger']

    summary = set()

    summarised_reviews = sentences_with_keywords(reviews)

    # Nothing to classify: return early instead of sending an empty batch to
    # the tokenizer and the model.
    if not summarised_reviews:
        return summary

    # NOTE(review): `pad_to_max_length` is reported as deprecated by newer
    # transformers releases in favour of `padding='max_length'` -- confirm
    # against the installed version before switching.
    encoded_data_features = tokenizer.batch_encode_plus(
        summarised_reviews,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=256,
        return_tensors='pt',
    )

    dataset_features = TensorDataset(encoded_data_features['input_ids'],
                                     encoded_data_features['attention_mask'])

    # Sequential sampling keeps predictions aligned with `summarised_reviews`.
    dataloader_features = DataLoader(dataset_features,
                                     sampler=SequentialSampler(dataset_features),
                                     batch_size=batch_size)

    model.eval()

    # Score every batch (not only the last one) and collect the predicted
    # class index of each sentence in order.
    predicted_labels = []
    with torch.no_grad():
        for batch in dataloader_features:
            input_ids, attention_mask = (b.to(device) for b in batch)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predicted_labels.extend(torch.argmax(outputs[0], dim=1).tolist())

    # Class index -> rating; any index other than 2 or 1 maps to 1.
    label_to_rating = {2: 5, 1: 3}
    predicted_ratings = [label_to_rating.get(label, 1) for label in predicted_labels]

    for cleaned_sentence, rating in zip(summarised_reviews, predicted_ratings):
        for word in list_of_keywords:
            if word in cleaned_sentence:
                summary.add((float(rating), word))

    return summary



#bert_sentiments(new_reviews.sentences_with_keywords.values[5]) 

In [32]:
test_input = "love the camera. but the battery is not good"

In [41]:
output = dict(bert_sentiments (test_input))
output = {v: int(k) for k, v in output.items()}

In [42]:
output

{'camera': 5, 'battery': 3}

In [24]:
for i,ele in enumerate(output):
    print(ele)

(1.0, 'camera')
(5.0, 'battery')


In [14]:
new_reviews.reviews.values[5]

"works great, but don't dropt it I've had this phone for over a year and I really like it. Having never been partial to flip phones, I can appreciate this Nokia's features. The color screen is very nice and all of the features are easy to use. The keylock is a simple two-button sequence though I do wish it had a timer for the keylock like my old phone. It has a lot of useful features, like a calculator, organizer, stopwatch and alarm, but the three games it has are too lame to play. The customizable features are nice, but lacking in variety. This isn't the ideal phone for a guy. You're few options are pretty girly. The ringtones also lack something to be desired, but there are a few decent ones. The battery life is pretty good, nothing special, but definitely not bad. It gets me through the average day without problems and sometimes lasts for several days. My parents, sisters, and husband all got fancy Samsung flip phones with cameras and all the toys, and I must say mine works better.

In [31]:
bert_sentiments(new_reviews.reviews.values[5]) 

{(1.0, 'battery'),
 (1.0, 'camera'),
 (5.0, 'battery'),
 (5.0, 'camera'),
 (5.0, 'ringtones'),
 (5.0, 'screen')}

In [None]:
bert_sentiments(new_reviews.sentences_with_keywords.values[5]) 

In [None]:
new_reviews.head(5)

In [None]:
new_reviews['features_and_sentiments'][5]

In [None]:
# The tqdm import in the notebook's import cell is commented out, so bring
# the name into scope here before registering the pandas integration.
from tqdm.notebook import tqdm

tqdm.pandas()

In [None]:
new_reviews['bert_analysis'] = new_reviews['sentences_with_keywords'].progress_apply(bert_sentiments)

In [None]:
new_reviews['bert_analysis'] = new_reviews['bert_analysis'].map(lambda x:list(x))

In [None]:
new_reviews.to_csv("../data/reviews_with_feature_sentiments.csv",index=False)

In [None]:
new_reviews.to_pickle("../data/reviews_with_feature_sentiments.pkl")

## Mean ratings by features of each unique product

In [None]:
new_reviews.reset_index(inplace=True,drop=True)

In [None]:
unique_asins = new_reviews['asin'].unique()

In [None]:
new_reviews.loc[1,'features_and_sentiments']

In [None]:
# Features whose ratings are aggregated per product.
tracked_features = ('camera', 'battery', 'fingerprint', 'screen', 'charger')

# One entry per product, created up front so that products without any
# tracked feature still appear (with empty rating lists).
all_products = {product: {feature: [] for feature in tracked_features}
                for product in unique_asins}

# Every feature name seen in the data, whether it is tracked or not.
all_features = set()

# Single pass over the reviews instead of re-scanning the whole frame once
# per product.
for asin, features in zip(new_reviews['asin'], new_reviews['features_and_sentiments']):
    ratings_by_feature = all_products[asin]
    for feature in features:
        # Each entry is indexed as (rating, feature name).
        all_features.add(feature[1])
        if feature[1] in ratings_by_feature:
            ratings_by_feature[feature[1]].append(feature[0])
        

In [None]:
all_features

In [None]:
# Collapse each feature's rating list into its mean, rounded to one decimal.
for feature_ratings in all_products.values():
    for feature, ratings in feature_ratings.items():
        if np.size(ratings) == 0:
            # No ratings for this feature: store NaN directly rather than
            # relying on np.mean([]), which also yields NaN but emits a
            # RuntimeWarning.
            feature_ratings[feature] = np.nan
            continue
        try:
            feature_ratings[feature] = round(np.mean(ratings), 1)
        except (TypeError, ValueError):
            # Ratings that cannot be averaged are reported as missing.
            feature_ratings[feature] = np.nan

In [None]:
mean_ratings = pd.DataFrame(all_products).T

In [None]:
mean_ratings.reset_index(inplace=True)
mean_ratings

In [None]:
mean_ratings.rename(columns={'index':'asin'},inplace=True)

In [None]:
# Keep the first title seen for each product before merging, so the merge
# yields one row per product instead of one row per review.
product_titles = new_reviews[['asin','item_title']].drop_duplicates(subset=['asin'],keep='first')
updated_mean_ratings = pd.merge(mean_ratings,product_titles,on='asin',how='inner')

In [None]:
updated_mean_ratings.reset_index(inplace=True,drop=True)

In [None]:
updated_mean_ratings.tail(20)

In [None]:
## Mean ratings by features of each unique product

new_reviews.reset_index(inplace=True,drop=True)

unique_asins = new_reviews['asin'].unique()

# Features whose ratings are aggregated per product.
tracked_features = ('camera', 'battery', 'fingerprint', 'screen', 'charger')

# One entry per product, created up front so that products without any
# tracked feature still appear (with empty rating lists).
all_products = {product: {feature: [] for feature in tracked_features}
                for product in unique_asins}

# Every feature name seen in the data, whether it is tracked or not.
all_features = set()

# Single pass over the reviews instead of re-scanning the whole frame once
# per product.
for asin, features in zip(new_reviews['asin'], new_reviews['features_and_sentiments']):
    ratings_by_feature = all_products[asin]
    for feature in features:
        # Each entry is indexed as (rating, feature name).
        all_features.add(feature[1])
        if feature[1] in ratings_by_feature:
            ratings_by_feature[feature[1]].append(feature[0])

# Collapse each feature's rating list into its mean, rounded to one decimal.
for feature_ratings in all_products.values():
    for feature, ratings in feature_ratings.items():
        if np.size(ratings) == 0:
            # No ratings for this feature: store NaN directly rather than
            # relying on np.mean([]), which also yields NaN but emits a
            # RuntimeWarning.
            feature_ratings[feature] = np.nan
            continue
        try:
            feature_ratings[feature] = round(np.mean(ratings), 1)
        except (TypeError, ValueError):
            # Ratings that cannot be averaged are reported as missing.
            feature_ratings[feature] = np.nan

# Products become rows, features become columns, and the product id is moved
# out of the index into an 'asin' column.
mean_ratings = pd.DataFrame(all_products).T
mean_ratings.reset_index(inplace=True)
mean_ratings.rename(columns={'index':'asin'},inplace=True)

# Keep the first title seen for each product before merging, so the merge
# yields one row per product instead of one row per review.
product_titles = new_reviews[['asin','item_title']].drop_duplicates(subset=['asin'],keep='first')
updated_mean_ratings = pd.merge(mean_ratings,product_titles,on='asin',how='inner')

updated_mean_ratings.reset_index(inplace=True,drop=True)

updated_mean_ratings.tail(20)