In [5]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import spacy
from nltk import tokenize
from nltk.corpus import stopwords 
import re
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer

In [6]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [7]:
new_reviews = pd.read_csv('../data/cleaned_combined_data.csv',na_filter=False)

In [8]:
#iphone_xs = new_reviews[new_reviews['asin'] == 'B07RT1X4FJ']

In [9]:
#iphone_xs['url'][63202]

In [10]:
new_words = {
    'new': 3.0
}

analyser.lexicon.update(new_words)

In [11]:
stop_words = stopwords.words('english')

In [12]:
len(stop_words)

179

In [13]:
negation_words = ['ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', 
"hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 
'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 
'wouldn', "wouldn't","not","no",'don',"don't"]

for word in negation_words:
    stop_words.remove(word)

len(stop_words)

139

In [340]:
def sentences_with_keywords (reviews):
    list_of_keywords = ['camera','screen','battery','simcard','touchscreen','fingerprint','fingerprints',
                        'ringtones','charger']
    summary = set()
    texts = tokenize.sent_tokenize(reviews)
    for sentence in texts:
        sentence = sentence.lower()
        for word in list_of_keywords:
            if word in sentence:
                summary.add(sentence)
                
    return list(summary)

In [14]:
def summarise_reviews (reviews):
    list_of_keywords = ['camera','screen','battery','simcard','touchscreen','fingerprint','fingerprints',
                        'ringtones','charger']
    summary = set()
    texts = tokenize.sent_tokenize(reviews)
    for sentence in texts:
        sentence = sentence.lower()
        for word in list_of_keywords:
            if word in sentence:
                # Remove HTML.
                post_text = BeautifulSoup(sentence).get_text()

                # Remove non-letters.
                letters_only = ' '.join(re.findall(r"[A-z’]+",post_text))

                # Convert to lower case, split into individual words.
                words = letters_only.lower().split()

                #convert the stopwords to a set.
                stops = set(stop_words)

                # Remove stopwords.
                meaningful_words = [w for w in words if w not in stops]

                # Stemming 
                #p_stemmer = PorterStemmer()
                #meaningful_words = [p_stemmer.stem(w) for w in meaningful_words]

                #Lemmatize
                lemmatizer = WordNetLemmatizer()
                meaningful_words = [lemmatizer.lemmatize(word) for word in meaningful_words]

                cleaned_sentence = (" ".join(meaningful_words))
                
                summary.add(cleaned_sentence)
                
    return list(summary)

In [15]:
def features_and_sentiments (summarised_reviews):
    list_of_keywords = ['camera','screen','battery','simcard','touchscreen','fingerprint','fingerprints',
                        'ringtones','charger']
    summary = set()
    
    for cleaned_sentence in summarised_reviews:
        
        for word in list_of_keywords:
            if word in cleaned_sentence:
                score = analyser.polarity_scores(cleaned_sentence)
                compound = score['compound']

                if compound >= 0.075:
                    sentiment_score = 5
                elif compound >= 0.05:
                    sentiment_score = 4
                elif compound <= -0.075:
                    sentiment_score = 1
                elif compound <= -0.05:
                    sentiment_score = 2
                else:
                    sentiment_score = 3

                summary.add((sentiment_score,word))
    return list(summary)

In [16]:
new_reviews['summary'] = new_reviews['reviews'].apply(summarise_reviews)

In [341]:
new_reviews['sentences_with_keywords'] = new_reviews['reviews'].apply(sentences_with_keywords)

In [17]:
new_reviews['features_and_sentiments'] = new_reviews['summary'].apply(features_and_sentiments)

In [18]:
pd.set_option('display.max_colwidth',None)
new_reviews[['reviews','summary']].sample(2)

Unnamed: 0,reviews,summary
20059,Five Stars Perfect!,[]
30258,It provides great coverage! Long lasting BATTERY! Big Screen GREAT phone all the Way,"[big screen great phone way, long lasting battery]"


In [19]:
new_reviews["filter summary"] = new_reviews['summary'].apply(lambda x: x != [])

In [20]:
new_reviews = new_reviews[new_reviews["filter summary"] == True]

In [21]:
new_reviews.shape

(22041, 24)

In [146]:
#new_reviews.to_csv('../data/cleaned_combined_data_with_keywords.csv',index=False)

In [342]:
iphone_xs = new_reviews[new_reviews['asin'] == 'B07RT1X4FJ']

In [319]:
pd.set_option('display.max_colwidth',None)
iphone_xs['reviews']

63202                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           Honestly, it was worth it I was very hesitant about buying an iPhone off of Amazon, but I did it and was not disappointed. It came with 100% battery life, so it basical

In [317]:
iphone_xs['summary'][63325]

['expected much spent new screen found phone warped',
 'disappointment phone screen pretty rough shape got']

In [318]:
pd.set_option('display.max_colwidth',None)
iphone_xs['features_and_sentiments']

63202    [(3, charger), (5, battery), (5, fingerprint), (5, camera)]
63205                                                 [(5, battery)]
63209                                                 [(3, charger)]
63210                                                  [(1, screen)]
63212                                                 [(1, battery)]
63213                                                 [(3, battery)]
63214                                                 [(5, charger)]
63216                                                 [(1, battery)]
63217                   [(1, fingerprint), (5, screen), (5, camera)]
63218                                     [(5, screen), (1, screen)]
63219                                                 [(1, battery)]
63221                                                 [(5, battery)]
63222                                                 [(1, battery)]
63225                                                 [(5, battery)]
63226                             

In [22]:
iphone_xs.loc[63202,'features_and_sentiments']

[(5, 'battery'), (5, 'camera'), (3, 'charger'), (5, 'fingerprint')]

## Mean ratings by features of each unique product

In [23]:
new_reviews.reset_index(inplace=True,drop=True)

In [24]:
unique_asins = new_reviews['asin'].unique()

In [25]:
new_reviews.loc[1,'features_and_sentiments']

[(3, 'ringtones')]

In [26]:
all_products = {}

#for cell in new_review['features_and_sentiments']: 
all_features=set()

for product in unique_asins:
    all_products[product] = {'camera':[],'battery':[],'fingerprint':[],'screen':[],'charger':[]}
    for idx in new_reviews.index:
        if new_reviews.loc[idx,'asin'] == product:
            for feature in new_reviews.loc[idx,'features_and_sentiments']:
                all_features.add(feature[1])
                if feature[1] =='battery':
                    all_products[product]['battery'].append(feature[0])
                elif feature[1]  == 'camera':
                    all_products[product]['camera'].append(feature[0])
                elif feature[1]  == 'charger':
                    all_products[product]['charger'].append(feature[0])
                elif feature[1] == 'screen':
                    all_products[product]['screen'].append(feature[0])
                elif feature[1] == 'fingerprint':
                    all_products[product]['fingerprint'].append(feature[0])
        

In [27]:
for key_1,value_1 in all_products.items():
    for key_2,value_2 in all_products[key_1].items():
        try:
            all_products[key_1][key_2] = round(np.mean(all_products[key_1][key_2]),1)
        except:
            all_products[key_1][key_2] = np.nan

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [28]:
mean_ratings = pd.DataFrame(all_products).T

In [29]:
mean_ratings.reset_index(inplace=True)
mean_ratings

Unnamed: 0,index,camera,battery,fingerprint,screen,charger
0,B0000SX2UC,3.0,3.6,,3.8,5.0
1,B000SKTZ0S,3.0,3.4,,5.0,3.0
2,B001AO4OUC,3.8,2.0,,2.3,3.0
3,B001DCJAJG,,5.0,,,5.0
4,B001GQ3DJM,,3.0,,5.0,3.0
...,...,...,...,...,...,...
626,B07Z8BL2VW,4.0,3.8,1.0,5.0,4.3
627,B07ZDJCL76,4.3,,,3.0,3.0
628,B07ZHPCJW3,3.0,,3.0,5.0,
629,B07ZQSGP53,5.0,,,,


In [30]:
mean_ratings.rename(columns={'index':'asin'},inplace=True)

In [31]:
updated_mean_ratings = pd.merge(mean_ratings,new_reviews[['asin','item_title']],on='asin',how='inner')
updated_mean_ratings.drop_duplicates(subset=['asin'],keep='first',inplace=True)

In [32]:
updated_mean_ratings.reset_index(inplace=True,drop=True)

In [33]:
updated_mean_ratings.tail(20)

Unnamed: 0,asin,camera,battery,fingerprint,screen,charger,item_title
611,B07Y8Q17N9,,,3.0,,,"Xiaomi Mi 9 Lite (64GB, 6GB RAM) 6.39"" Display, Dual SIM GSM Factory Unlocked - US & Global 4G LTE International Version (Onyx Grey, 64 GB)"
612,B07Y8XK9GC,4.4,4.4,4.0,4.7,4.2,"Xiaomi Redmi Note 8 Pro (64GB, 6GB) 6.53"", 64MP Quad Camera, Helio G90T Gaming Processor, Dual SIM GSM Unlocked - US & Global 4G LTE International Version (Pearl White, 64 GB)"
613,B07Y8YWTFL,4.4,4.4,4.0,4.7,4.2,"Xiaomi Redmi Note 8 Pro 64GB, 6GB RAM 6.53"" LTE GSM 64MP Factory Unlocked Smartphone - Global Model (Mineral Grey)"
614,B07YBVZ2PW,5.0,,5.0,4.0,,"ASUS ROG Phone 2 (ZS660KL) Smartphone 128GB ROM 8GB RAM Snapdragon 855 Plus 6000 mAh NFC Android 9.0 - GSM Only International Version, No Warranty (Black)"
615,B07YF4PB4R,,,,1.0,,"Xiaomi Mi 9 Lite (128GB, 6GB RAM) 6.39"" Display, Dual SIM GSM Factory Unlocked - US & Global 4G LTE International Version (Aurora Blue, 128 GB)"
616,B07YJHXZT6,3.9,3.8,,3.7,3.0,"Xiaomi Redmi Note 8 64GB + 4GB RAM, 6.3"" LTE 48MP Factory Unlocked GSM Smartphone - International Version (Moonlight White)"
617,B07YMNLXL3,4.0,3.3,3.0,3.8,2.8,Google Pixel 4 - Just Black - 64GB - Unlocked
618,B07YQ58NPF,4.4,4.4,2.3,3.4,3.0,"Xiaomi Redmi Note 8 Pro 128GB, 6GB RAM 6.53"" LTE GSM 64MP Smartphone - Global Model (Mineral Grey)"
619,B07YVGVQKL,,,2.0,5.0,,"Samsung Galaxy A50 SM-A505G 128GB, Dual Sim, 6.4"" Infinity-U Display, Triple Camera, 4GB RAM, GSM Unlocked International Model, No Warranty (White)"
620,B07YVH6QRT,,5.0,,,3.0,"Samsung Original Galaxy Watch Active2 w/; auto Workout Tracking, and pace Coaching Enhanced Sleep Tracking Analysis Stainless Steel CASE and Leather Band (International Model) (Black, 44mm) No LTE"


## With BERT

In [25]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [26]:
new_reviews.head(2)

Unnamed: 0,asin,name,rating,date,verified,review_title,body,helpfulVotes,brand,item_title,...,price,originalPrice,sellers,reviews,cleaned_reviews,pos_neg_reviews,tokens,summary,features_and_sentiments,filter summary
0,B0000SX2UC,Janet,3,"October 11, 2005",False,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0,,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,...,0.0,0.0,Flip n Smart Phones,"Def not best, but not worst I had the Samsung ...",def best worst samsung awhile absolute doo doo...,0,"['def', 'best', 'worst', 'samsung', 'awhile', ...","[nice bright large screen, ring tone loud enou...","[(5, screen), (5, battery)]",True
2,B0000SX2UC,Brooke,5,"December 30, 2003",False,Love This Phone,"This is a great, reliable phone. I also purcha...",5.0,,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,...,0.0,0.0,Flip n Smart Phones,"Love This Phone This is a great, reliable phon...",love great reliable also purchased samsung die...,1,"['love', 'great', 'reliable', 'also', 'purchas...",[however ringtones not available online downlo...,"[(3, ringtones)]",True


In [27]:
new_reviews.shape

(22041, 24)

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
new_reviews['rating'].value_counts(normalize=True)

5    0.468854
1    0.185291
4    0.164738
3    0.096502
2    0.084615
Name: rating, dtype: float64

In [52]:
len(new_reviews['rating'].unique())

5

In [38]:
X_train, X_val, y_train, y_val = train_test_split(new_reviews.index.values,
                                                  new_reviews.rating.values,
                                                  test_size = 0.25,
                                                  random_state= 42,
                                                  stratify=new_reviews.rating.values)

In [120]:
y_train.shape

(16530,)

In [121]:
y_val.shape

(5511,)

In [39]:
new_reviews['data_type'] = ['not_set']*new_reviews.shape[0]

In [40]:
new_reviews.loc[X_train, 'data_type'] = 'train'
new_reviews.loc[X_val, 'data_type'] = 'val'

In [42]:
new_reviews.head(2)

Unnamed: 0,asin,name,rating,date,verified,review_title,body,helpfulVotes,brand,item_title,...,originalPrice,sellers,reviews,cleaned_reviews,pos_neg_reviews,tokens,summary,features_and_sentiments,filter summary,data_type
0,B0000SX2UC,Janet,3,"October 11, 2005",False,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0,,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,...,0.0,Flip n Smart Phones,"Def not best, but not worst I had the Samsung ...",def best worst samsung awhile absolute doo doo...,0,"['def', 'best', 'worst', 'samsung', 'awhile', ...","[nice bright large screen, ring tone loud enou...","[(5, screen), (5, battery)]",True,val
2,B0000SX2UC,Brooke,5,"December 30, 2003",False,Love This Phone,"This is a great, reliable phone. I also purcha...",5.0,,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,...,0.0,Flip n Smart Phones,"Love This Phone This is a great, reliable phon...",love great reliable also purchased samsung die...,1,"['love', 'great', 'reliable', 'also', 'purchas...",[however ringtones not available online downlo...,"[(3, ringtones)]",True,train


In [43]:
new_reviews.groupby(['rating', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,asin,name,date,verified,review_title,body,helpfulVotes,brand,item_title,url,...,price,originalPrice,sellers,reviews,cleaned_reviews,pos_neg_reviews,tokens,summary,features_and_sentiments,filter summary
rating,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,train,3063,3063,3063,3063,3063,3063,3063,3063,3063,3063,...,3063,3063,3063,3063,3063,3063,3063,3063,3063,3063
1,val,1021,1021,1021,1021,1021,1021,1021,1021,1021,1021,...,1021,1021,1021,1021,1021,1021,1021,1021,1021,1021
2,train,1399,1399,1399,1399,1399,1399,1399,1399,1399,1399,...,1399,1399,1399,1399,1399,1399,1399,1399,1399,1399
2,val,466,466,466,466,466,466,466,466,466,466,...,466,466,466,466,466,466,466,466,466,466
3,train,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595,...,1595,1595,1595,1595,1595,1595,1595,1595,1595,1595
3,val,532,532,532,532,532,532,532,532,532,532,...,532,532,532,532,532,532,532,532,532,532
4,train,2723,2723,2723,2723,2723,2723,2723,2723,2723,2723,...,2723,2723,2723,2723,2723,2723,2723,2723,2723,2723
4,val,908,908,908,908,908,908,908,908,908,908,...,908,908,908,908,908,908,908,908,908,908
5,train,7750,7750,7750,7750,7750,7750,7750,7750,7750,7750,...,7750,7750,7750,7750,7750,7750,7750,7750,7750,7750
5,val,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,...,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584


## Loading Tokenizer and

In [2]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [36]:
new_reviews.columns

Index(['asin', 'name', 'rating', 'date', 'verified', 'review_title', 'body',
       'helpfulVotes', 'brand', 'item_title', 'url', 'image', 'reviewUrl',
       'totalReviews', 'price', 'originalPrice', 'sellers', 'reviews',
       'cleaned_reviews', 'pos_neg_reviews', 'tokens', 'summary',
       'features_and_sentiments', 'filter summary'],
      dtype='object')

In [46]:
encoded_data_train = tokenizer.batch_encode_plus(
    new_reviews[new_reviews.data_type=='train'].reviews.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    new_reviews[new_reviews.data_type=='val'].reviews.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(new_reviews[new_reviews.data_type=='train'].rating.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val= torch.tensor(new_reviews[new_reviews.data_type=='val'].rating.values)

In [128]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train-1)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val-1)

In [129]:
len(dataset_train)

16530

In [130]:
len(dataset_val)

5511

## Setting up BERT Pretrained Model

In [131]:
from transformers import BertForSequenceClassification

In [132]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=5,
                                                      output_attentions=False,
                                                      output_hidden_states=False)


## Creating Data Loaders

In [133]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [134]:
batch_size = 32

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [135]:
dataloader_train 

<torch.utils.data.dataloader.DataLoader at 0x7fe129585dd0>

In [136]:
16530/32

516.5625

## Setting Up Optimiser and Scheduler

In [137]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [138]:
optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)

In [139]:
epochs = 3

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

## Defining our Performance Metrics

In [165]:
def accuracy_per_class(preds, labels):
    #label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    correct_pred = 0
    total_count = 0
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')
        
        correct_pred = correct_pred + len(y_preds[y_preds==label])
        total_count = total_count + len(y_true)
        
    print(f'Total Accuracy:{correct_pred/total_count}' )

## Creating Training Loop

In [141]:
import random

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [142]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cpu


In [152]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(dataloader_val):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [148]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=5,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [167]:
model.load_state_dict(torch.load('../data/finetuned_BERT_epoch_2.model', map_location=torch.device('cpu')))

<All keys matched successfully>

In [230]:
_, predictions, true_vals = evaluate(dataloader_validation)

HBox(children=(FloatProgress(value=0.0, max=173.0), HTML(value='')))




In [231]:
accuracy_per_class(predictions, true_vals)

Class: 0
Accuracy: 840/1021

Class: 1
Accuracy: 123/466

Class: 2
Accuracy: 193/532

Class: 3
Accuracy: 434/908

Class: 4
Accuracy: 2268/2584

Total Accuracy:0.7000544365813827


## Loading Tokenizer and Encoding Data by Sentences

In [178]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [345]:
def bert_sentiments (summarised_reviews):
    list_of_keywords = ['camera','screen','battery','simcard','touchscreen','fingerprint','fingerprints',
                        'ringtones','charger']
    
    summary = set()
    
    encoded_data_features = tokenizer.batch_encode_plus(
    summarised_reviews, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

    input_ids_features = encoded_data_features['input_ids']
    attention_masks_features = encoded_data_features['attention_mask']
    #labels_features = torch.tensor(df[df.data_type=='val'].label.values)

    dataset_features = TensorDataset(input_ids_features, attention_masks_features)

    dataloader_features = DataLoader(dataset_features , 
                                       sampler=SequentialSampler(dataset_features ), 
                                       batch_size=batch_size)

    
    model.eval()
    
    predictions = []

    for batch in dataloader_features:

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                     }

    with torch.no_grad():        
        outputs = model(**inputs)
        
        
    
    predict_v2 = torch.argmax(outputs[0],dim=1)

    
    rating_score = predict_v2 + 1
    
    for i,cleaned_sentence in enumerate(summarised_reviews):        
        for word in list_of_keywords:
            if word in cleaned_sentence:
                summary.add((float(rating_score[i]),word))
                
    
    return summary



features_and_sentiments (new_reviews.sentences_with_keywords.values[0])   

{(5.0, 'battery'), (5.0, 'screen')}

In [346]:
iphone_xs['bert_analysis'] = iphone_xs['sentences_with_keywords'].apply(bert_sentiments)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [356]:
iphone_xs['reviews'][5:10]

63213                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Charging cable isn’t original! Firstly, this iphone is really like new. Battery life %100. But i don’t believe the charging cable is original. Cuz as seen in the photos, The material is really poor quality, the cable shape is distorted with a slight bending. As is known, it is a factor that affects chargin

In [353]:
iphone_xs['sentences_with_keywords'][5:10]

63213                                                                                                                                                                                                                                                                                                                                         [battery life %100.]
63214                                                                                                                                                                                                                                                                                                                        [and yes i tried different chargers.]
63216                                                                                                                                                                                                                                                                             [it has so many 

In [355]:
iphone_xs['features_and_sentiments'][5:10] #vader

63213                                  [(3, battery)]
63214                                  [(5, charger)]
63216                                  [(1, battery)]
63217    [(1, fingerprint), (5, screen), (5, camera)]
63218                      [(5, screen), (1, screen)]
Name: features_and_sentiments, dtype: object

In [354]:
iphone_xs['bert_analysis'][5:10]

63213                                      {(5.0, battery)}
63214                                      {(5.0, charger)}
63216                                      {(2.0, battery)}
63217    {(3.0, fingerprint), (4.0, screen), (5.0, camera)}
63218                        {(3.0, screen), (4.0, screen)}
Name: bert_analysis, dtype: object