In [1]:
import glob
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer

In [2]:
# Test sentence
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
input_sentence = "Hello my name is Jin"
encoded = tokenizer(input_sentence)
print('Input sentence: ', input_sentence, '\n')
print('Encoded: ', encoded, '\n')
print('Decoded: ', tokenizer.decode(encoded['input_ids']), '\n')


Input sentence:  Hello my name is Jin 

Encoded:  {'input_ids': [101, 7592, 2026, 2171, 2003, 9743, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]} 

Decoded:  [CLS] hello my name is jin [SEP] 



In [3]:
# Get all review files
files = glob.glob(r'./Reviews/*[0-9].csv')
print(files)

['./Reviews/product028.csv', './Reviews/product014.csv', './Reviews/product001.csv', './Reviews/product015.csv', './Reviews/product029.csv', './Reviews/product003.csv', './Reviews/product017.csv', './Reviews/product016.csv', './Reviews/product002.csv', './Reviews/product006.csv', './Reviews/product012.csv', './Reviews/product013.csv', './Reviews/product007.csv', './Reviews/product011.csv', './Reviews/product005.csv', './Reviews/product004.csv', './Reviews/product010.csv', './Reviews/product009.csv', './Reviews/product021.csv', './Reviews/product020.csv', './Reviews/product008.csv', './Reviews/product022.csv', './Reviews/product023.csv', './Reviews/product027.csv', './Reviews/product026.csv', './Reviews/product030.csv', './Reviews/product024.csv', './Reviews/product018.csv', './Reviews/product019.csv', './Reviews/product025.csv']


In [4]:
# Concat all review data from different products into one big dataframe
df_list = []

for file in files:
    df = pd.read_csv(file)
    df_list.append(df)
    
df = pd.concat(df_list, axis=0, ignore_index=True)
df

Unnamed: 0,comment,stars,verified,date,country,helpful,has-media
0,The noise cancellation is awesome and works we...,5,1,2017-9-13,1,2,0
1,After seeing all the high reviews for these he...,1,1,2018-12-21,1,1,1
2,Absolutely love these. I have no issues. Gener...,5,1,2020-8-19,1,1,0
3,I have tried many other brands of headphones a...,5,1,2017-10-24,1,2,1
4,The right swivel axis broke AGAIN after 6 mont...,1,1,2018-1-28,1,1,0
...,...,...,...,...,...,...,...
120384,Love this device. So helpful,5,1,2020-1-15,1,0,0
120385,I love everything about it very nice easy to c...,5,0,2020-2-17,1,0,0
120386,Love this machine. Now calibrated as my home g...,5,1,2020-8-19,1,0,0
120387,"Love my echo 8, have in kitchen, easy to get r...",5,1,2020-2-5,1,0,0


In [5]:
# Check for missing values
df.isnull().any()

comment       True
stars        False
verified     False
date         False
country      False
helpful      False
has-media    False
dtype: bool

In [6]:
# Find reviews with missing comments
missing_indices = df[df['comment'].isnull()].index.tolist()
print('Number of reviews missing comments: ', len(missing_indices))
print('Missing indices: ', missing_indices)

Number of reviews missing comments:  17
Missing indices:  [19697, 25781, 43301, 52167, 67005, 67045, 73724, 76041, 79221, 79929, 83309, 92274, 93990, 94006, 111184, 120116, 120346]


In [7]:
# See what an example review with missing comment looks like
print(df.iloc[120116])

comment            NaN
stars                5
verified             1
date         2020-3-13
country              1
helpful              1
has-media            0
Name: 120116, dtype: object


In [8]:
print('Max comment length (of all products): ', int(df.comment.str.len().max()))

Max comment length (of all products):  5127


In [9]:
# Get only the comments and star (labels) data
comment_df = df[['comment', 'stars']]
comment_df

Unnamed: 0,comment,stars
0,The noise cancellation is awesome and works we...,5
1,After seeing all the high reviews for these he...,1
2,Absolutely love these. I have no issues. Gener...,5
3,I have tried many other brands of headphones a...,5
4,The right swivel axis broke AGAIN after 6 mont...,1
...,...,...
120384,Love this device. So helpful,5
120385,I love everything about it very nice easy to c...,5
120386,Love this machine. Now calibrated as my home g...,5
120387,"Love my echo 8, have in kitchen, easy to get r...",5


In [10]:
# print first 5 reviews
for idx, row in comment_df[:5].iterrows():
    print(row['comment'] + '\n')

The noise cancellation is awesome and works well with cancelling out noises such as: 55 gallon fish tank with tetra air pump and filters; Honeywell hepa air purifier 50250S on high mode; central a/c on; dogs snoring.I can still hear people talking if they are 10ft away having a full conversation but it does help some. (I tested it in the office) For the price it meets my personal needs. The sound quality works equal or better than my iPhone 7s connecting headset. Sync to Bluetooth was a breeze. Love the case and doesn't take long to charge. The headphone is comfortable. Quality seems well bit. I will post an update if it breaks. :)

After seeing all the high reviews for these headphones, I purchased them with the expectation that they were going to be amazing, I came to the realization that those reviews were not accurate. I’m of the belief that those high reviews were left after the initial use of the headphones.Construction quality - They feel flimsy at best. The earphones are comfor

In [11]:
class ReviewsDataset:
    def __init__(self, df, max_length=1024):
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.max_length = max_length 
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        # input=review, label=rating
        review = self.df.loc[idx, 'comment']
        rating = int(self.df.loc[idx, 'stars'])
        
        encoded = self.tokenizer(
            review,                      # review to encode
            max_length=self.max_length,  # Truncate all segments to max_length
            padding='max_length',        # pad all reviews with the [PAD] token to the max_length
            return_attention_mask=True,  # Construct attention masks.
            return_tensors='pt'
        )
        
        input_ids = encoded['input_ids']
        attn_mask = encoded['attention_mask']
        
        return {
            'input_ids': input_ids, 
            'attn_mask': attn_mask, 
            'rating': rating
        }

In [13]:
data_set = ReviewsDataset(df, 1024)
print(df.iloc[0]['comment'])
print(data_set[0])

The noise cancellation is awesome and works well with cancelling out noises such as: 55 gallon fish tank with tetra air pump and filters; Honeywell hepa air purifier 50250S on high mode; central a/c on; dogs snoring.I can still hear people talking if they are 10ft away having a full conversation but it does help some. (I tested it in the office) For the price it meets my personal needs. The sound quality works equal or better than my iPhone 7s connecting headset. Sync to Bluetooth was a breeze. Love the case and doesn't take long to charge. The headphone is comfortable. Quality seems well bit. I will post an update if it breaks. :)
{'input_ids': tensor([[ 101, 1996, 5005,  ...,    0,    0,    0]]), 'attn_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]]), 'rating': 5}
