In [25]:
import glob
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer

In [16]:
# Test sentence
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
input_sentence = "Hello my name is Jin"
encoded = tokenizer(input_sentence)
print('Input sentence: ', input_sentence, '\n')
print('Encoded: ', encoded, '\n')
print('Decoded: ', tokenizer.decode(encoded['input_ids']), '\n')


Input sentence:  Hello my name is Jin 

Encoded:  {'input_ids': [101, 7592, 2026, 2171, 2003, 9743, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]} 

Decoded:  [CLS] hello my name is jin [SEP] 



In [36]:
# Get all review files
files = glob.glob(r'./Reviews/*[0-9].csv')
print(files)

['./Reviews/product028.csv', './Reviews/product014.csv', './Reviews/product001.csv', './Reviews/product015.csv', './Reviews/product029.csv', './Reviews/product003.csv', './Reviews/product017.csv', './Reviews/product016.csv', './Reviews/product002.csv', './Reviews/product006.csv', './Reviews/product012.csv', './Reviews/product013.csv', './Reviews/product007.csv', './Reviews/product011.csv', './Reviews/product005.csv', './Reviews/product004.csv', './Reviews/product010.csv', './Reviews/product009.csv', './Reviews/product021.csv', './Reviews/product020.csv', './Reviews/product008.csv', './Reviews/product022.csv', './Reviews/product023.csv', './Reviews/product027.csv', './Reviews/product026.csv', './Reviews/product030.csv', './Reviews/product024.csv', './Reviews/product018.csv', './Reviews/product019.csv', './Reviews/product025.csv']


In [42]:
# Concat all review data from different products into one big dataframe
df_list = []

for file in files:
    df = pd.read_csv(file)
    df_list.append(df)
    
df = pd.concat(df_list, axis=0, ignore_index=True)
df

Unnamed: 0,comment,stars,verified,date,country,helpful,has-media
0,The noise cancellation is awesome and works we...,5,1,2017-9-13,1,2,0
1,After seeing all the high reviews for these he...,1,1,2018-12-21,1,1,1
2,Absolutely love these. I have no issues. Gener...,5,1,2020-8-19,1,1,0
3,I have tried many other brands of headphones a...,5,1,2017-10-24,1,2,1
4,The right swivel axis broke AGAIN after 6 mont...,1,1,2018-1-28,1,1,0
...,...,...,...,...,...,...,...
120384,Love this device. So helpful,5,1,2020-1-15,1,0,0
120385,I love everything about it very nice easy to c...,5,0,2020-2-17,1,0,0
120386,Love this machine. Now calibrated as my home g...,5,1,2020-8-19,1,0,0
120387,"Love my echo 8, have in kitchen, easy to get r...",5,1,2020-2-5,1,0,0


In [49]:
# Check for missing values
df.isnull().any()

comment       True
stars        False
verified     False
date         False
country      False
helpful      False
has-media    False
dtype: bool

In [57]:
# Find reviews with missing comments
missing_indices = df[df['comment'].isnull()].index.tolist()
print('Number of reviews missing comments: ', len(missing_indices))
print('Missing indices: ', missing_indices)

Number of reviews missing comments:  17
Missing indices:  [19697, 25781, 43301, 52167, 67005, 67045, 73724, 76041, 79221, 79929, 83309, 92274, 93990, 94006, 111184, 120116, 120346]


In [58]:
# See what an example review with missing comment looks like
print(df.iloc[120116])

comment            NaN
stars                5
verified             1
date         2020-3-13
country              1
helpful              1
has-media            0
Name: 120116, dtype: object


In [66]:
print('Max comment length (of all products): ', int(df.comment.str.len().max()))

Max comment length (of all products):  5127


In [67]:
# Get only the comments and star (labels) data
comment_df = df[['comment', 'stars']]
comment_df

Unnamed: 0,comment,stars
0,The noise cancellation is awesome and works we...,5
1,After seeing all the high reviews for these he...,1
2,Absolutely love these. I have no issues. Gener...,5
3,I have tried many other brands of headphones a...,5
4,The right swivel axis broke AGAIN after 6 mont...,1
...,...,...
120384,Love this device. So helpful,5
120385,I love everything about it very nice easy to c...,5
120386,Love this machine. Now calibrated as my home g...,5
120387,"Love my echo 8, have in kitchen, easy to get r...",5


In [6]:
for idx, row in comment_df.iterrows():
    print(row['comment'] + '\n')

I could sit here and write all about the specs on this computer, but they are already in the description, and If you are like me... you don't really understand it anyways.So I am going to tell you what I LOVE about this computer and what I use it for. I am a full time college student as well as a single mother who stays busy. I have previously used a HP All In one computer that I bought brand new a year ago and I hate that thing... It is so slow!!! When I first opened this item, I was just hoping that it would be a little faster! What I got instead was an amazing computer that is faster than I could have ever imagined. Now I don't use this thing for much more than amazon reviews, school work, and papers. But this is exactly what I needed.

A very reasonably priced laptop for basic computing needs. The specs that stick out to me for describing this as "basic needs" is 4GB of RAM, and 128GB M.2 SSD. Both are at the bare minimum in today's needs. Cell phones now come with those specs( hig


great computer, backlighting on the keyboard is nice touch. great screen & trackpad isn't garbage. its great for homework and a bit of Minecraft. only complaint is the tiny amount of memory. 60% of my memory is taken up, with nothing open. 4GB of ram is barely enough to have more than 2 tabs of chrome open at a time.

Perfect for home use - email, searches, work from home connection.  Works great for me

I really enjoy this laptop. It's been a week now and i've had a great experience so far. I had a mac before and I feel, for the price, this is a great laptop. Would recommend to anyone either using it for simple stuff or even for casual level development and game play.

Fast and beautiful display. Use addon hard drive via usb to store files and downloads.

was really happy. got it on sale for $300 faster than expected. light weight. illuminated keyboard. windows 10 has gotten out of hand with tracking but it is what it is. So far very happy with it.

Love the large screen and being ab


Work and play

Planning to buy another one good choice, fast and quiet

I use this for gaming.

I work overseas as a contractor so I use this computer for a little bit of everything from work to entertainment. I spent a little extra to double the RAM and add an internal HDD but even before I did that, it handled everything I needed it to. Definitely a great computer!

G G or what I paid great deal

I liked the battery life and the 3D Viewer app. Good processor for computer enthusiasts and Windows S interface is pleasing. Had to return because my school email was not compatible w/ the OS and a little heavy for school. Overall 5/5 computer for schoolwork though.

Initial setup caused internet access to go offline.  Learned online that this is a problem because of router generation

This laptop came very well packaged and secure with not a scratch on it. It took about 15 minutes to get it setup and running and I can say I’m very impressed at how fast this laptop runs. Definitely can play


I researched this thing thoroughly before buying due to rave reviews (*FAKE* I'm sure).When it arrived, I disabled 10S immediately so I could avoid all the problems I'd read about. Countless problems began immediately.

Good product

Love my new laptop. It’s sleek and fast. Comes with Windows 10 and the price was great.

Disclaimer: I'm NOT a techie.After several months of using my work laptop for personal use (don't do what I do) and several years of using my MacBook Pro, which was getting incredibly slow and glitchy/buggy (RIP - I used the crap out of that thing), I decided to "downgrade" to something a lot simpler as a post-graduate. My Apple MacBook served me well during my grad school years (and before) and didn't want to drop another rack or two on another MacBook.

Out of all the laptops and computers I've owned this is the slowest and offers the least.

This is a great laptop, fast, and exceptional picture.

Good build quality

Met all expectations, great computer for the pric

In [20]:
class ReviewsDataset:
    def __init__(self, df, max_length=1024):
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.max_length = max_length 
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        # input=review, label=rating
        review = self.df.loc[idx, 'comment']
        rating = int(self.df.loc[idx, 'stars'])
        
        encoded = self.tokenizer(
            review,                      # review to encode
            max_length=self.max_length,  # Truncate all segments to max_length
            padding='max_length',        # pad all reviews with the [PAD] token to the max_length
            return_attention_mask=True,  # Construct attention masks.
            return_tensors='pt'
        )
        
        input_ids = encoded['input_ids']
        attn_mask = encoded['attention_mask']
        
        return {
            'input_ids': input_ids, 
            'attn_mask': attn_mask, 
            'rating': rating
        }

In [24]:
data_set = ReviewsDataset(df, 1024)
data_set[0]

{'input_ids': tensor([[ 101, 1045, 2071,  ...,    0,    0,    0]]),
 'attn_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]]),
 'rating': 5}