In [4]:
import glob
import pandas as pd
import urllib.request

!python3 --version

Python 3.6.9


In [5]:
# Grab the list of URLs.
url = "https://raw.githubusercontent.com/csbanon/bert-product-rating-predictor/master/data/file_names.txt"
file = urllib.request.urlopen(url)
lines = [line for line in file.readlines()]
file.close()

url_list = []

# Convert the data to strings.
for line in lines:
  curr_line = line.decode("utf-8")
  url_list.append(curr_line[0:len(curr_line) - 1])

df_list = []

# Read the data into a Pandas DataFrame.
for url in url_list:
  df = pd.read_csv(url)
  df_list.append(df)

# Concatenate all DataFrames.
df = pd.concat(df_list, axis=0, ignore_index=True)
df

Unnamed: 0,comment,stars,verified,date,country,helpful,has-media
0,I could sit here and write all about the specs...,5,0,2019-6-25,1,3,0
1,A very reasonably priced laptop for basic comp...,4,0,2019-7-2,1,1,1
2,"This is the best laptop deal you can get, full...",5,1,2019-8-3,1,1,1
3,A few months after the purchase....It is still...,5,1,2019-7-12,1,1,0
4,BUYER BE AWARE: This computer has Microsoft 10...,1,1,2019-8-7,1,6,0
...,...,...,...,...,...,...,...
195786,I have not tried this camera without the SD ca...,5,1,2019-12-29,1,0,0
195787,"Hello, I bought this item months ago and I tho...",1,1,2019-4-28,1,0,0
195788,This is an incredible camera for the money!! ...,5,1,2019-6-3,1,0,0
195789,Great cameras. Purchased some for my mother af...,5,1,2020-6-12,1,1,0


In [6]:
# Check for missing values.
df.isnull().any()

comment       True
stars        False
verified     False
date         False
country      False
helpful      False
has-media    False
dtype: bool

In [7]:
# Find reviews with missing comments.
missing_indices = df[df['comment'].isnull()].index.tolist()
print('Number of reviews missing comments: ', len(missing_indices))
print('Missing indices: ', missing_indices)

Number of reviews missing comments:  26
Missing indices:  [24835, 37237, 37277, 40072, 50852, 64895, 69760, 80854, 81562, 84103, 86420, 92670, 98539, 98769, 105389, 105405, 112139, 121481, 128082, 129823, 161838, 163097, 166147, 166321, 166371, 169934]


In [8]:
print('Max comment length (of all products): ', int(df.comment.str.len().max()))

Max comment length (of all products):  5127


In [9]:
# Get only the comments and star (labels) data.
df_comments_stars = df[['comment', 'stars']]
df_comments_stars.dropna(inplace=True)
df_comments_stars

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,comment,stars
0,I could sit here and write all about the specs...,5
1,A very reasonably priced laptop for basic comp...,4
2,"This is the best laptop deal you can get, full...",5
3,A few months after the purchase....It is still...,5
4,BUYER BE AWARE: This computer has Microsoft 10...,1
...,...,...
195786,I have not tried this camera without the SD ca...,5
195787,"Hello, I bought this item months ago and I tho...",1
195788,This is an incredible camera for the money!! ...,5
195789,Great cameras. Purchased some for my mother af...,5


In [10]:
# Save the large dataset to a CSV file.
df.to_csv('reviews.csv')

# Save the dataset with comments and stars.
df_comments_stars.to_csv('reviews_comments_stars.csv')