# Import all libraries

In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import os
!pip install opendatasets -q
import opendatasets as od

# Download Data

In [15]:
# Fetch the Kaggle dataset only when no entry with its name exists in the working directory.
dataset_dir = 'amazon-product-reviews'
dataset_already_here = dataset_dir in os.listdir()
if not dataset_already_here:
    od.download('https://www.kaggle.com/datasets/irvifa/amazon-product-reviews')

# Load Data

In [31]:
# Read the ratings CSV, supplying the column labels via `names`, then preview the first rows.
ratings_columns = ['userId', 'productId', 'rating', 'timestamp']
df = pd.read_csv(
    'amazon-product-reviews/ratings_Electronics.csv',
    names=ratings_columns,
)
df.head()

Unnamed: 0,userId,productId,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


# Data Exploration

In [32]:
# Report the frame's dimensions and its column labels.
rows_and_cols = df.shape
print(rows_and_cols)  # ~8M records
column_labels = df.columns
print(column_labels)  # 4 columns

(7824482, 4)
Index(['userId', 'productId', 'rating', 'timestamp'], dtype='object')


### Take 20% of the whole dataset as a sample to ease computation

In [33]:
# Work out how many rows a 20% sample of the full frame corresponds to.
print("Number of records in 20% of the dataset should be:")
total_rows = len(df)
round(0.2 * total_rows, 0)

Number of records in 20% of the dataset should be:


1564896.0

In [34]:
# Draw a reproducible 20% random sample of the ratings.
# `frac` derives the row count from the data itself instead of a hard-coded number
# (which goes stale, or raises, if the input file changes), and `random_state`
# pins the draw so a Restart & Run All selects the same rows every time.
SAMPLE_FRACTION = 0.2
SAMPLE_SEED = 42
sample_df = df.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED, ignore_index=True)

# Delete the full frame to free up memory; only the sample is used from here on.
del df

In [37]:
# Preview the first rows of the sampled frame.
sample_df.head()

Unnamed: 0,userId,productId,rating,timestamp
0,A2O9TZ0UB9HO0E,B000069K98,1.0,1053907200
1,A2WBYB6DGU6W9O,B003UOIMBW,4.0,1354406400
2,A13WOT3RSXKRD5,B006RG0QC8,4.0,1333497600
3,A1P7XWKJYDMY65,B0002WTK4S,3.0,1205020800
4,AU056TNGEF3K0,B002UHADYY,4.0,1394928000


In [38]:
# The timestamp column is not needed, so remove it.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone does not raise KeyError.
sample_df = sample_df.drop(columns='timestamp', errors='ignore')
sample_df.head()

Unnamed: 0,userId,productId,rating
0,A2O9TZ0UB9HO0E,B000069K98,1.0
1,A2WBYB6DGU6W9O,B003UOIMBW,4.0
2,A13WOT3RSXKRD5,B006RG0QC8,4.0
3,A1P7XWKJYDMY65,B0002WTK4S,3.0
4,AU056TNGEF3K0,B002UHADYY,4.0


In [51]:
# Summary statistics for the rating column.
# Uses the vectorized Series reductions: the builtin min()/max() and statistics.mean()
# walk the Series one element at a time in Python, which is needlessly slow on a
# column of this size.
from statistics import mean  # no longer used in this cell; kept in case later cells rely on the name

ratings = sample_df['rating']
print("min rating - ", ratings.min())
print("average rating - ", round(ratings.mean(), 0))  # rounded to the nearest whole rating
print("max rating - ", ratings.max())

min rating -  1.0
average rating -  4.0
max rating -  5.0


In [54]:
# Count missing entries in each column (isna is the same check as isnull).
sample_df.isna().sum()

userId       0
productId    0
rating       0
dtype: int64

In [58]:
# Count the distinct users and distinct products present in the sample.
n_users = sample_df['userId'].nunique()
n_products = sample_df['productId'].nunique()
print("unique number of users: ", n_users)
print("unique number of products: ", n_products)

unique number of users:  1225512
unique number of products:  237391


In [59]:
# Preview the first rows of sample_df.
sample_df.head()

Unnamed: 0,userId,productId,rating
0,A2O9TZ0UB9HO0E,B000069K98,1.0
1,A2WBYB6DGU6W9O,B003UOIMBW,4.0
2,A13WOT3RSXKRD5,B006RG0QC8,4.0
3,A1P7XWKJYDMY65,B0002WTK4S,3.0
4,AU056TNGEF3K0,B002UHADYY,4.0


In [65]:
# Per-user count of rating rows (non-null productId values), largest counts first.
product_ids_by_user = sample_df.groupby('userId')['productId']
number_of_products_per_user = product_ids_by_user.count().sort_values(ascending=False)
number_of_products_per_user.head()

userId
ADLVFFE4VBT8      116
A5JLAU2ARJ0BO     105
A3OXHLG6DIBRW8     99
A680RUE1FDO8B      88
A6FIAB28IS79       81
Name: productId, dtype: int64