# #0 Table of Content
1. Import packages and data
2. Data correction

# #1 Import packages, data and creating functions

In [3]:
import pandas as pd
import uuid

In [4]:
product_table  = pd.read_csv("../asset/newProduct.csv")
review_table   = pd.read_csv("../asset/newReview.csv")
merchant_table = pd.read_csv("../asset/newMerchant.csv")

In [5]:
def prefix_fix(x):
    if 'k' in x or 'K' in x:
        return int(float(x[:-1]) * 1000)
        
    elif 'm' in x or 'M' in x:
        return int(float(x[:-1]) * 1000000)

    else:
        return int(x)

def product_totalSold_fix(x):
    if 'k' in x or 'K' in x:
        return int(float(x[:-1]) * 1000)
        
    elif 'm' in x or 'M' in x:
        return int(float(x[:-1]) * 1000000)

    else:
        return int(x)

def product_price_fix(x):
    if '-' in x:
        x = x.split(' - ')[0]

    x = x.strip()[1:]
    
    return float(x)

def product_qtyAvail_fix(x):
    x = x.split(' ')[0]
    return int(x)

def product_favCount_fix(x):
    x = x.split(' ')[-1]
    x = x[1:-1]
    return prefix_fix(x)

def review_location_fix(x):
    if '-' not in x:
        return x

def review_date_fix(x):
    if '-' in x:
        return pd.to_datetime(x)
    else:
        return None

def merchant_responseRatePercent_fix(x):
    return float(x[:-1])

def merchant_days_fix(x):
    num  = x.split(' ')[0]
    unit = x.split(' ')[1]

    match unit:
        case "years":
            return int(num) * 365
        case "months":
            return int(num) * 30
        case "weeks":
            return int(num) * 7
        case "days":
            return int(num)
        case _:
            return None


# #2 Data Correction

## #2.1 Product Table

In [4]:
# PRODUCT
print(product_table.shape)
display(product_table.head(3))
print(product_table.info())

(38, 13)


Unnamed: 0,product_id,merchant_id,name,preferred,mall,avg_rating,total_rating,total_sold,price,qty_avail,fav_count,description,img_src
0,912d838b-5e84-4aef-a0dd-bb23f44e5913,983b8576-0dd8-4c84-acdf-0915734adbd0,SPRISE Premium Wireless Bluetooth Earphone Col...,1,0,4.6,256,840,$17.39 - $18.78,459 pieces available,Favorite (1.1k),Hey~ Welcome to SPRISE Official Store! Please ...,https://down-sg.img.susercontent.com/file/my-1...
1,34bb8bfe-e134-41af-a487-3b8d4ce6fff0,6e3b6c48-c873-44ac-b99a-845f35a30cf7,Rock Space [SG] O2 Wireless Headphone Bluetoot...,1,0,4.9,369,955,$35.80,164 pieces available,Favorite (1.8k),SKU: 2856 Name: Rock Space 02 Wireless Headpho...,https://down-sg.img.susercontent.com/file/sg-1...
2,35ff0f3c-8a06-4e33-a789-319ff793220d,90f139bf-4bb4-4f5c-8511-f99ede1d71b2,Edifier W820NB/W820NB PLUS Wireless Headphone ...,1,0,4.8,88,259,$79.99 - $99.00,329 pieces available,Favorite (466),EDIFIER W820NB PLUS NOISE CANCELING ACTIVE NOI...,https://down-sg.img.susercontent.com/file/cn-1...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_id    38 non-null     object 
 1   merchant_id   38 non-null     object 
 2   name          38 non-null     object 
 3   preferred     38 non-null     int64  
 4   mall          38 non-null     int64  
 5   avg_rating    38 non-null     float64
 6   total_rating  38 non-null     object 
 7   total_sold    38 non-null     object 
 8   price         38 non-null     object 
 9   qty_avail     38 non-null     object 
 10  fav_count     38 non-null     object 
 11  description   37 non-null     object 
 12  img_src       9 non-null      object 
dtypes: float64(1), int64(2), object(10)
memory usage: 4.0+ KB
None


In [14]:
if 'preferred' in product_table.columns:
    product_table = product_table.drop('preferred', axis=1)

if 'mall' in product_table.columns:
    product_table = product_table.drop('mall', axis=1)

if product_table['total_rating'].dtype != 'int64':
    product_table['total_rating'] = product_table['total_rating'].apply(prefix_fix)

if product_table['total_sold'].dtype != 'int64':
    product_table['total_sold'] = product_table['total_sold'].apply(prefix_fix)

if product_table['price'].dtype != 'float64':
    product_table['price'] = product_table['price'].apply(product_price_fix)

if product_table['qty_avail'].dtype != 'int64':
    product_table['qty_avail'] = product_table['qty_avail'].fillna('0')
    product_table['qty_avail'] = product_table['qty_avail'].apply(product_qtyAvail_fix)

if product_table['fav_count'].dtype != 'int64':
    product_table['fav_count'] = product_table['fav_count'].apply(product_favCount_fix)

print('total_rating : ', product_table['total_rating'].dtype)
print('total_sold    :', product_table['total_sold'].dtype)
print('price         :', product_table['price'].dtype)
print('qty_avail     :', product_table['qty_avail'].dtype)
print('fav_count     :', product_table['fav_count'].dtype)
display(product_table[['total_sold','total_rating','price','qty_avail','fav_count']].head(3))

display
print(product_table.info())

total_rating :  int64
total_sold    : int64
price         : float64
qty_avail     : int64
fav_count     : int64


Unnamed: 0,total_sold,total_rating,price,qty_avail,fav_count
0,840,256,17.39,459,1100
1,955,369,35.8,164,1800
2,259,88,79.99,329,466


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_id    38 non-null     object 
 1   merchant_id   38 non-null     object 
 2   name          38 non-null     object 
 3   avg_rating    38 non-null     float64
 4   total_rating  38 non-null     int64  
 5   total_sold    38 non-null     int64  
 6   price         38 non-null     float64
 7   qty_avail     38 non-null     int64  
 8   fav_count     38 non-null     int64  
 9   description   37 non-null     object 
 10  img_src       9 non-null      object 
dtypes: float64(2), int64(4), object(5)
memory usage: 3.4+ KB
None


## #2.2 Review Table

In [6]:
# REVIEW
print(review_table.shape)
display(review_table.head(3))
print(review_table.info())

(11478, 7)


Unnamed: 0,review_id,username,merchant_id,product_id,date,rating,content
0,6ef347c0-c603-422a-b8d1-c4b96bed0207,i*****b,983b8576-0dd8-4c84-acdf-0915734adbd0,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-09-02 01:13,5,Best buy ever\nit looks great works great\nthe...
1,af3ccbed-1865-4492-88e3-723e9dda0de9,jessylim70,983b8576-0dd8-4c84-acdf-0915734adbd0,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-07-21 23:33,5,Item received in good condition.\nBought durin...
2,d17ebe72-919d-4c2a-a230-88119aac725c,s*****b,983b8576-0dd8-4c84-acdf-0915734adbd0,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-05-08 19:37,5,Value For Money: yes\nBest Feature(s): comfort...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11478 entries, 0 to 11477
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_id    11478 non-null  object
 1   username     11478 non-null  object
 2   merchant_id  11478 non-null  object
 3   product_id   11478 non-null  object
 4   date         11478 non-null  object
 5   rating       11478 non-null  int64 
 6   content      5140 non-null   object
dtypes: int64(1), object(6)
memory usage: 627.8+ KB
None


In [7]:
if 'merchant_id' in review_table.columns:
    review_table = review_table.drop('merchant_id', axis=1)

review_table['location'] = review_table['date'].apply(review_location_fix)
review_table['date'] = review_table['date'].apply(review_date_fix)

In [8]:
print(review_table.shape)
display(review_table.head(3))
print(review_table.info())

(11478, 7)


Unnamed: 0,review_id,username,product_id,date,rating,content,location
0,6ef347c0-c603-422a-b8d1-c4b96bed0207,i*****b,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-09-02 01:13:00,5,Best buy ever\nit looks great works great\nthe...,
1,af3ccbed-1865-4492-88e3-723e9dda0de9,jessylim70,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-07-21 23:33:00,5,Item received in good condition.\nBought durin...,
2,d17ebe72-919d-4c2a-a230-88119aac725c,s*****b,912d838b-5e84-4aef-a0dd-bb23f44e5913,2023-05-08 19:37:00,5,Value For Money: yes\nBest Feature(s): comfort...,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11478 entries, 0 to 11477
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   review_id   11478 non-null  object        
 1   username    11478 non-null  object        
 2   product_id  11478 non-null  object        
 3   date        8896 non-null   datetime64[ns]
 4   rating      11478 non-null  int64         
 5   content     5140 non-null   object        
 6   location    2582 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 627.8+ KB
None


## #2.3 Merchant Table

In [9]:
# MERCHANT
print(merchant_table.shape)
display(merchant_table.head(3))
print(merchant_table.info())

(38, 8)


Unnamed: 0,merchant_id,name,Ratings,response rate,joined,products,response time,follower
0,983b8576-0dd8-4c84-acdf-0915734adbd0,sprise_localstore.sg,809,96%,7 months ago,12,within hours,630
1,6e3b6c48-c873-44ac-b99a-845f35a30cf7,IN-BOX,24.2k,97%,8 years ago,366,within hours,11.5k
2,90f139bf-4bb4-4f5c-8511-f99ede1d71b2,Edifier Flagship Store,367,97%,7 months ago,24,within hours,2.5k


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   merchant_id    38 non-null     object
 1   name           38 non-null     object
 2   Ratings        38 non-null     object
 3   response rate  38 non-null     object
 4   joined         38 non-null     object
 5   products       38 non-null     object
 6   response time  38 non-null     object
 7   follower       38 non-null     object
dtypes: object(8)
memory usage: 2.5+ KB
None


In [10]:
rename_dict = {
    'Ratings'       : 'total_rating',
    'response rate' : 'response_rate_percent',
    'joined'        : 'days',
    'products'      : 'no_products',
    'responses time': 'response_speed',
    'follower'      : 'no_follower'
}

merchant_table = merchant_table.rename(columns=rename_dict)

In [11]:
if merchant_table['total_rating'].dtype != 'int64':
    merchant_table['total_rating'] = merchant_table['total_rating'].apply(prefix_fix)

if merchant_table['response_rate_percent'].dtype != 'float64':
    merchant_table['response_rate_percent'] = merchant_table['response_rate_percent'].apply(merchant_responseRatePercent_fix)

if merchant_table['days'].dtype != 'int64':
    merchant_table['days'] = merchant_table['days'].apply(merchant_days_fix)

if merchant_table['no_products'].dtype != 'int64':
    merchant_table['no_products'] = merchant_table['no_products'].apply(prefix_fix)

if merchant_table['no_follower'].dtype != 'int64':
    merchant_table['no_follower'] = merchant_table['no_follower'].apply(prefix_fix)

print('total_rating          : ', merchant_table['total_rating'].dtype)
print('response_rate_percent : ', merchant_table['response_rate_percent'].dtype)
print('days                  : ', merchant_table['days'].dtype)
print('no_products           : ', merchant_table['no_products'].dtype)
print('no_follower           : ', merchant_table['no_follower'].dtype)
display(merchant_table[['total_rating','response_rate_percent','days','no_products','no_follower']].head(3))
print(merchant_table.info())

total_rating          :  int64
response_rate_percent :  float64
days                  :  int64
no_products           :  int64
no_follower           :  int64


Unnamed: 0,total_rating,response_rate_percent,days,no_products,no_follower
0,809,96.0,210,12,630
1,24200,97.0,2920,366,11500
2,367,97.0,210,24,2500


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   merchant_id            38 non-null     object 
 1   name                   38 non-null     object 
 2   total_rating           38 non-null     int64  
 3   response_rate_percent  38 non-null     float64
 4   days                   38 non-null     int64  
 5   no_products            38 non-null     int64  
 6   response time          38 non-null     object 
 7   no_follower            38 non-null     int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 2.5+ KB
None


## #2.4 User Table

In [77]:
aggregate = {
    "review_id"  : 'nunique',
    "product_id" : 'nunique'
}

rename_dict = {
    'review_id' :"no_review",
    'product_id':"no_product"
}

user_table = review_table.groupby('username').agg(aggregate).reset_index()
user_table = user_table.rename(columns=rename_dict)

display(user_table.head(3))

Unnamed: 0,username,no_review,no_product
0,.*****s,1,1
1,.....linh123,1,1
2,.4fiq,1,1


In [78]:
review_list_table = review_table.groupby('username')['review_id'].agg(list).reset_index()
display(review_list_table.head(3))

Unnamed: 0,username,review_id
0,.*****s,[9d8eae38-6228-42ad-920b-4fe4c4e78bbc]
1,.....linh123,[38139145-b628-4e4c-a740-00ed005bb9b4]
2,.4fiq,[62940eb8-ac26-4435-bf97-4810b4fdf71a]


In [79]:
def product_dict_fix(x):
    product_dict = {}
    for item in x:
        if item not in product_dict:
            product_dict[item] = 1
        else:
            product_dict[item] += 1
    return product_dict

product_dict_table = review_table.groupby('username')['product_id'].agg(list).reset_index()
product_dict_table['product_id'] = product_dict_table['product_id'].apply(product_dict_fix)
display(product_dict_table.head(3))

Unnamed: 0,username,product_id
0,.*****s,{'d4762088-f54e-4297-aefe-b7a5c2fc5b76': 1}
1,.....linh123,{'b14fb80e-204a-4007-8a22-8d467f9cbc36': 1}
2,.4fiq,{'e1366b54-47df-429c-b19e-c2173ac35fe7': 1}


In [80]:
user_table = user_table.merge(review_list_table, on='username', how='inner').merge(product_dict_table, on='username', how='inner')
display(user_table.head(3))

Unnamed: 0,username,no_review,no_product,review_id,product_id
0,.*****s,1,1,[9d8eae38-6228-42ad-920b-4fe4c4e78bbc],{'d4762088-f54e-4297-aefe-b7a5c2fc5b76': 1}
1,.....linh123,1,1,[38139145-b628-4e4c-a740-00ed005bb9b4],{'b14fb80e-204a-4007-8a22-8d467f9cbc36': 1}
2,.4fiq,1,1,[62940eb8-ac26-4435-bf97-4810b4fdf71a],{'e1366b54-47df-429c-b19e-c2173ac35fe7': 1}
