In [2]:
import ast
import gc

import numpy as np
import pandas as pd

# Amazon

**Ref:** https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews

## Books data

In [3]:
# Directory (relative path) that the parquet files in the cells below are read from.
path_amazon = '../Dataset/archive/'

In [4]:
# Load the book metadata table (books_data.parquet) into a DataFrame.
amz_data_df = pd.read_parquet(path_amazon + 'books_data.parquet')

In [5]:
# Structural overview of the book metadata: columns, dtypes, null counts, sample rows.
print(amz_data_df.columns, '\n')
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
amz_data_df.info()
print()
print(amz_data_df.isna().sum(), '\n')
amz_data_df.head()

Index(['Title', 'description', 'authors', 'image', 'previewLink', 'publisher',
       'publishedDate', 'infoLink', 'categories', 'ratingsCount'],
      dtype='object') 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212404 entries, 0 to 212403
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Title          212403 non-null  object 
 1   description    143962 non-null  object 
 2   authors        180991 non-null  object 
 3   image          160329 non-null  object 
 4   previewLink    188568 non-null  object 
 5   publisher      136518 non-null  object 
 6   publishedDate  187099 non-null  object 
 7   infoLink       188568 non-null  object 
 8   categories     171205 non-null  object 
 9   ratingsCount   49752 non-null   float64
dtypes: float64(1), object(9)
memory usage: 16.2+ MB
None 

Title                 1
description       68442
authors           31413
image             52075
previewLink       23836

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,2003-03-01,http://books.google.nl/books?id=399SPgAACAAJ&d...,,


In [6]:
# Keep the descriptive metadata columns; image, previewLink and infoLink are dropped.
amz_data_df = amz_data_df[[
    'Title',
    'description',
    'authors',
    'publisher',
    'publishedDate',
    'categories',
    'ratingsCount'
    ]]

# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
amz_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212404 entries, 0 to 212403
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Title          212403 non-null  object 
 1   description    143962 non-null  object 
 2   authors        180991 non-null  object 
 3   publisher      136518 non-null  object 
 4   publishedDate  187099 non-null  object 
 5   categories     171205 non-null  object 
 6   ratingsCount   49752 non-null   float64
dtypes: float64(1), object(6)
memory usage: 11.3+ MB
None


### Amazon Metadata preprocessing

In [7]:
# Per-column count of missing values in the selected metadata columns.
print(amz_data_df.isnull().sum())


Title                 1
description       68442
authors           31413
publisher         75886
publishedDate     25305
categories        41199
ratingsCount     162652
dtype: int64


In [8]:
import pandas as pd
import numpy as np

# Hàm làm sạch chuỗi thông thường (Authors/Categories)
def clean_formatted_string(text):
    """Flatten a stringified list such as "['A', 'B']" into "A, B".

    Parameters
    ----------
    text : object
        A scalar cell value. Missing values and the literal 'Unknown' are
        returned as 'Unknown'.

    Returns
    -------
    str
        The list items joined with ", ", with every " & " replaced by ", "
        and surrounding whitespace removed.

    Notes
    -----
    The value is first parsed as a Python list literal so that quotes which
    belong to an item survive (e.g. '["Tim O\\'Reilly"]' -> "Tim O'Reilly").
    When the value is not a well-formed list literal, the previous behaviour
    is kept: every bracket and quote character is stripped.
    """
    if pd.isna(text) or str(text) == 'Unknown':
        return 'Unknown'

    text = str(text).strip()

    items = None
    if text.startswith('[') and text.endswith(']'):
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid literal (unbalanced quotes, bare words, ...): fall back below.
            parsed = None
        if isinstance(parsed, (list, tuple)):
            items = [str(item).strip() for item in parsed if item is not None]

    if items is not None:
        text = ', '.join(items)
    else:
        # Legacy fallback for malformed input: quotes are treated as delimiters.
        text = text.replace('[', '').replace(']', '').replace("'", "").replace('"', "")

    # "A & B" is treated as two entries, same as a comma-separated pair.
    text = text.replace(" & ", ", ")
    return text.strip()

def preprocessing_data(amz_data_df):
    """Clean the book-metadata frame and return a new, de-duplicated DataFrame.

    The input is copied, never modified. Each step is applied only when the
    relevant column exists:

    1. ``publishedDate`` -> ``published_year``: the first run of four digits in
       the value is taken as the year (0 when none is found) and the original
       column is dropped.
    2. Missing ``ratingsCount`` becomes 0; missing values in every
       object/string column become ``'Unknown'``.
    3. ``authors`` and ``categories`` are flattened with
       ``clean_formatted_string``.
    4. ``main_category`` is the text before the first comma of ``categories``,
       or ``'Uncategorized'`` when categories is ``'Unknown'`` or empty.
    5. Every object/string column is cast to ``str`` and stripped, then fully
       identical rows are dropped (the index is not reset).

    Parameters
    ----------
    amz_data_df : pandas.DataFrame
        Raw book metadata.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    df_clean = amz_data_df.copy()

    # --- 1. Publication date -> year (regex based) ---
    if 'publishedDate' in df_clean.columns:
        date_str = df_clean['publishedDate'].astype(str)

        # First run of four consecutive digits, e.g. "2005" in "2005-01-01".
        extracted_year = date_str.str.extract(r'(\d{4})', expand=False)

        # Convert to numbers *before* filling: putting the int 0 into a
        # string-typed Series is rejected by pandas' dedicated string dtypes.
        df_clean['published_year'] = (
            pd.to_numeric(extracted_year, errors='coerce').fillna(0).astype(int)
        )

        # The raw date column is no longer needed.
        df_clean = df_clean.drop(columns=['publishedDate'])

    # --- 2. Missing values ---
    if 'ratingsCount' in df_clean.columns:
        df_clean['ratingsCount'] = df_clean['ratingsCount'].fillna(0)

    # Fill text columns with 'Unknown' (publishedDate is already gone, so it is unaffected).
    string_cols = df_clean.select_dtypes(include=['object', 'string']).columns
    for col in string_cols:
        df_clean[col] = df_clean[col].fillna('Unknown')

    # --- 3. Authors ---
    if 'authors' in df_clean.columns:
        df_clean['authors'] = df_clean['authors'].apply(clean_formatted_string)

    # --- 4. Categories ---
    if 'categories' in df_clean.columns:
        df_clean['categories'] = df_clean['categories'].apply(clean_formatted_string)
        df_clean['main_category'] = df_clean['categories'].apply(
            lambda x: x.split(',')[0].strip() if x != 'Unknown' and x != '' else "Uncategorized"
        )

    # --- 5. Trim text & drop exact duplicates ---
    final_string_cols = df_clean.select_dtypes(include=['object', 'string']).columns
    for col in final_string_cols:
        df_clean[col] = df_clean[col].astype(str).str.strip()

    df_clean = df_clean.drop_duplicates()

    return df_clean

# Build the cleaned book-metadata frame; amz_data_df itself is left unchanged
# because preprocessing_data works on a copy.
df_processed = preprocessing_data(amz_data_df)

In [9]:
# Preview the first ten cleaned rows.
df_processed.head(10)

Unnamed: 0,Title,description,authors,publisher,categories,ratingsCount,published_year,main_category
0,Its Only Art If Its Well Hung!,Unknown,Julie Strain,Unknown,"Comics, Graphic Novels",0.0,1996,Comics
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,Philip Nel,A&C Black,"Biography, Autobiography",0.0,2005,Biography
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,David R. Ray,Unknown,Religion,0.0,2000,Religion
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,Veronica Haddon,iUniverse,Fiction,0.0,2005,Fiction
4,"Nation Dance: Religion, Identity and Cultural ...",Unknown,Edward Long,Unknown,Unknown,0.0,2003,Uncategorized
5,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,Everett Ferguson,Wm. B. Eerdmans Publishing,Religion,5.0,1996,Religion
6,The Overbury affair (Avon),Unknown,Miriam Allen De Ford,Unknown,Unknown,0.0,1960,Uncategorized
7,A Walk in the Woods: a Play in Two Acts,Unknown,Lee Blessing,Unknown,Unknown,3.0,1988,Uncategorized
8,Saint Hyacinth of Poland,The story for children 10 and up of St. Hyacin...,Mary Fabyan Windeatt,Tan Books & Pub,"Biography, Autobiography",0.0,2009,Biography
9,Rising Sons and Daughters: Life Among Japan's ...,Wardell recalls his experience as a foreign st...,Steven Wardell,Plympton PressIntl,Social Science,0.0,1995,Social Science


## Ratings data (`books_rating.parquet`)

In [24]:
# Load the review/rating table (books_rating.parquet) into a DataFrame.
amz_rt_df = pd.read_parquet(path_amazon + 'books_rating.parquet')

In [25]:
# Structural overview of the reviews table: columns, dtypes, null counts, sample rows.
print(amz_rt_df.columns, '\n')
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
amz_rt_df.info()
print()
print(amz_rt_df.isna().sum(), '\n')
amz_rt_df.head()

Index(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness',
       'review/score', 'review/time', 'review/summary', 'review/text'],
      dtype='object') 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Id                  object 
 1   Title               object 
 2   Price               float64
 3   User_id             object 
 4   profileName         object 
 5   review/helpfulness  object 
 6   review/score        float64
 7   review/time         int64  
 8   review/summary      object 
 9   review/text         object 
dtypes: float64(2), int64(1), object(7)
memory usage: 228.9+ MB
None 

Id                          0
Title                     208
Price                 2518829
User_id                561787
profileName            561905
review/helpfulness          0
review/score                0
review/time                 0
revie

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [26]:
# Keep only the columns listed below.
# NOTE(review): 'review/time' and 'profileName' are not kept here, so the
# `review/time` and `profileName` handling inside preprocess_reviews_data is
# skipped for this frame — confirm that is intended.
amz_rt_df = amz_rt_df[['Title', 'User_id', 'Price', 'review/helpfulness', 'review/score', 'review/summary', 'review/text']]

In [27]:
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
amz_rt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 7 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Title               object 
 1   User_id             object 
 2   Price               float64
 3   review/helpfulness  object 
 4   review/score        float64
 5   review/summary      object 
 6   review/text         object 
dtypes: float64(2), object(5)
memory usage: 160.2+ MB
None


In [28]:
# Per-column count of missing values in the selected review columns.
print(amz_rt_df.isnull().sum())

Title                     208
User_id                561787
Price                 2518829
review/helpfulness          0
review/score                0
review/summary            407
review/text                 8
dtype: int64


In [29]:
# Preview the first twenty review rows.
amz_rt_df.head(20)

Unnamed: 0,Title,User_id,Price,review/helpfulness,review/score,review/summary,review/text
0,Its Only Art If Its Well Hung!,AVCGYZL8FQQTD,,7/7,4.0,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,Dr. Seuss: American Icon,A30TK6U7DNS82R,,10/10,5.0,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,Dr. Seuss: American Icon,A3UH4UZ4RSVO82,,10/11,5.0,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,Dr. Seuss: American Icon,A2MVUWT453QH61,,7/7,4.0,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,Dr. Seuss: American Icon,A22X4XUPKF66MR,,3/3,4.0,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...
5,Dr. Seuss: American Icon,A2F6NONFUDB6UK,,2/2,4.0,One of America's greatest creative talents,"""Dr. Seuss: American Icon"" by Philip Nel is a ..."
6,Dr. Seuss: American Icon,A14OJS0VWMOSWO,,3/4,5.0,A memorably excellent survey of Dr. Seuss' man...,Theodor Seuss Giesel was best known as 'Dr. Se...
7,Dr. Seuss: American Icon,A2RSSXTDZDUSH4,,0/0,5.0,Academia At It's Best,When I recieved this book as a gift for Christ...
8,Dr. Seuss: American Icon,A25MD5I2GUIW6W,,0/0,5.0,And to think that I read it on the tram!,Trams (or any public transport) are not usuall...
9,Dr. Seuss: American Icon,A3VA4XFS5WNJO3,,3/5,4.0,Fascinating account of a genius at work,"As far as I am aware, this is the first book-l..."


In [33]:
import html  # Thư viện quan trọng để xử lý lỗi font HTML (ví dụ: &quot;)

def clean_review_text(text):
    """Decode HTML entities in a review field and trim surrounding whitespace.

    Parameters
    ----------
    text : object
        A scalar cell value.

    Returns
    -------
    str
        The decoded, stripped text, or 'Unknown' when the value is missing,
        is the literal string 'nan', or is empty after decoding and stripping
        (for example '   ' or '&nbsp;').
    """
    if pd.isna(text):
        return 'Unknown'

    raw = str(text)
    if raw == 'nan' or raw == '':
        return 'Unknown'

    # Decode HTML entities: &quot; -> ", &amp; -> &
    cleaned = html.unescape(raw).strip()

    # Whitespace-only input must map to the same placeholder as an empty string.
    if cleaned == '':
        return 'Unknown'
    return cleaned

def preprocess_reviews_data(df_reviews):
    """Clean the reviews frame and return a new, de-duplicated DataFrame.

    The input is copied, never modified. Each step is applied only when the
    relevant column exists:

    1. ``Price`` and ``review/score`` are coerced to numbers; unparsable or
       missing values become 0.0.
    2. ``review/helpfulness`` ("7/7", "2/4", ...) is split on "/" into integer
       ``helpful_votes`` and ``total_votes`` (0 when a part is missing or not
       numeric) plus ``helpfulness_ratio`` = helpful / total, which is 0.0
       when ``total_votes`` is not positive.
    3. ``review/time`` is read as seconds since the Unix epoch into
       ``review_date``; ``review_year`` is its year (0 when unparsable).
    4. ``Title``, ``review/summary``, ``review/text`` and ``profileName`` are
       passed through ``clean_review_text``.
    5. Fully identical rows are dropped (the index is not reset).

    Parameters
    ----------
    df_reviews : pandas.DataFrame
        Raw reviews.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    # Work on a copy so the caller's frame is left untouched.
    df_clean = df_reviews.copy()

    # --- 1. Basic missing-value handling ---
    # Price: missing prices become 0.
    if 'Price' in df_clean.columns:
        df_clean['Price'] = pd.to_numeric(df_clean['Price'], errors='coerce').fillna(0.0)

    # review/score: 0 is used as the fill value rather than the mean.
    if 'review/score' in df_clean.columns:
        df_clean['review/score'] = pd.to_numeric(df_clean['review/score'], errors='coerce').fillna(0.0)

    # --- 2. Helpfulness ---
    if 'review/helpfulness' in df_clean.columns:
        # Split on '/'. reindex guarantees that columns 0 and 1 both exist even
        # when no value contains a '/', where the split yields a single column.
        helpfulness_split = (
            df_clean['review/helpfulness']
            .astype(str)
            .str.split('/', expand=True)
            .reindex(columns=[0, 1])
        )

        # Column 0 is the helpful-vote count (numerator), column 1 the total (denominator).
        df_clean['helpful_votes'] = pd.to_numeric(helpfulness_split[0], errors='coerce').fillna(0).astype(int)
        df_clean['total_votes'] = pd.to_numeric(helpfulness_split[1], errors='coerce').fillna(0).astype(int)

        # Vectorised ratio: non-positive totals are masked to NaN before the
        # division, and the resulting NaN is turned into 0.0.
        positive_total = df_clean['total_votes'].where(df_clean['total_votes'] > 0)
        df_clean['helpfulness_ratio'] = (df_clean['helpful_votes'] / positive_total).fillna(0.0)

    # --- 3. Review time ---
    if 'review/time' in df_clean.columns:
        # Interpreted as seconds since 1970; values that cannot be converted become NaT.
        df_clean['review_date'] = pd.to_datetime(df_clean['review/time'], unit='s', errors='coerce')
        df_clean['review_year'] = df_clean['review_date'].dt.year.fillna(0).astype(int)

    # --- 4. Text & title cleaning (HTML decoding) ---
    text_cols = ['Title', 'review/summary', 'review/text', 'profileName']
    for col in text_cols:
        if col in df_clean.columns:
            df_clean[col] = df_clean[col].apply(clean_review_text)

    # --- 5. Drop fully duplicated rows ---
    df_clean = df_clean.drop_duplicates()

    return df_clean

# --- Trial run of the review preprocessing ---
# NOTE(review): the original comment described this as a mock/simulated test,
# but the call below is made on amz_rt_df itself — confirm which is intended.
amz_rt_df_processed = preprocess_reviews_data(amz_rt_df)


In [34]:
# Inspect the result: first rows of the cleaned reviews frame.
print(amz_rt_df_processed.head())

                            Title         User_id  Price review/helpfulness  \
0  Its Only Art If Its Well Hung!   AVCGYZL8FQQTD    0.0                7/7   
1        Dr. Seuss: American Icon  A30TK6U7DNS82R    0.0              10/10   
2        Dr. Seuss: American Icon  A3UH4UZ4RSVO82    0.0              10/11   
3        Dr. Seuss: American Icon  A2MVUWT453QH61    0.0                7/7   
4        Dr. Seuss: American Icon  A22X4XUPKF66MR    0.0                3/3   

   review/score                                   review/summary  \
0           4.0           Nice collection of Julie Strain images   
1           5.0                                Really Enjoyed It   
2           5.0  Essential for every personal and Public Library   
3           4.0  Phlip Nel gives silly Seuss a serious treatment   
4           4.0                           Good academic overview   

                                         review/text  helpful_votes  \
0  This is only for Julie Strain fans. It's a

In [35]:
def merge_and_normalize(df_reviews, df_books, placeholder_titles=('', 'unknown')):
    """Left-join book metadata onto reviews using a normalised title key.

    Both inputs are copied, never modified. Titles are lower-cased and stripped
    to form the join key; the books table is reduced to one row per key
    (first occurrence wins) so that the join cannot multiply review rows.

    Parameters
    ----------
    df_reviews : pandas.DataFrame
        Reviews; must contain a ``Title`` column. Every review row is kept.
    df_books : pandas.DataFrame
        Book metadata; must contain a ``Title`` column.
    placeholder_titles : iterable of str, optional
        Titles that stand for "no title". They are normalised the same way as
        the key, and book rows whose key is in this set are not used for
        matching. Without this, reviews with a missing title would all be
        joined to whichever book also has a missing title and be counted as
        matched. The default covers the empty key produced here for NaN and
        the 'Unknown' filler; a book literally titled "Unknown" is therefore
        not matched either. Pass ``()`` to restore matching on every key.

    Returns
    -------
    pandas.DataFrame
        The reviews with the book columns appended (NaN where no book matched).
        ``Title`` is the review's original title; columns present in both
        inputs get the suffixes ``_review`` / ``_book``.

    Raises
    ------
    pandas.errors.MergeError
        If the books side is not unique on the join key (guarded by
        ``validate='many_to_one'``).
    """
    # --- STEP 1: normalise the Title column (join key) ---
    # Copies keep the callers' frames untouched.
    reviews_final = df_reviews.copy()
    books_final = df_books.copy()

    # Basic title normalisation: lower-case and strip surrounding whitespace.
    def normalize_title(text):
        if pd.isna(text):
            return ""
        return str(text).lower().strip()

    print("Đang chuẩn hóa Title...")
    reviews_final['Title_clean'] = reviews_final['Title'].apply(normalize_title)
    books_final['Title_clean'] = books_final['Title'].apply(normalize_title)

    # --- STEP 2: de-duplicate the books table ---
    # Metadata is attached to reviews, so books must be unique per key:
    # keep the first row for each key and drop later ones.
    books_unique = books_final.drop_duplicates(subset=['Title_clean'], keep='first')

    print(f"Số lượng sách gốc: {len(books_final)}")
    print(f"Số lượng sách sau khi loại bỏ trùng lặp Title: {len(books_unique)}")

    # Placeholder keys identify nothing, so they must not act as join keys.
    placeholder_keys = {str(token).lower().strip() for token in placeholder_titles}
    books_unique = books_unique[~books_unique['Title_clean'].isin(placeholder_keys)]

    # --- STEP 3: merge ---
    # LEFT JOIN keeps every review; metadata is NaN when no book matches.
    merged_df = pd.merge(
        reviews_final,
        books_unique,
        on='Title_clean',
        how='left',
        suffixes=('_review', '_book'),  # disambiguates columns present in both inputs
        validate='many_to_one',  # fail loudly if the books side is not unique per key
    )

    # --- STEP 4: clean-up ---
    # Drop the helper key, and the books-side Title when the merge produced one.
    columns_to_drop = ['Title_clean']
    if 'Title_book' in merged_df.columns:
        columns_to_drop.append('Title_book')
    merged_df = merged_df.drop(columns=columns_to_drop)

    # Restore the review-side title under its original name.
    merged_df = merged_df.rename(columns={'Title_review': 'Title'})

    return merged_df

# --- ASSUMPTIONS ---
# The two dataframes produced by the earlier cells are expected to exist:
# 1. df_processed: book metadata dataframe (already has published_year, authors...)
# 2. amz_rt_df_processed: review dataframe (helpfulness already split, time...)

# Run the merge
final_df = merge_and_normalize(amz_rt_df_processed, df_processed)

# Inspect the result
print("\n--- KẾT QUẢ GỘP ---")
print(f"Kích thước bảng sau gộp: {final_df.shape}")
print(final_df.head())

# Count how many reviews have a non-null published_year, i.e. found matching book metadata
matches = final_df['published_year'].notna().sum()
print(f"\nSố lượng review khớp được với thông tin sách: {matches} / {len(final_df)}")

Đang chuẩn hóa Title...
Số lượng sách gốc: 212404
Số lượng sách sau khi loại bỏ trùng lặp Title: 209457

--- KẾT QUẢ GỘP ---
Kích thước bảng sau gộp: (2666379, 17)
                            Title         User_id  Price review/helpfulness  \
0  Its Only Art If Its Well Hung!   AVCGYZL8FQQTD    0.0                7/7   
1        Dr. Seuss: American Icon  A30TK6U7DNS82R    0.0              10/10   
2        Dr. Seuss: American Icon  A3UH4UZ4RSVO82    0.0              10/11   
3        Dr. Seuss: American Icon  A2MVUWT453QH61    0.0                7/7   
4        Dr. Seuss: American Icon  A22X4XUPKF66MR    0.0                3/3   

   review/score                                   review/summary  \
0           4.0           Nice collection of Julie Strain images   
1           5.0                                Really Enjoyed It   
2           5.0  Essential for every personal and Public Library   
3           4.0  Phlip Nel gives silly Seuss a serious treatment   
4           4.0      