In [1]:
import pandas as pd
from psycopg2.extras import execute_values
from db_connection import get_connection
import random
from datetime import datetime, timedelta

In [2]:
reviews_df = pd.read_csv('dataset/Books_rating.csv')

harry_potter_reviews = reviews_df[reviews_df['Title'].str.contains('Harry Potter', case=False, na=False)]

reviews_df = reviews_df[~reviews_df['Title'].str.contains('Harry Potter', case=False, na=False)]

# 抽取 10% 的其他 reviews
reviews_df = reviews_df.sample(frac=0.1, random_state=42)

# 合併兩個 DataFrame
reviews_df = pd.concat([harry_potter_reviews, reviews_df], ignore_index=True)

# 檢查結果
print(f"Harry Potter reviews: {len(harry_potter_reviews)}")
print(f"Other reviews: {len(reviews_df)}")

# 轉換 Unix Timestamp 到日期格式
reviews_df['review/time'] = pd.to_datetime(reviews_df['review/time'], unit='s')

# 隨機化時間（加上隨機的時、分、秒）
def randomize_time(date):
    random_hours = random.randint(0, 23)
    random_minutes = random.randint(0, 59)
    random_seconds = random.randint(0, 59)
    return date + timedelta(hours=random_hours, minutes=random_minutes, seconds=random_seconds)

# 套用隨機時間
reviews_df['review/time'] = reviews_df['review/time'].apply(randomize_time)
reviews_df.head()

Harry Potter reviews: 13309
Other reviews: 311978


Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,3551551685,Harry Potter und die Kammer des Schreckens,33.4,,,43/45,5.0,2000-04-03 05:24:48,J.K.Rowling did it again!,&quot;Harry Potter und die Kammer des Schrecke...
1,3551551685,Harry Potter und die Kammer des Schreckens,33.4,A1T80CL9Q3HKDK,A. Moeller,5/5,5.0,2007-08-23 03:39:39,A great way to increase German language skills.,I needed a book in German that would motivate ...
2,3551551685,Harry Potter und die Kammer des Schreckens,33.4,AVHQ9KLO0O6XI,Mikey Ramone,0/0,5.0,2012-12-28 07:36:48,Great German Version of &#34;Chamber of Secret...,Bought as a gift for a Secret Santa thing. I d...
3,3551551685,Harry Potter und die Kammer des Schreckens,33.4,A2O0U4DXE19S2Z,Sabela,0/0,5.0,2010-12-19 21:36:09,Harry Potter Made Better :),"Of course, you can't argue with the original B..."
4,3551551685,Harry Potter und die Kammer des Schreckens,33.4,A2VN7BFUP041E1,smurray13,1/8,2.0,2010-07-14 10:18:02,"Never again, supermoviedeals","lost package, slow delivery, unhelpful custome..."


In [3]:
reviews_df.columns = [
    'id', 'title', 'price', 'user_id_src', 'profile_name', 
    'review_helpfulness', 'review_score', 'review_time', 
    'review_summary', 'review_text'
]

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


In [4]:
reviews_df[['helpful_yes', 'helpful_total']] = reviews_df['review_helpfulness'].str.split('/', expand=True).fillna(0)
reviews_df['helpful_yes'] = reviews_df['helpful_yes'].astype(int)
reviews_df['helpful_total'] = reviews_df['helpful_total'].astype(int)
reviews_df['price'] = pd.to_numeric(reviews_df['price'], errors='coerce').fillna(0)
reviews_df.head()

Unnamed: 0,id,title,price,user_id_src,profile_name,review_helpfulness,review_score,review_time,review_summary,review_text,helpful_yes,helpful_total
0,3551551685,Harry Potter und die Kammer des Schreckens,33.4,,,43/45,5.0,2000-04-03 05:24:48,J.K.Rowling did it again!,&quot;Harry Potter und die Kammer des Schrecke...,43,45
1,3551551685,Harry Potter und die Kammer des Schreckens,33.4,A1T80CL9Q3HKDK,A. Moeller,5/5,5.0,2007-08-23 03:39:39,A great way to increase German language skills.,I needed a book in German that would motivate ...,5,5
2,3551551685,Harry Potter und die Kammer des Schreckens,33.4,AVHQ9KLO0O6XI,Mikey Ramone,0/0,5.0,2012-12-28 07:36:48,Great German Version of &#34;Chamber of Secret...,Bought as a gift for a Secret Santa thing. I d...,0,0
3,3551551685,Harry Potter und die Kammer des Schreckens,33.4,A2O0U4DXE19S2Z,Sabela,0/0,5.0,2010-12-19 21:36:09,Harry Potter Made Better :),"Of course, you can't argue with the original B...",0,0
4,3551551685,Harry Potter und die Kammer des Schreckens,33.4,A2VN7BFUP041E1,smurray13,1/8,2.0,2010-07-14 10:18:02,"Never again, supermoviedeals","lost package, slow delivery, unhelpful custome...",1,8


In [5]:
with get_connection() as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT title, book_id FROM books;")
        book_map = {title: book_id for title, book_id in cur.fetchall()}

In [6]:
review_values = []
for _, row in reviews_df.iterrows():
    book_id = book_map.get(row['title'])
    if not book_id:
        print(f"找不到對應的書籍: {row['title']}，跳過...")
        continue

    review_values.append((
        book_id,                             # 書籍ID
        None,                                # 使用者ID，因為是外部來源
        True,                                # 外部評論
        "external",                          # 來源平台 (external)
        row['user_id_src'],                  # 外部平台使用者 ID
        row['profile_name'],                 # 使用者名稱
        row['review_score'],                 # 評分
        row['price'],                        # 價格
        row['helpful_yes'],                  # 有幫助的票數
        row['helpful_total'],                # 總票數
        row['review_time'],                  # 評論時間
        row['review_summary'],               # 評論摘要
        row['review_text']                   # 評論內容
    ))

找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...
找不到對應的書籍: nan，跳過...


In [7]:
insert_reviews = """
    INSERT INTO reviews (
        book_id, user_id, is_external, source, 
        user_id_src, profile_name, rating, price, 
        helpful_yes, helpful_total, review_time, summary, content
    ) VALUES %s;
"""

In [8]:
batch_size = 1000  # 每次 1000 筆
total_records = len(review_values)

for i in range(0, total_records, batch_size):
    batch = review_values[i:i + batch_size]
    with get_connection() as conn:
        with conn.cursor() as cur:
            execute_values(cur, insert_reviews, batch)
            conn.commit()
    print(f"Inserted batch {i} - {i + batch_size}")

Inserted batch 0 - 1000
Inserted batch 1000 - 2000
Inserted batch 2000 - 3000
Inserted batch 3000 - 4000
Inserted batch 4000 - 5000
Inserted batch 5000 - 6000
Inserted batch 6000 - 7000
Inserted batch 7000 - 8000
Inserted batch 8000 - 9000
Inserted batch 9000 - 10000
Inserted batch 10000 - 11000
Inserted batch 11000 - 12000
Inserted batch 12000 - 13000
Inserted batch 13000 - 14000
Inserted batch 14000 - 15000
Inserted batch 15000 - 16000
Inserted batch 16000 - 17000
Inserted batch 17000 - 18000
Inserted batch 18000 - 19000
Inserted batch 19000 - 20000
Inserted batch 20000 - 21000
Inserted batch 21000 - 22000
Inserted batch 22000 - 23000
Inserted batch 23000 - 24000
Inserted batch 24000 - 25000
Inserted batch 25000 - 26000
Inserted batch 26000 - 27000
Inserted batch 27000 - 28000
Inserted batch 28000 - 29000
Inserted batch 29000 - 30000
Inserted batch 30000 - 31000
Inserted batch 31000 - 32000
Inserted batch 32000 - 33000
Inserted batch 33000 - 34000
Inserted batch 34000 - 35000
Inserte