In [1]:
import pandas as pd
import numpy as np
import re
from google.colab import files

# 1. Upload several CSV files at once.
# The per-file loop further down iterates over the keys of `uploaded` and
# passes each key to pd.read_csv as a file name.
uploaded = files.upload()

# Hàm làm sạch comment
def clean_comment(x):
    """Return a punctuation-free version of a comment, or 'NULL' if nothing is left.

    Args:
        x: Raw cell value; it is converted with ``str`` before cleaning.

    Returns:
        The text with every character removed that is not an ASCII letter or
        digit, a character in the ranges À-Ỹ / à-ỹ / U+00C0-U+017F, or
        whitespace, with surrounding whitespace stripped. The literal string
        'NULL' is returned when the value is missing, blank, or empty after
        cleaning.
    """
    # Missing values (NaN / None) are reported with the 'NULL' marker.
    if pd.isna(x):
        return 'NULL'

    text = str(x)
    if text.strip() == '':
        return 'NULL'

    kept = re.sub(r'[^0-9A-Za-zÀ-Ỹà-ỹ\u00C0-\u017F\s]', '', text).strip()
    # A comment made only of stripped characters collapses to '' -> 'NULL'.
    return kept or 'NULL'

# ✅ Hàm làm sạch images: giữ dấu " quanh mỗi URL, không xuống dòng
def clean_image_urls(x):
    """Normalize an image-list cell into a single-line ``["url", "url"]`` string.

    Args:
        x: Raw cell value; the markers 'NULL' and '' and missing values
            (NaN / None) are treated as "no images".

    Returns:
        '[]' when there are no images; otherwise a bracketed, comma-separated
        list in which every non-empty item is wrapped in double quotes.
    """
    if x in ('NULL', '') or pd.isna(x):
        return '[]'

    # Drop brackets and both kinds of quotes so only the comma-separated
    # items remain.
    bare = re.sub(r"[\[\]'\"]", '', str(x))

    quoted = []
    for piece in bare.split(','):
        url = piece.strip()
        if url:  # skip empty items produced by stray commas
            quoted.append(f'"{url}"')

    return f"[{', '.join(quoted)}]"

# Xử lý từng file
for fname in uploaded:
    print(f"\n>>> Xử lý file: {fname}")

    # Read the CSV and normalize the header names (trimmed, lower-case).
    df = pd.read_csv(fname, encoding='utf-8-sig')
    df.columns = df.columns.str.strip().str.lower()

    # Make sure the images and comment columns exist.
    for col in ('images', 'comment'):
        if col not in df.columns:
            df[col] = np.nan

    # First pass over images: literal "[]", missing and blank cells all
    # become the 'NULL' marker.
    df['images'] = df['images'].replace(r'^\[\]$', 'NULL', regex=True)
    df['images'] = df['images'].fillna('NULL').replace(r'^\s*$', 'NULL', regex=True)

    # Clean the comment text.
    df['comment'] = df['comment'].apply(clean_comment)

    # Normalize created_at to dd/mm/yy when the column is present; values that
    # cannot be parsed are coerced to missing instead of raising.
    # NOTE(review): the recorded run shows pandas emitting a warning at this
    # call; consider passing an explicit format= once the source date format
    # is confirmed.
    if 'created_at' in df.columns:
        parsed_dates = pd.to_datetime(df['created_at'], errors='coerce')
        df['created_at'] = parsed_dates.dt.strftime('%d/%m/%y')

    # Rewrite the images column as a quoted, single-line URL list.
    df['images'] = df['images'].apply(clean_image_urls)

    # Add the two 0/1 flags.
    df['has_image'] = (~df['images'].isin(['[]', ''])).astype(int)
    df['has_comment'] = (~df['comment'].isin(['NULL', '', None])).astype(int)

    # Build the output names from the base name. Only a trailing ".csv" (any
    # letter case) is removed: a plain str.replace would also rewrite ".csv"
    # in the middle of the name, and for a name without a lower-case ".csv"
    # it would return the name unchanged, so the outputs would overwrite the
    # input file and each other.
    suffix = '.csv'
    stem = fname[:-len(suffix)] if fname.lower().endswith(suffix) else fname

    # Export CSV and JSON, then trigger the downloads.
    cleaned_csv = f"{stem}_cleaned.csv"
    df.to_csv(cleaned_csv, index=False, encoding='utf-8-sig')
    print(f"   → Đã lưu CSV: {cleaned_csv}")
    files.download(cleaned_csv)

    json_fname = f"{stem}_cleaned.json"
    df.to_json(json_fname, orient="records", force_ascii=False, indent=4)
    print(f"   → Đã lưu JSON: {json_fname}")
    files.download(json_fname)


Saving tiki_reviews_health&beauty.csv to tiki_reviews_health&beauty.csv
Saving tiki_reviews_sport.csv to tiki_reviews_sport.csv
Saving tiki_reviews_stationery.csv to tiki_reviews_stationery.csv
Saving tiki_reviews_food&drink.csv to tiki_reviews_food&drink.csv
Saving tiki_reviews_book.csv to tiki_reviews_book.csv

>>> Xử lý file: tiki_reviews_health&beauty.csv
   → Đã lưu CSV: tiki_reviews_health&beauty_cleaned.csv


  df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce') \


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

   → Đã lưu JSON: tiki_reviews_health&beauty_cleaned.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


>>> Xử lý file: tiki_reviews_sport.csv


  df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce') \


   → Đã lưu CSV: tiki_reviews_sport_cleaned.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

   → Đã lưu JSON: tiki_reviews_sport_cleaned.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


>>> Xử lý file: tiki_reviews_stationery.csv


  df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce') \


   → Đã lưu CSV: tiki_reviews_stationery_cleaned.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

   → Đã lưu JSON: tiki_reviews_stationery_cleaned.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


>>> Xử lý file: tiki_reviews_food&drink.csv
   → Đã lưu CSV: tiki_reviews_food&drink_cleaned.csv


  df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce') \


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

   → Đã lưu JSON: tiki_reviews_food&drink_cleaned.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


>>> Xử lý file: tiki_reviews_book.csv


  df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce') \


   → Đã lưu CSV: tiki_reviews_book_cleaned.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

   → Đã lưu JSON: tiki_reviews_book_cleaned.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>