In [2]:
import ast
import json
import os
import re

import numpy as np
import pandas as pd
from google.colab import files

# 1. Upload several CSV files at once; the returned mapping is iterated by
#    file name further down in this cell.
uploaded = files.upload()

# Hàm làm sạch comment
def clean_comment(x):
    """Return a sanitized version of a review comment, or ``'NULL'``.

    Missing values and blank strings map to the placeholder ``'NULL'``.
    Otherwise every character that is not an ASCII letter or digit, a
    character in the ``À-Ỹ`` / ``à-ỹ`` / ``U+00C0-U+017F`` ranges, or
    whitespace is removed, and the result is trimmed. If nothing is left
    after cleaning, ``'NULL'`` is returned as well.
    """
    if pd.isna(x):
        return 'NULL'

    raw = str(x)
    if raw.strip() == '':
        return 'NULL'

    # Drop punctuation, emoji and other symbols; keep letters, digits, spaces.
    kept = re.sub(r'[^0-9A-Za-zÀ-Ỹà-ỹ\u00C0-\u017F\s]', '', raw).strip()
    return kept or 'NULL'

# Hàm làm sạch images: trả về list Python
def clean_image_urls(x):
    """Normalize a raw ``images`` cell into a Python list of URL strings.

    Args:
        x: The raw cell value. May be a missing value, a string holding a
            list literal (``"['u1', 'u2']"`` or ``'["u1", "u2"]'``), a loose
            comma-separated string, or an already-parsed list/tuple.

    Returns:
        A list of non-empty, whitespace-trimmed strings. Missing values and
        the markers ``''``, ``'NULL'`` and ``'[]'`` give an empty list.
    """
    # An already-parsed sequence must be handled first: pd.isna() returns an
    # array for list-likes, which makes the truth test below ambiguous.
    if isinstance(x, (list, tuple)):
        return [u.strip() for u in x if isinstance(u, str) and u.strip()]

    if pd.isna(x):
        return []
    text = str(x).strip()
    if text in ('', 'NULL', '[]'):
        return []

    # Prefer a real parse of the list literal so that commas, quotes or
    # brackets that are part of a URL are not treated as separators.
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, list) and all(isinstance(u, str) for u in parsed):
        return [u.strip() for u in parsed if u.strip()]

    # Fallback for loosely formatted cells (unquoted or non-string items):
    # drop brackets and quotes, then split on commas.
    s = re.sub(r"[\[\]'\"]", '', text)
    return [u.strip() for u in s.split(',') if u.strip()]

# Clean every uploaded CSV and export it as both a cleaned CSV and a JSON file.
for fname in uploaded:
    print(f"\n>>> Xử lý file: {fname}")

    df = pd.read_csv(fname, encoding='utf-8-sig')
    df.columns = df.columns.str.strip().str.lower()
    # Make sure the two columns the rest of the loop relies on always exist.
    for col in ['images', 'comment']:
        if col not in df.columns:
            df[col] = np.nan

    # Clean comment
    df['comment'] = df['comment'].apply(clean_comment)
    # Normalize created_at to dd/mm/yy; unparseable values become missing.
    if 'created_at' in df.columns:
        # NOTE(review): no explicit format is passed, so pandas has to infer
        # it (the captured cell output shows a warning pointing at this call).
        # Confirm the source format and pass `format=` / `dayfirst=`.
        df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce') \
                             .dt.strftime('%d/%m/%y')
    # Clean images (each cell becomes a Python list)
    df['images'] = df['images'].apply(clean_image_urls)

    # Flags
    df['has_image'] = df['images'].apply(lambda urls: 1 if urls else 0)
    df['has_comment'] = df['comment'].apply(lambda x: 0 if x in ('NULL', '', None) else 1)

    # Derive output names from the file stem. A plain str.replace('.csv', ...)
    # would rewrite every occurrence of '.csv' and, for names without that
    # exact suffix, would return the input path and overwrite the upload.
    stem, _ = os.path.splitext(fname)
    cleaned_csv = f"{stem}_cleaned.csv"
    json_fname = f"{stem}_cleaned.json"

    # --- CSV export: images are rendered as a ["u1", "u2"] style string ---
    df_for_csv = df.copy()
    df_for_csv['images'] = df_for_csv['images'].apply(
        lambda lst: '[' + ', '.join(f'"{u}"' for u in lst) + ']'
    )
    df_for_csv.to_csv(cleaned_csv, index=False, encoding='utf-8-sig')
    print(f"   → Đã lưu CSV: {cleaned_csv}")
    files.download(cleaned_csv)

    # --- JSON export ---
    # Missing values come out of to_dict() as float NaN, which the json module
    # would write as the bare token NaN (not valid JSON); emit null instead.
    records = [
        {key: (None if isinstance(val, float) and np.isnan(val) else val)
         for key, val in row.items()}
        for row in df.to_dict(orient='records')
    ]
    # The json module never escapes '/' and already renders an empty list as
    # [], so no textual post-processing of the output is needed.
    with open(json_fname, 'w', encoding='utf-8') as f:
        json.dump(records, f, ensure_ascii=False, indent=4)

    print(f"   → Đã lưu JSON: {json_fname}")
    files.download(json_fname)


Saving tiki_reviews_book.csv to tiki_reviews_book (1).csv
Saving tiki_reviews_food&drink.csv to tiki_reviews_food&drink (1).csv
Saving tiki_reviews_health&beauty.csv to tiki_reviews_health&beauty (1).csv
Saving tiki_reviews_sport.csv to tiki_reviews_sport (1).csv
Saving tiki_reviews_stationery.csv to tiki_reviews_stationery (1).csv

>>> Xử lý file: tiki_reviews_book (1).csv


  df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce') \


   → Đã lưu CSV: tiki_reviews_book (1)_cleaned.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

   → Đã lưu JSON: tiki_reviews_book (1)_cleaned.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


>>> Xử lý file: tiki_reviews_food&drink (1).csv
   → Đã lưu CSV: tiki_reviews_food&drink (1)_cleaned.csv


  df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce') \


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

   → Đã lưu JSON: tiki_reviews_food&drink (1)_cleaned.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


>>> Xử lý file: tiki_reviews_health&beauty (1).csv
   → Đã lưu CSV: tiki_reviews_health&beauty (1)_cleaned.csv


  df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce') \


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

   → Đã lưu JSON: tiki_reviews_health&beauty (1)_cleaned.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


>>> Xử lý file: tiki_reviews_sport (1).csv


  df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce') \


   → Đã lưu CSV: tiki_reviews_sport (1)_cleaned.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

   → Đã lưu JSON: tiki_reviews_sport (1)_cleaned.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


>>> Xử lý file: tiki_reviews_stationery (1).csv


  df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce') \


   → Đã lưu CSV: tiki_reviews_stationery (1)_cleaned.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

   → Đã lưu JSON: tiki_reviews_stationery (1)_cleaned.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>