In [None]:
import pandas as pd, time, random
from concurrent.futures import ThreadPoolExecutor, as_completed
from deep_translator import GoogleTranslator
from pathlib import Path
import pandas as pd
import glob

# Load the wide order/review table produced by the preprocessing step.
order_review_wide = pd.read_csv("../../processed/order_review_wide.csv")

# Row filters for the reviews that will be translated.
# NOTE(review): 'U' looks like a placeholder for a missing comment — confirm upstream.
cond1 = order_review_wide['review_comment_message'].ne('U')
cond2 = order_review_wide['review_score'].le(3)   # review score of 3 or lower
cond3 = order_review_wide['days'].gt(-1)          # non-negative `days` only

# Apply all three filters, detach from the source frame, and discard rows
# that have no bucket assigned.
bad_review_data = (
    order_review_wide.loc[cond1 & cond2 & cond3]
    .copy()
    .dropna(subset='bucket')
)


<class 'pandas.core.frame.DataFrame'>
Index: 12669 entries, 16 to 98878
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   review_id                12669 non-null  object
 1   order_id                 12669 non-null  object
 2   review_score             12669 non-null  int64 
 3   review_comment_title     12669 non-null  object
 4   review_comment_message   12669 non-null  object
 5   review_creation_date     12669 non-null  object
 6   review_answer_timestamp  12669 non-null  object
 7   days                     12669 non-null  int64 
 8   bucket                   12669 non-null  object
dtypes: int64(2), object(7)
memory usage: 989.8+ KB


In [None]:
# --- Translation run settings -------------------------------------------
THREADS = 6         # worker threads in the per-batch thread pool
BATCH_SIZE = 1000   # number of reviews per batch (one output CSV per batch)
MAX_RETRY = 2       # translation attempts per text before giving up

# Directory that receives the per-batch translation CSVs and the final
# statistics table; created on first run if it does not exist.
# NOTE(review): this is an absolute path anchored at the filesystem/drive
# root — confirm that is intended rather than a project-relative folder.
SAVE_CSV = Path('/wordcloud_csv')
SAVE_CSV.mkdir(parents=True, exist_ok=True)

In [None]:
def safe_translate(text):
    """Translate one Portuguese text to Simplified Chinese, best-effort.

    Args:
        text: The text to translate. Anything that is not a ``str``
            (e.g. ``None`` or a NaN float coming out of pandas) is rejected
            immediately.

    Returns:
        The translated string, or ``None`` when the input is not a string or
        every one of the ``MAX_RETRY`` attempts failed. This function never
        raises for translation errors.
    """
    # A non-string value cannot succeed on retry, so bail out now instead of
    # spending every attempt and its back-off sleep on it.
    if not isinstance(text, str):
        return None

    for attempt in range(MAX_RETRY):
        try:
            # Built inside the try so a failure while constructing the
            # translator is retried like any other error.
            translator = GoogleTranslator(source='pt', target='zh-CN')
            return translator.translate(text)
        except Exception:
            # Deliberately broad: any failure just triggers another attempt.
            # Back off only if another attempt will follow; sleeping after
            # the last failure would only delay returning None.
            if attempt < MAX_RETRY - 1:
                time.sleep(random.uniform(1, 3))
    return None

def translate_batch(batch_df, batch_id):
    """Translate one batch of reviews concurrently and save the result as CSV.

    Args:
        batch_df: DataFrame slice providing the ``order_id``, ``bucket`` and
            ``review_comment_message`` columns.
        batch_id: Batch number; used in the output file name
            ``translated_part_<batch_id>.csv`` under ``SAVE_CSV``.

    Returns:
        DataFrame with columns ``order_id``, ``bucket``, ``original`` and
        ``translated``. Reviews whose translation came back empty/``None``
        are left out. Rows follow the order of ``batch_df``.
    """
    output_columns = ['order_id', 'bucket', 'original', 'translated']
    messages = batch_df['review_comment_message'].tolist()

    # Executor.map yields results in input order, so the output no longer
    # depends on which worker thread happens to finish first.
    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        translations = list(executor.map(safe_translate, messages))

    rows = [
        {
            'order_id': order_id,
            'bucket': bucket,
            'original': original,
            'translated': translated,
        }
        for order_id, bucket, original, translated in zip(
            batch_df['order_id'], batch_df['bucket'], messages, translations
        )
        if translated  # drop failed (None) and empty translations
    ]

    # Explicit columns guarantee a header row even when every translation in
    # the batch failed; a header-less CSV cannot be parsed by pd.read_csv
    # when the parts are merged later.
    out = pd.DataFrame(rows, columns=output_columns)
    out.to_csv(SAVE_CSV / f'translated_part_{batch_id}.csv', index=False, encoding='utf-8-sig')
    print(f" 第 {batch_id} 批完成，共 {len(out)} 条")
    return out

# Walk the filtered reviews in consecutive BATCH_SIZE-row slices; batch
# numbers start at 1 and name the per-batch output files.
batch_starts = range(0, len(bad_review_data), BATCH_SIZE)
for batch_id, start in enumerate(batch_starts, start=1):
    batch = bad_review_data.iloc[start:start + BATCH_SIZE]
    print(f"\n正在翻译第 {batch_id} 批，共 {len(batch)} 条...")
    translate_batch(batch, batch_id)
    # Short random pause between batches to lower the risk of being blocked.
    time.sleep(random.uniform(2, 5))


In [10]:
# 1. Collect every per-batch translation file.
#    Sorting by (length, text) puts part_2 before part_10; a plain string
#    sort would not, and file order decides which duplicate survives below.
files = sorted(
    glob.glob(str(SAVE_CSV / 'translated_part_*.csv')),
    key=lambda f: (len(f), f),
)
print(f"共检测到 {len(files)} 个翻译结果文件")

# 2. Load each part, skipping files that contain no header at all
#    (pd.read_csv raises EmptyDataError on those).
frames = []
for f in files:
    try:
        frames.append(pd.read_csv(f))
    except pd.errors.EmptyDataError:
        print(f"跳过空文件: {f}")

if not frames:
    # Fail with an explicit message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError(
        f"No readable translated_part_*.csv files found in {SAVE_CSV}"
    )

# 3. Merge and clean. Rows without a translation are removed BEFORE
#    de-duplicating so that an order is not lost just because its first
#    occurrence happens to have a missing translation.
df_all = (
    pd.concat(frames, ignore_index=True)
    .dropna(subset=['translated'])
    .drop_duplicates(subset=['order_id'])
)

print(f" 合并后共 {len(df_all)} 条评论")
# Printed explicitly: a bare `df_all.head()` in the middle of a cell
# displays nothing.
print(df_all.head())

# Keywords (Chinese) whose share of reviews is measured per group.
keywords = ['未收到', '慢', '延迟','未发货']

# Per-group metric used with groupby(...).apply.
def kw_rate(g):
    """Return keyword hit rates (in percent) and the review count for a group.

    Args:
        g: DataFrame with a ``translated`` text column.

    Returns:
        pd.Series with one entry per item in ``keywords`` — the percentage of
        rows whose ``translated`` text contains that keyword, rounded to two
        decimals (missing text counts as "no match") — followed by
        ``n_reviews``, the number of rows in ``g``.
    """
    text = g['translated']
    rates = {
        kw: round(text.str.contains(kw, na=False).mean() * 100, 2)
        for kw in keywords
    }
    rates['n_reviews'] = len(g)
    return pd.Series(rates)

# Per-bucket statistics: apply kw_rate to each `bucket` group, then turn the
# group key back into an ordinary column.
bucket_stats = df_all.groupby('bucket').apply(kw_rate).reset_index()

# Export with the same settings as the per-batch CSVs:
# - index=False: after reset_index() the index is just a 0..n-1 counter and
#   would otherwise be written as an unnamed extra column;
# - utf-8-sig: the BOM lets spreadsheet tools detect UTF-8 for the Chinese
#   column headers.
bucket_stats.to_csv(SAVE_CSV / "评论关键词频率表.csv", index=False, encoding='utf-8-sig')
print(bucket_stats)


NameError: name 'glob' is not defined