<a href="https://colab.research.google.com/github/chenbrilliancesol/machine-learning/blob/main/%E5%9F%BA%E4%BA%8ELightGBM%E5%AF%B9%E7%94%B5%E5%BD%B1%E7%A5%A8%E6%88%BF%E7%9A%84%E9%A2%84%E6%B5%8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import ast  # 用于安全地解析字符串形式的字面量（如列表、字典）

# 1. Load the raw training and test tables (paths as laid out in the Colab runtime)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

# 2. 辅助函数：解析JSON字符串字段并提取信息
def get_json_dict(s):
    """Parse a stringified Python literal (e.g. a list of dicts) from a CSV cell.

    Args:
        s: Cell value, expected to be a string such as ``"[{'id': 1}]"``.
            Missing values (``None``/``NaN``) and any other non-string input
            are treated as empty.

    Returns:
        The parsed ``list``/``tuple``/``dict``/``set``, or ``[]`` when the
        cell is missing, cannot be parsed, or parses to something that is not
        a container. Callers can therefore always take ``len()`` of the result.
    """
    # NaN, None and other non-text cells carry no parsable content.
    if not isinstance(s, str):
        return []
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # The documented failure modes of literal_eval on malformed input.
        # Deliberately not a bare ``except`` so KeyboardInterrupt/SystemExit
        # still propagate.
        return []
    # A scalar literal such as "5" is not a usable collection of entries.
    return parsed if isinstance(parsed, (list, tuple, dict, set)) else []

def extract_features(df):
    """Build the model feature table from a raw movie DataFrame.

    Reads the columns ``id``, ``budget``, ``popularity``, ``runtime``,
    ``release_date``, ``homepage``, ``tagline``, ``belongs_to_collection``,
    ``genres``, ``production_companies`` and ``spoken_languages``.
    ``df`` itself is left unmodified: every intermediate result is kept in a
    local variable instead of being written back onto the caller's frame.

    Args:
        df: Raw DataFrame containing the columns listed above.

    Returns:
        A new DataFrame sharing ``df``'s index, with the columns ``id``,
        ``budget``, ``popularity``, ``runtime``, ``release_year``,
        ``release_month``, ``release_day``, ``release_dayofweek``,
        ``has_homepage``, ``has_tagline``, ``is_collection``, ``num_genres``,
        ``num_production_companies`` and ``num_spoken_languages``.
    """
    # Parse into a local Series; unparsable dates become NaT (errors='coerce'),
    # which surface as NaN in the derived date parts below.
    # NOTE(review): the notebook output recorded with this file echoes this
    # call twice, presumably pandas' format-inference warning. Once the actual
    # date format of the data is confirmed, pass it explicitly via ``format=``.
    release_date = pd.to_datetime(df['release_date'], errors='coerce')

    def _count_entries(column):
        # Number of entries in a stringified list column (0 if missing/invalid).
        return df[column].apply(get_json_dict).apply(len)

    features = pd.DataFrame({
        'id': df['id'],

        # Numeric features
        'budget': df['budget'],
        'popularity': df['popularity'],
        'runtime': df['runtime'].fillna(0),  # missing runtime is encoded as 0

        # Calendar features derived from the release date
        'release_year': release_date.dt.year,
        'release_month': release_date.dt.month,
        'release_day': release_date.dt.day,
        'release_dayofweek': release_date.dt.dayofweek,

        # Presence flags (1 when the field is populated, else 0)
        'has_homepage': df['homepage'].notna().astype(int),
        'has_tagline': df['tagline'].notna().astype(int),
        'is_collection': df['belongs_to_collection'].notna().astype(int),

        # Entry counts taken from the stringified list fields
        'num_genres': _count_entries('genres'),
        'num_production_companies': _count_entries('production_companies'),
        'num_spoken_languages': _count_entries('spoken_languages'),
    })

    # Ideas for further features:
    # - one-hot encoding of the most frequent genres
    # - top-billed cast/director signals (needs external data or heavier
    #   processing; watch out for target leakage)
    # - length of the overview text
    # - log-scaled budget: np.log1p(features['budget'])

    return features

# 3. Run the feature-engineering function on both splits (train first, then test)
train_features, test_features = extract_features(train_df), extract_features(test_df)

# 4. Assemble the model inputs
# Columns actually fed to the model; extend this list as more features are added.
feature_columns = [
    'budget', 'popularity', 'runtime',
    'release_year', 'release_month',
    'has_homepage', 'has_tagline', 'is_collection',
    'num_genres', 'num_production_companies', 'num_spoken_languages',
]

# Impute missing values with per-column medians taken from the training set
# only, and reuse those same statistics for the test set.
train_medians = train_features[feature_columns].median()
X = train_features[feature_columns].fillna(train_medians)
X_test = test_features[feature_columns].fillna(train_medians)

# Train on log1p(revenue) so that RMSE on the target corresponds to RMSLE on revenue.
y = np.log1p(train_df['revenue'])

# 5. Hold out 20% of the training rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 6. Configure and fit the LightGBM regressor
lgb_train = lgb.Dataset(X_train, label=y_train)
# The validation set references the training set so both share the same binning.
lgb_eval = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

# Up to 1000 boosting rounds; stop once the validation metric has not
# improved for 20 consecutive rounds.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=20)],
)

# 7. Validate the model and report RMSLE.
# The model was trained on log1p(revenue), so its raw predictions are already
# in log space: the RMSE between them and y_val *is* the RMSLE on the original
# revenue scale (log1p(expm1(p)) == p). No round trip through the original
# scale is needed to compute the metric.
val_pred_log = gbm.predict(X_val, num_iteration=gbm.best_iteration)
val_pred = np.expm1(val_pred_log)  # predictions back on the original revenue scale
rmsle = np.sqrt(mean_squared_error(y_val, val_pred_log))
print(f"Validation RMSLE: {rmsle:.5f}")

# 8. Predict on the test split, undo the log1p transform, and write the submission file
test_log_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
test_pred = np.expm1(test_log_pred)
submission = test_df[['id']].assign(revenue=test_pred)
submission.to_csv('submission.csv', index=False)

  df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
  df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[63]	valid_0's rmse: 2.07476
