In [1]:
import pandas as pd

# 讀取CSV文件
file_path = r'C:\Users\User\Dropbox\Movie\Movie.csv'
data = pd.read_csv(file_path)

# 計算描述性統計數據
audience_stats = data['Audience'].describe()
print(audience_stats)


count    74232.000000
mean        82.741958
std        175.138780
min          0.000000
25%          9.000000
50%         27.000000
75%         82.000000
max       4406.000000
Name: Audience, dtype: float64


In [2]:
missing_value_columns = [
    'No_Movie_Length', 'No_Budget', 'No_IMDb', 'No_Oscar', 'Class_#N/A',
    'No_Cannes', 'No_GHA', 'No_TFF', 'TFF_NomData', 'No_Weather', 'Subsidy_Yes'
]

available_columns = [col for col in missing_value_columns if col in data.columns]
print("Available columns with missing value indicators:", available_columns)


Available columns with missing value indicators: ['No_Movie_Length', 'No_Budget', 'No_IMDb', 'No_Oscar', 'Class_#N/A', 'No_Cannes', 'No_GHA', 'No_TFF', 'TFF_NomData', 'No_Weather', 'Subsidy_Yes']


In [4]:
# 線性回歸
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 讀取CSV文件
data = pd.read_csv(file_path)

# 選擇特徵和目標變數
features = [  
    'PlayID', 'MovieID', 'PlayWeek', 'Weekend', 'Play_nWeek', 'PlayMonth', 'PlayYear', 'ViewDay', 'CutBack', 'GHA_View_Code', 'TFF_View_Code', 
    'Theater_Class', 'Tempture', 'Rainfall', 'MovieFest', 'Rated', 'Class_Darma', 'Class_Comedy', 'Class_Action', 'Class_Documentary', 
    'Class_Animation', 'Class_Other', 'Movie_Length', 'Director_Award', 'Producer_Famous', 'Distrbutor', 'Actor_Awardall', 
    'Audience', 'Oscar_Nom', 'Oscar_AwardTotal', 'Cannes_Nom', 'Cannes_AwardTotal', 'GHA_Nom', 'GHA_AwardTotal', 'TFF_Nom', 'TFF_AwardTotal', 
    'view_afhorse', 'view_aftaipei', 'year_2018', 'year_2019', 'year_2020', 'year_2021', 'year_2022', 'month_2', 'month_3', 'month_4', 'month_5', 
    'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'view_week_1', 'view_week_2', 'view_week_3', 'view_week_4', 
    'view_week_58', 'view_week_912', 'theater_1', 'theater_2', 'theater_3', 'theater_4', 'rate_1', 'rate_2', 'rate_3', 'rate_4', 'distributor_1', 
    'distributor_2', 'distributor_3', 'lnsubsidy', 'lnbudget'
]

X = data[features]
y = data['IMDb_rating']

# 將數據分為訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 訓練線性迴歸模型
model = LinearRegression()
model.fit(X_train, y_train)

# 預測測試集
y_pred = model.predict(X_test)

# 評估模型
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 2.1805572964468394
R^2 Score: 0.4080285681885817


In [5]:
# 多項式特徵
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# 創建多項式特徵
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

# 分割數據
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# 訓練模型
model = LinearRegression()
model.fit(X_train, y_train)

# 預測
y_pred = model.predict(X_test)

# 評估模型
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 4.8862396808823405
R^2 Score: -0.3265023142381822


In [6]:
# 隨機森林
from sklearn.ensemble import RandomForestRegressor

# 分割數據
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 訓練隨機森林迴歸模型
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 預測
y_pred = model.predict(X_test)

# 評估模型
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 0.01842124658180104
R^2 Score: 0.9949990528877415


In [6]:
# 對數轉換
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 讀取CSV文件
data = pd.read_csv(file_path)

# 處理缺失值
data.fillna(0, inplace=True)

# 對Audience進行對數變換
data['Log_Audience'] = np.log1p(data['Audience'])  # 使用log1p來避免log(0)的情況

# 選擇特徵和目標變數
features = [
    'No_Movie_Length', 'No_Budget', 'No_IMDb', 'No_Oscar', 'Class_#N/A',
    'No_Cannes', 'No_GHA', 'No_TFF', 'TFF_NomData', 'No_Weather', 'Subsidy_Yes'
]
X = data[features]
y = data['Log_Audience']

# 將數據分為訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 訓練線性迴歸模型
model = LinearRegression()
model.fit(X_train, y_train)

# 預測測試集
y_pred = model.predict(X_test)

# 反變換預測結果
y_test_exp = np.expm1(y_test)  # 將log變換後的值反變換回來
y_pred_exp = np.expm1(y_pred)

# 評估模型
mse = mean_squared_error(y_test_exp, y_pred_exp)
r2 = r2_score(y_test_exp, y_pred_exp)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')



Mean Squared Error: 35874.37459525577
R^2 Score: -0.07242413665924707


In [7]:
# 隨機森林
from sklearn.ensemble import RandomForestRegressor

# 選擇特徵和目標變數
X = data[features]
y = data['Audience']

# 將數據分為訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 訓練隨機森林迴歸模型
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 預測測試集
y_pred = model.predict(X_test)

# 評估模型
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 31691.152336721447
R^2 Score: 0.052628594424619646


In [9]:
# 添加多項式和交互特徵

from sklearn.preprocessing import PolynomialFeatures

# 添加多項式和交互特徵
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X)

# 分割數據
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# 訓練模型
model = LinearRegression()
model.fit(X_train, y_train)

# 預測
y_pred = model.predict(X_test)

# 評估模型
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 31828.114727512293
R^2 Score: 0.048534257579564644


In [10]:
# 超參數調整
from sklearn.model_selection import GridSearchCV

# 定義參數範圍
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 訓練隨機森林模型並進行超參數調整
model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)

# 最佳參數
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# 使用最佳參數進行預測
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# 評估模型
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Mean Squared Error: 31691.252397666125
R^2 Score: 0.05262560321506071


In [11]:
# XGBOOST

from xgboost import XGBRegressor

# 訓練XGBoost模型
model = XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

# 預測
y_pred = model.predict(X_test)

# 評估模型
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 31695.219760070868
R^2 Score: 0.052507003371865535


In [12]:
# 特徵工程
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

# 讀取CSV文件
data = pd.read_csv(file_path)

# 處理缺失值
data.fillna(0, inplace=True)

# 添加更多特徵
data['Audience_Budget_Ratio'] = data['Audience'] / (data['No_Budget'] + 1)
data['Log_Audience'] = np.log1p(data['Audience'])  # 使用log1p來避免log(0)的情況

# 選擇特徵和目標變數
features = [
    'No_Movie_Length', 'No_Budget', 'No_IMDb', 'No_Oscar', 'Class_#N/A',
    'No_Cannes', 'No_GHA', 'No_TFF', 'TFF_NomData', 'No_Weather', 'Subsidy_Yes',
    'Audience_Budget_Ratio'
]
X = data[features]
y = data['Log_Audience']

# 將數據分為訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 使用多項式特徵
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# 訓練線性迴歸模型
model = LinearRegression()
model.fit(X_train_poly, y_train)

# 預測
y_pred = model.predict(X_test_poly)

# 反變換預測結果
y_test_exp = np.expm1(y_test)
y_pred_exp = np.expm1(y_pred)

# 評估模型
mse = mean_squared_error(y_test_exp, y_pred_exp)
r2 = r2_score(y_test_exp, y_pred_exp)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 1.5256148374007808e+16
R^2 Score: -456065420883.14215


In [13]:
# 集成方法
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# 構建個體模型
rf_model = RandomForestRegressor(n_estimators=200, max_depth=None, min_samples_split=5, min_samples_leaf=1, random_state=42)
xgb_model = XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)

# 構建集成模型
ensemble_model = VotingRegressor(estimators=[('rf', rf_model), ('xgb', xgb_model)])
ensemble_model.fit(X_train, y_train)

# 預測
y_pred = ensemble_model.predict(X_test)

# 反變換預測結果
y_test_exp = np.expm1(y_test)
y_pred_exp = np.expm1(y_pred)

# 評估模型
mse = mean_squared_error(y_test_exp, y_pred_exp)
r2 = r2_score(y_test_exp, y_pred_exp)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 370.32093181421527
R^2 Score: 0.9889296716648482


In [14]:
# 進一步調參
from sklearn.model_selection import GridSearchCV

# 定義參數範圍
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'learning_rate': [0.01, 0.1, 0.2]
}

# 使用XGBoost進行超參數調整
model = XGBRegressor(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)

# 最佳參數
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# 使用最佳參數進行預測
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# 反變換預測結果
y_test_exp = np.expm1(y_test)
y_pred_exp = np.expm1(y_pred)

# 評估模型
mse = mean_squared_error(y_test_exp, y_pred_exp)
r2 = r2_score(y_test_exp, y_pred_exp)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

Parame

Best Parameters: {'learning_rate': 0.1, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Mean Squared Error: 1105.0647821178231
R^2 Score: 0.9669653294786087


Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

