In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model  import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV

### DATA挑選

In [2]:
df8=pd.read_csv('fin_imdb_merge_df_clean.csv')

In [3]:
df=pd.read_csv('df_condensed.csv')

In [4]:
# genres_freq編碼 將製片商與電影種類按出現比例編碼(直接照原本的DATA編碼 不做另外挑選處理)
freq=df8['production_companies'].value_counts(normalize=True)
df8['production_companies_freq'] = df8['production_companies'].map(freq)
freq1=df8['genres'].value_counts(normalize=True)
df8['genres_freq'] =df8['genres'].map(freq1)

In [5]:
#合併欄位
merged_df = pd.merge(df, df8, on='movie_id', how='inner')

In [6]:
# 把電影年份挑到2000年後
df2=merged_df[merged_df['year_x']>2000]

In [7]:
# 挑選X要用的欄位 並將部分空職補上
df3=df2[['sequal_x','reunion_holiday','non_reunion_holiday','budget_transfer_x','production_companies_freq','genres_freq']]

In [102]:
#  Encoding categorical features
#pd.Categorical(df8['casts']).codes.shape

In [31]:
# 數值版的y
# df1['make_money1']=df1['revenue_transfer']- df1['budget_transfer']
# df1=df.copy()
# X = df1.drop(['movie_id','imdb_id','title_sum_reg_new','release_date','revenue_transfer','year'] ,axis=1)
X=df3
y=df2['revenue_transfer_x']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=48)
# scaler = preprocessing.StandardScaler().fit(X_train)
# X_train = scaler.transform(X_train)

In [105]:
# 挑選相關係數高的特徵
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
# 選擇相關的欄位
df_selected = df3

# 檢查特徵之間的相關性 皮爾森相關係數 取絕對值
corr_matrix = df_selected.corr().abs()

# # 刪除相關性大於閾值的特徵
to_drop = []

to_drop = list(set(col for col in corr_matrix.columns
                    if col not in to_drop and any(corr_matrix[col].drop(index=[col]+to_drop) > 0.7)))
# 先註解掉 不刪除 在下面看一下
# X = df_selected.drop(to_drop, axis=1).values





In [106]:
# 相關係數高的變數
to_drop

[]

### LASSO

In [107]:
# 數值版
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 建立 Lasso 模型
lasso = Lasso(alpha=0.1)

# 擬合模型
lasso.fit(X, y)

# 印出選擇的特徵和它們的權重
selected_features = df3.columns[lasso.coef_ != 0]
feature_weights = lasso.coef_[lasso.coef_ != 0]
print("Selected features: ", selected_features)
print("Feature weights: ", feature_weights)

Selected features:  Index(['sequal_x', 'reunion_holiday', 'non_reunion_holiday',
       'budget_transfer_x', 'production_companies_freq', 'genres_freq'],
      dtype='object')
Feature weights:  [4.92891146e+07 1.62959295e+06 2.11152217e+06 1.64939529e+08
 4.68691913e+06 7.03521016e+06]


### 隨機森林挑選特徵

In [108]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.datasets import make_classification, make_regression
from sklearn.feature_selection import SelectFromModel


# 選擇問題類型（類別或數值型）
problem_type = "regression" # 選擇 "classification" 或 "regression"

if problem_type == "classification":
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
else:
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# 訓練隨機森林模型
rf_model.fit(X, y)

# 獲得特徵重要性
importances = rf_model.feature_importances_

# 特徵重要性排序
indices = np.argsort(importances)[::-1]

# 列印特徵重要性
print("特徵重要性排名：")
for idx, f in enumerate(indices):
    print(f"{idx+1}. feature {f} ({importances[f]:.4f})")

# 篩選重要特徵
sfm = SelectFromModel(rf_model, threshold=0.05)  # 可根據需求調整閾值
sfm.fit(X, y)
X_important = sfm.transform(X)

# 列印篩選後的特徵維度
print(f"篩選後的特徵維度： {X_important.shape[1]}")


特徵重要性排名：
1. feature 3 (0.7486)
2. feature 0 (0.0927)
3. feature 5 (0.0909)
4. feature 4 (0.0533)
5. feature 2 (0.0110)
6. feature 1 (0.0037)
篩選後的特徵維度： 4


### 模型建置

In [109]:
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

# Define the models
lr_model = LinearRegression()
xgb_model = xgb.XGBRegressor(max_depth=8,n_estimators=2000,learning_rate=0.008)
svm_model = SVR()
tree_model = DecisionTreeRegressor()

# Train the models on the training data
lr_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)

# Make predictions on the test data
lr_preds = lr_model.predict(X_test)
xgb_preds = xgb_model.predict(X_test)
svm_preds = svm_model.predict(X_test)
tree_preds = tree_model.predict(X_test)

# Evaluate the models using R2 score
lr_r2 = r2_score(y_test, lr_preds)
xgb_r2 = r2_score(y_test, xgb_preds)
svm_r2 = r2_score(y_test, svm_preds)
tree_r2 = r2_score(y_test, tree_preds)

# Print the R2 scores
print('LinearRegression R2 score:', lr_r2)
print('XGBoost R2 score:', xgb_r2)
print('SVM R2 score:', svm_r2)
print('Decision Tree R2 score:', tree_r2)


LinearRegression R2 score: 0.5907178356498386
XGBoost R2 score: 0.6337803093268108
SVM R2 score: -0.13021697979998836
Decision Tree R2 score: 0.4861294033856256


### 挑選更少特徵進行比較

In [110]:
X=df2[['sequal_x','budget_transfer_x','production_companies_freq','genres_freq']]
y=df2['revenue_transfer_x']
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=48)

In [111]:
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

# Define the models
lr_model = LinearRegression()
xgb_model = xgb.XGBRegressor(max_depth=8,n_estimators=2000,learning_rate=0.008)
svm_model = SVR()
tree_model = DecisionTreeRegressor()

# Train the models on the training data
lr_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)

# Make predictions on the test data
lr_preds = lr_model.predict(X_test)
xgb_preds = xgb_model.predict(X_test)
svm_preds = svm_model.predict(X_test)
tree_preds = tree_model.predict(X_test)

# Evaluate the models using R2 score
lr_r2 = r2_score(y_test, lr_preds)
xgb_r2 = r2_score(y_test, xgb_preds)
svm_r2 = r2_score(y_test, svm_preds)
tree_r2 = r2_score(y_test, tree_preds)

# Print the R2 scores
print('LinearRegression R2 score:', lr_r2)
print('XGBoost R2 score:', xgb_r2)
print('SVM R2 score:', svm_r2)
print('Decision Tree R2 score:', tree_r2)


LinearRegression R2 score: 0.5907370059832607
XGBoost R2 score: 0.6366232660854804
SVM R2 score: -0.13021695630246133
Decision Tree R2 score: 0.4879621626062238


### 再次挑選特徵 LASSO、隨機森林

In [112]:
# 數值版
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 建立 Lasso 模型
lasso = Lasso(alpha=0.1)

# 擬合模型
lasso.fit(X, y)

# 打印選擇的特徵和它們的權重 這邊應該還要再改，不然每次都要把新挑選的特徵重新塞進來
selected_features = df2[['sequal_x','budget_transfer_x','production_companies_freq','genres_freq']].columns[lasso.coef_ != 0]
feature_weights = lasso.coef_[lasso.coef_ != 0]
print("Selected features: ", selected_features)
print("Feature weights: ", feature_weights)

Selected features:  Index(['sequal_x', 'budget_transfer_x', 'production_companies_freq',
       'genres_freq'],
      dtype='object')
Feature weights:  [4.92264033e+07 1.64835568e+08 4.68101562e+06 7.04265694e+06]


In [113]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.datasets import make_classification, make_regression
from sklearn.feature_selection import SelectFromModel


# 選擇問題類型（類別或數值型）
problem_type = "regression" # 選擇 "classification" 或 "regression"

if problem_type == "classification":
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
else:
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# 訓練隨機森林模型
rf_model.fit(X, y)

# 獲得特徵重要性
importances = rf_model.feature_importances_

# 特徵重要性排序
indices = np.argsort(importances)[::-1]

# 列印特徵重要性
print("特徵重要性排名：")
for idx, f in enumerate(indices):
    print(f"{idx+1}. feature {f} ({importances[f]:.4f})")

# 篩選重要特徵
sfm = SelectFromModel(rf_model, threshold=0.05)  # 可根據需求調整閾值
sfm.fit(X, y)
X_important = sfm.transform(X)

# 列印篩選後的特徵維度
print(f"篩選後的特徵維度： {X_important.shape[1]}")


特徵重要性排名：
1. feature 1 (0.7568)
2. feature 3 (0.0960)
3. feature 0 (0.0929)
4. feature 2 (0.0543)
篩選後的特徵維度： 4


### 最後選用XGBOOST 調整參數


In [None]:
from sklearn.model_selection import train_test_split
df3=df2[['sequal_x','reunion_holiday','non_reunion_holiday','budget_transfer_x','production_companies_freq','genres_freq']]

X=df3
y=df2['revenue_transfer_x']
# 將樣本分為三份 除訓練模型用的Train test外，Val用來驗證調整參數結果 
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import xgboost as xgb

# 訓練模型

params = {'max_depth': 16, 'n_estimators': 2000, 'learning_rate': 0.001, 'early_stopping_rounds': 10, 'eval_metric': 'rmse','random_state' :42,'booster' :'dart'}
xgb_model = xgb.XGBRegressor(**params)
xgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0)

# 預測結果
y_train_pred = xgb_model.predict(X_train)
y_val_pred = xgb_model.predict(X_val)
y_test_pred = xgb_model.predict(X_test)

# 計算R²
r2_train = r2_score(y_train, y_train_pred)
r2_val = r2_score(y_val, y_val_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f"R² on training set: {r2_train:.2f}")
print(f"R² on validation set: {r2_val:.2f}")
print(f"R² on testing set: {r2_test:.2f}")


# 繪製訓練和驗證損失
evals_result = xgb_model.evals_result()
plt.plot(evals_result["validation_0"]["rmse"], label="Train Loss")
plt.plot(evals_result["validation_1"]["rmse"], label="Validation Loss") 
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.title("Train and Validation Loss vs Iteration")
plt.legend()
plt.show()


### 最終模型定案的話要存檔供FLASK使用(待模型最終超參數調整)

In [74]:
import pickle

# 模型存檔
with open('lr_model.pkl', 'wb') as file:
    pickle.dump(lr_model, file)

In [77]:
import joblib

# 載入模型
lr_model = joblib.load('lr_model.pkl')

# 輸入特徵 (預算、評分)
feature = [[8000000,8]]  
# 進行預測
prediction = lr_model.predict(feature)

print(prediction)

[32390664.62236005]


