In [17]:
import pandas as pd
import datetime as dt
data = pd.read_csv("yt_final.csv", encoding = 'utf8')

In [18]:
data['publish_date'] = pd.to_datetime(data['publish_date'])
data['channel_startdate'] = pd.to_datetime(data['channel_startdate'])
current_date = pd.Timestamp('now')
data['days_since_publish'] = (current_date - data['publish_date']).dt.days
data['channel_age_days'] = (current_date - data['channel_startdate']).dt.days

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [20]:
# 設定特徵及目標變量
features = data.drop(['views','channel_name', 'video_title', 'channel_startdate','cluster','likes', 'comments','publish_date','channel_totalviews(10K)'], axis = 1)
target = data['views']


In [21]:
# Apply One-Hot Encoding to 'category'
one_hot_encoder = OneHotEncoder(sparse=False)
category_encoded = one_hot_encoder.fit_transform(features[['category']])
category_encoded_df = pd.DataFrame(category_encoded, columns=one_hot_encoder.get_feature_names(['category']))

# Drop the original 'category' column and concatenate the one-hot encoded dataframe
features = features.drop('category', axis=1)
features_encoded = pd.concat([features.reset_index(drop=True), category_encoded_df], axis=1)

features_encoded.head()  # Display the first few rows of the processed features



Unnamed: 0,duration,subscribers,Sensationalism_Score,days_since_publish,channel_age_days,category_Comedy,category_Education,category_Entertainment,category_Film & Animation,category_Gaming,category_Howto & Style,category_Music,category_News & Politics,category_Nonprofits & Activism,category_People & Blogs,category_Pets & Animals,category_Science & Technology,category_Sports,category_Travel & Events
0,943,398000,16,24,4074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1285,398000,16,35,4074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1280,398000,16,45,4074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,975,398000,22,59,4074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,798,398000,16,66,4074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [22]:
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.2)
# 建立RandomForestRegressor模型
model = GradientBoostingRegressor(random_state=42)
# 定義要搜索的参数分布
param_distributions = {
    'n_estimators': randint(50, 400), # 樹的數量，在50到400之間隨機取值
    'learning_rate': uniform(0.01, 0.2), # 學習率，在0.01到0.2之間隨機取值
    'max_depth': randint(3, 10), # 樹的最大深度，在3到10之間隨機取值
    'subsample': uniform(0.7, 0.3), # 用於擬合的訓練數據的比例，在0.7到1.0之間隨機取值
    'min_samples_split': randint(2, 10) # 拆分内部節點所需的最少樣本數，在2到10之間隨機取值
} 
# 建立RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=100, cv=5, scoring='r2', n_jobs=-1, random_state=42)
# 執行隨機搜索
random_search.fit(X_train, y_train)

# 最佳參數組合與評分
best_parameters = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_parameters}")
print(f"Best Score (r2): {best_score}")

Best Parameters: {'learning_rate': 0.07161215837047784, 'max_depth': 6, 'min_samples_split': 4, 'n_estimators': 71, 'subsample': 0.7731968930137251}
Best Score (r2): 0.8121992369064627


In [23]:
# 使用這些參數初始化一個新的 GradientBoostingRegressor
optimized_model = GradientBoostingRegressor(
    n_estimators=best_parameters['n_estimators'],
    learning_rate=best_parameters['learning_rate'],
    max_depth=best_parameters['max_depth'],
    subsample=best_parameters['subsample'],
    min_samples_split=best_parameters['min_samples_split'],
    random_state=42
)
# 訓練模型
optimized_model.fit(X_train, y_train)
#進行預測
predicted = optimized_model.predict(X_test)
# 評估模型性能
optimized_score = optimized_model.score(X_test, y_test)
print(f"Optimized Model Score (r2) on Test Data: {optimized_score}")

Optimized Model Score (r2) on Test Data: 0.8529520387184277


### 特徵重要性

In [25]:
import plotly.graph_objects as go
feature_importances = optimized_model.feature_importances_
feature_names = X_train.columns

# 建立特徵重要性的DataFrame
importances_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importances_df = importances_df.sort_values(by='importance', ascending=False)

# 建立圖表
fig = go.Figure(go.Bar(
    x=importances_df['feature'][:10],
    y=importances_df['importance'][:10],
    text=importances_df['importance'][:10].apply(lambda x: f'{x:.2f}'),
    textposition='outside',
    marker_color='#87ceeb',
    width=0.6
))

# 調整布局
fig.update_layout(
    title={
        'text': 'Top 10 Feature Importances in Gradient Boosting Regressor Model',
        'y':0.98,  # 標題在垂直方向的位置
        'x':0.5,  # 標題在水平方向的位置
        'xanchor': 'center',  # 確保標題的中心在 x 的位置
        'yanchor': 'top'      # 確保標題的底部在 y 的位置
    },
    xaxis=dict(
        title='Feature',
        tickangle=-45,
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        title='Importance',
        range=[0, 1.1 * max(importances_df['importance'][:10])]
    ),
    margin=dict(t=50),  # 可能需要根據標題的大小調整邊距
    showlegend=False,
    bargap=0.05,
    width=600, # 調整圖表的寬度
)

# 顯示圖表
fig.show()

### 實際值vs預測值

In [26]:
import plotly.express as px

fig = px.scatter(x=y_test, y=predicted , labels={'x': 'Actual Views', 'y': 'Predicted Views'},
                 title='Actual vs Predicted Views', width=600, color_discrete_sequence=['#87ceeb'], opacity=0.7)
fig.add_shape(type='line', 
              x0=y_test.min(), y0=y_test.min(), 
              x1=y_test.max(), y1=y_test.max(),
              line=dict(color='gray', dash='dash'))
fig.update_layout(
    title={
        'y':0.85,  # 標題在垂直方向的位置
        'x':0.5,  # 標題在水平方向的位置
        'xanchor': 'center',  # 確保標題的中心在 x 的位置
        'yanchor': 'top'      # 確保標題的底部在 y 的位置
    })

fig.show()