In [25]:
import pandas as pd
import datetime as dt
data = pd.read_csv("yt_final.csv", encoding = 'utf8')

In [26]:
data['publish_date'] = pd.to_datetime(data['publish_date'])
data['channel_startdate'] = pd.to_datetime(data['channel_startdate'])
current_date = pd.Timestamp('now')
data['days_since_publish'] = (current_date - data['publish_date']).dt.days
data['channel_age_days'] = (current_date - data['channel_startdate']).dt.days

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

In [28]:
# 設定特徵及目標變量
features = data.drop(['views','channel_name', 'video_title', 'channel_startdate','cluster','likes', 'comments','publish_date','channel_totalviews(10K)'], axis = 1)
target = data['views']


In [29]:
# Apply One-Hot Encoding to 'category'
one_hot_encoder = OneHotEncoder(sparse=False)
category_encoded = one_hot_encoder.fit_transform(features[['category']])
category_encoded_df = pd.DataFrame(category_encoded, columns=one_hot_encoder.get_feature_names(['category']))

# Drop the original 'category' column and concatenate the one-hot encoded dataframe
features = features.drop('category', axis=1)
features_encoded = pd.concat([features.reset_index(drop=True), category_encoded_df], axis=1)

features_encoded.head()  # Display the first few rows of the processed features


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



Unnamed: 0,duration,subscribers,Sensationalism_Score,days_since_publish,channel_age_days,category_Comedy,category_Education,category_Entertainment,category_Film & Animation,category_Gaming,category_Howto & Style,category_Music,category_News & Politics,category_Nonprofits & Activism,category_People & Blogs,category_Pets & Animals,category_Science & Technology,category_Sports,category_Travel & Events
0,943,398000,16,24,4074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1285,398000,16,35,4074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1280,398000,16,45,4074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,975,398000,22,59,4074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,798,398000,16,66,4074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [30]:
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.2)
# 建立RandomForestRegressor模型
randomForestModel = RandomForestRegressor(random_state=42) #森林中數目數量：100 #評估切割指標：mse/mae
# 使用訓練資料訓練模型
randomForestModel.fit(X_train, y_train)
# 使用訓練資料預測
predicted=randomForestModel.predict(X_test)

In [31]:
print('訓練集: ',randomForestModel.score(X_train,y_train))
print('測試集: ',randomForestModel.score(X_test,y_test))
mse = mean_squared_error(y_test, predicted)
print(f"Mean Squared Error: {mse}")

訓練集:  0.9692406961002942
測試集:  0.8287996575779121
Mean Squared Error: 204239678322.62057


### 特徵重要性

In [None]:
import plotly.graph_objects as go
feature_importances = randomForestModel.feature_importances_
feature_names = X_train.columns

# 創建特徵重要性的DataFrame
importances_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importances_df = importances_df.sort_values(by='importance', ascending=False)

# 創建圖表
fig = go.Figure(go.Bar(
    x=importances_df['feature'][:10],
    y=importances_df['importance'][:10],
    text=importances_df['importance'][:10].apply(lambda x: f'{x:.2f}'),
    textposition='outside',
    marker_color='#87ceeb',
    width=0.6
))

# 調整布局
fig.update_layout(
    title={
        'text': 'Top 10 Feature Importances in Random Forest Model',
        'y':0.98,  # 標題在垂直方向的位置
        'x':0.5,  # 標題在水平方向的位置
        'xanchor': 'center',  # 確保標題的中心在 x 的位置
        'yanchor': 'top'      # 確保標題的底部在 y 的位置
    },
    xaxis=dict(
        title='Feature',
        tickangle=-45,
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        title='Importance',
        range=[0, 1.1 * max(importances_df['importance'][:10])]
    ),
    margin=dict(t=50),  # 可能需要根據標題的大小調整邊距
    showlegend=False,
    bargap=0.05,
    width=600, # 調整圖表的寬度
)

# 顯示圖表
fig.show()

### 實際值vs預測值

In [None]:
import plotly.express as px

# Assuming y_test and y_pred are already defined
fig = px.scatter(x=y_test, y=predicted , labels={'x': 'Actual Views', 'y': 'Predicted Views'},
                 title='Actual vs Predicted Views', width=600, color_discrete_sequence=['#87ceeb'], opacity=0.7)
fig.add_shape(type='line', 
              x0=y_test.min(), y0=y_test.min(), 
              x1=y_test.max(), y1=y_test.max(),
              line=dict(color='gray', dash='dash'))
fig.update_layout(
    title={
        'y':0.85,  # 標題在垂直方向的位置
        'x':0.5,  # 標題在水平方向的位置
        'xanchor': 'center',  # 確保標題的中心在 x 的位置
        'yanchor': 'top'      # 確保標題的底部在 y 的位置
    })

fig.show()