In [None]:
import pandas as pd
import numpy as np
import logging
import re
import matplotlib.pyplot as plt
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import joblib
import os

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Load & Preprocess Data ---
def load_and_preprocess_data(filepath):
    """Load the videos CSV and return a cleaned DataFrame.

    Processing steps, in order:
      1. Keep only rows whose title is more than 90% ASCII characters.
      2. Drop identifier / engagement columns that are not used as features.
      3. Parse ``publishedAt`` and derive ``year``, ``month`` and ``date``
         (``date`` holds the day of the month).
      4. Drop duplicate rows and rows with any missing value.
      5. Cast ``comments_disabled`` and ``ratings_disabled`` to int.

    Args:
        filepath: Anything accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The cleaned frame.
    """
    df = pd.read_csv(filepath)

    def is_english(title):
        # read_csv yields NaN (a float) for empty cells, and an empty string
        # would divide by zero below. Neither can be classified as English;
        # rows with a missing title are removed by dropna() later anyway.
        if not isinstance(title, str) or not title:
            return False
        cleaned = re.sub(r'[^\x00-\x7F]+', '', title)
        return len(cleaned) / len(title) > 0.9

    print("remove non english")

    # astype(bool) keeps the mask boolean even when the frame is empty.
    df = df[df['title'].apply(is_english).astype(bool)]
    df = df.drop(['video_id', 'channelId', 'trending_date', 'likes', 'dislikes',
                'thumbnail_link', 'comment_count'], axis=1)
    df['publishedAt'] = pd.to_datetime(df['publishedAt'])
    df['year'] = df['publishedAt'].dt.year
    df['month'] = df['publishedAt'].dt.month
    df['date'] = df['publishedAt'].dt.day
    df = df.drop_duplicates().dropna()
    df['comments_disabled'] = df['comments_disabled'].astype(int)
    df['ratings_disabled'] = df['ratings_disabled'].astype(int)
    return df

# --- Outlier Removal ---
def remove_outliers_iqr(df, skip_columns=('view_count',), iqr_multiplier=1.5):
    """Drop rows that fall outside the IQR fence of any numeric column.

    Columns are processed one after another on the progressively filtered
    frame, so the quartiles of a later column are computed only from rows
    that survived the earlier columns. Rows holding NaN in a checked column
    are dropped as well, because NaN never lies within the bounds.

    Args:
        df: Input frame; it is not modified.
        skip_columns: Name or iterable of names of numeric columns that must
            not be filtered. Defaults to the target column ``view_count``.
        iqr_multiplier: Width of the fence in IQR units (1.5 by default).

    Returns:
        pandas.DataFrame: A filtered copy of ``df``.
    """
    # Accept a bare string so 'view_count' is not split into characters.
    if isinstance(skip_columns, str):
        skip_columns = (skip_columns,)
    skipped = set(skip_columns)

    df_filtered = df.copy()
    for column in df_filtered.select_dtypes(include=['number']).columns:
        if column in skipped:
            continue
        q1 = df_filtered[column].quantile(0.25)
        q3 = df_filtered[column].quantile(0.75)
        fence = iqr_multiplier * (q3 - q1)
        # between() is inclusive on both ends, matching `>= lower & <= upper`.
        df_filtered = df_filtered[df_filtered[column].between(q1 - fence, q3 + fence)]
    return df_filtered

# --- Word2Vec Feature Scoring ---
def compute_word2vec_scores(df, text_columns, target_column, vector_size=100, save_path='models'):
    """Add one ``<column>_score`` feature per text column using saved models.

    For every column in ``text_columns`` each value is tokenized, the
    Word2Vec vectors of its known tokens are averaged, and the averaged
    vector is fed to the Ridge model stored at
    ``<save_path>/<column>_ridge_model.pkl``. The prediction is written to
    ``<column>_score`` in the returned frame. Nothing is trained here: both
    the Word2Vec model and the Ridge models are loaded from ``save_path``.

    Args:
        df: Input frame; it is not modified.
        text_columns: Names of the text columns to score. The ``tags`` column
            is split on ``|``; every other column goes through gensim's
            ``simple_preprocess``.
        target_column: Unused while scoring with pre-trained models; kept so
            existing callers keep working.
        vector_size: Fallback length of the zero vector used for texts without
            any known token, in case the loaded model does not expose its own
            ``vector_size``.
        save_path: Directory holding ``word2vec_model.pkl`` and the per-column
            Ridge models.

    Returns:
        pandas.DataFrame: Copy of ``df`` with the score columns added.
    """
    result_df = df.copy()

    os.makedirs(save_path, exist_ok=True)

    def tokenize(col, value):
        # str() guards against NaN / non-string cells, which would otherwise
        # make simple_preprocess raise.
        text = str(value)
        if col == 'tags':
            return [tag.strip('"').lower() for tag in text.split('|')]
        return simple_preprocess(text)

    logging.info('Loading Word2Vec...')
    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code; only load model files from a trusted location.
    w2v_model = joblib.load(os.path.join(save_path, 'word2vec_model.pkl'))

    # Use the dimension of the loaded model so zero vectors always match the
    # real embeddings, whatever `vector_size` the caller passed.
    embedding_dim = getattr(w2v_model.wv, 'vector_size', vector_size)

    def get_avg_vector(tokens):
        vectors = [w2v_model.wv[token] for token in tokens if token in w2v_model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim)

    for col in text_columns:
        logging.info(f"Scoring: {col}")
        vectors = np.array([get_avg_vector(tokenize(col, value)) for value in df[col]])

        ridge_path = os.path.join(save_path, f'{col}_ridge_model.pkl')
        ridge = joblib.load(ridge_path)
        result_df[f'{col}_score'] = ridge.predict(vectors)

    return result_df

In [None]:
# Read the CSV and run the cleaning steps of load_and_preprocess_data
# (ASCII-title filter, column drops, date parts, de-duplication, flag casts).
df_all = load_and_preprocess_data('new_data_v1346_processed.csv')

In [None]:
# Text columns that each receive a `<column>_score` feature.
text_columns = ['title', 'tags', 'description'] 
# Scores are produced by the models stored under ./models; 'view_count' is
# passed as the target column name.
df_all = compute_word2vec_scores(df_all, text_columns, 'view_count', save_path='models')

In [None]:
# Index labels of every row whose title contains "short" (case-insensitive).
title_has_short = df_all['title'].map(lambda title: 'short' in str(title).lower()).astype(bool)
rows_to_drop = df_all.index[title_has_short.to_numpy()].tolist()

# NOTE(review): the filtered frame is bound to `df`, while the following cells
# keep working on `df_all` - confirm whether `df_all` was meant here.
df = df_all.drop(rows_to_drop)

In [None]:
from sklearn.preprocessing import LabelEncoder
# The original channelId column was dropped during loading; rebuild it as an
# integer label derived from the channel title.
df_all['channelId'] = LabelEncoder().fit_transform(df_all['channelTitle'])
df_all['channelId'] = df_all['channelId'].astype(int)

In [None]:
import json

# Mapping channel title -> encoded integer id. tolist() yields plain Python
# ints, which json can serialise.
channels_dict = dict(zip(df_all['channelTitle'].tolist(), df_all['channelId'].tolist()))

# Persist the mapping so encoded ids can be looked up later.
with open('channels.json', 'w') as f:
    json.dump(channels_dict, f, indent=4)

In [None]:
# --- Tag Count Feature ---
# Replace the pipe-separated tag string with a list of tags (surrounding
# double quotes removed), then count the entries.
df_all['tags'] = df_all['tags'].astype(str).map(
    lambda raw: [piece.strip('"') for piece in raw.split('|')]
)
df_all['tag_count'] = df_all['tags'].map(
    lambda tags: len(tags) if isinstance(tags, list) else 0
)

In [None]:
# Order rows chronologically by publication time.
df_all = df_all.sort_values('publishedAt')

In [None]:
# --- Drop Unused Columns ---
# Sort chronologically, then remove the raw text and timestamp columns.
# `categoryId` and `channelTitle` stay in the frame at this point.
df_all = (
    df_all
    .sort_values('publishedAt')
    .drop(columns=['title', 'publishedAt', 'tags', 'description'])
)

In [None]:
# --- Remove Outliers ---
# Apply the IQR filter to every numeric column except view_count.
# NOTE(review): the numeric columns at this point include the label-encoded
# channelId and the calendar fields (year/month/date) - confirm that
# filtering on those is intended.
df_all = remove_outliers_iqr(df_all)

In [None]:
# Remove the channel title (its encoded form lives in channelId) together
# with 'Unnamed: 0'.
# NOTE(review): 'Unnamed: 0' is presumably an index column saved into the
# CSV - confirm it is always present, otherwise this raises KeyError.
df_all = df_all.drop(columns=['Unnamed: 0', 'channelTitle'])

In [None]:
import json

# Load the channel-title -> id mapping. The file is rewritten elsewhere in
# this notebook as UTF-8 with ensure_ascii=False, so read it explicitly as
# UTF-8 instead of relying on the platform default encoding.
with open('channels.json', 'r', encoding='utf-8') as file:
    categories = json.load(file)

# Create a list of tuples with (first 3 letters, channel_name, channel_id)
category_list = [
    (channel_name[:3].lower(), channel_name, channel_id)
    for channel_name, channel_id in categories.items()
]

# Sort by the lower-cased first 3 letters (stable: ties keep file order)
category_list.sort(key=lambda x: x[0])

# Create sorted dictionary
sorted_categories = {
    channel_name: channel_id
    for _, channel_name, channel_id in category_list
}

# Convert to JSON
json_output = json.dumps(sorted_categories, indent=2, ensure_ascii=False)

# Print the JSON
print(json_output)

In [None]:
import json

# Read the channel-title -> id mapping back from disk.
with open('channels.json', 'r', encoding='utf-8') as file:
    categories = json.load(file)

# (sort key, channel name, channel id) triples ordered by the lower-cased
# first three letters of the name; sorted() is stable, so ties keep the
# order found in the file.
category_list = sorted(
    ((name[:3].lower(), name, ident) for name, ident in categories.items()),
    key=lambda entry: entry[0],
)

# Rebuild the mapping in sorted order.
sorted_categories = {}
for _, name, ident in category_list:
    sorted_categories[name] = ident

# Show the result as pretty-printed JSON.
json_output = json.dumps(sorted_categories, indent=2, ensure_ascii=False)
print(json_output)

# Overwrite channels.json with the sorted mapping.
with open('channels.json', 'w', encoding='utf-8') as file:
    json.dump(sorted_categories, file, indent=4, ensure_ascii=False)

In [None]:
model = XGBRegressor()
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1]
}
grid_search = GridSearchCV(model, param_grid, cv=TimeSeriesSplit(n_splits=5), scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val),(X_test, y_test)], verbose=False)
best_model = grid_search.best_estimator_

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# --- Prepare Data ---
X = df_all.drop('view_count', axis=1)
y = np.log1p(df_all['view_count'])  # log-transformed target

split_idx = int(len(X) * 0.7)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

split_idx = int(len(X_test) * 0.5)
X_test, X_val = X_test.iloc[:split_idx], X_test.iloc[split_idx:]
y_test, y_val = y_test.iloc[:split_idx], y_test.iloc[split_idx:]

# Train Model 
model = XGBRegressor(n_estimators=1000, max_depth=10, learning_rate=0.1, eval_metric='rmse')

# Pass eval_set in fit(), not in constructor
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val),(X_test, y_test)], verbose=False)

# Save best model
best_model = model

In [None]:
# --- Evaluation ---
y_pred = best_model.predict(X_test)
# The model predicts log1p(view_count); expm1 maps back to raw view counts.
y_pred_unlog = np.expm1(y_pred)
y_test_unlog = np.expm1(y_test)

# Error metrics are computed on the log1p scale, R2 on raw view counts.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test_unlog, y_pred_unlog)

# Label each metric with its scale so the numbers are not misread as being
# directly comparable.
logging.info(
    f"Model Evaluation - MSE(log1p): {mse:.4f}, RMSE(log1p): {rmse:.4f}, "
    f"MAE(log1p): {mae:.4f}, R2(view_count): {r2:.4f}"
)

# Save Final Model
model_path = os.path.join('models', 'xgb_model_all_genres.pkl')
# Make sure the target directory exists before writing the model file.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(best_model, model_path)
logging.info(f"Final XGBoost model saved to {model_path}")

In [None]:
# Per-iteration RMSE recorded during fit for each entry of eval_set.
evals_result = model.evals_result()
epochs = len(evals_result['validation_0']['rmse'])
x_axis = range(epochs)

# eval_set order used at fit time: train, validation, test.
curves = (
    ('validation_0', 'Train'),
    ('validation_1', 'Val'),
    ('validation_2', 'Test'),
)

plt.figure(figsize=(10, 6))
for eval_key, curve_label in curves:
    plt.plot(x_axis, evals_result[eval_key]['rmse'], label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('RMSE')
plt.title('XGBoost RMSE over Epochs')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('rmse_learning_curve.png')
plt.show()

In [None]:
# Gain-based importance per feature, highest first.
importance = best_model.get_booster().get_score(importance_type='gain')
sorted_importance = sorted(importance.items(), key=lambda item: item[1], reverse=True)

# Only the ten strongest features are plotted.
top_features = sorted_importance[:10]
feature_names = [name for name, _ in top_features]
feature_gains = [gain for _, gain in top_features]

plt.figure(figsize=(10, 6))
plt.barh(feature_names, feature_gains)
plt.xlabel('Gain')
# Put the most important feature at the top of the chart.
plt.gca().invert_yaxis()
plt.show()

In [None]:
# Compare actual and predicted view counts for test samples 50..99
# (positional slice on the raw-scale values).
array1 = np.asarray(y_test_unlog)[50:100]
array2 = np.asarray(y_pred_unlog)[50:100]
x = np.arange(len(array1))
# Bar thickness; the two series are offset by +/- width/2 so they sit side
# by side instead of overlapping.
width = 0.4
plt.barh(x - width/2, array1, width, label='actual')
plt.barh(x + width/2, array2, width, label='predict')
# Labels come from the bars themselves; passing a set to legend() would
# attach them in arbitrary order.
plt.legend()
plt.show()