In [5]:
import pandas as pd
import numpy as np

In [None]:
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Derive modeling features from raw video metadata.

    The input frame is copied first, so the caller's frame is not modified.

    Args:
        df: Frame containing the columns ``likes``, ``comments``, ``views``,
            ``subscriber_count``, ``published_at``, ``video_definition`` and
            ``title``. ``published_at`` may already be datetime-like or may
            hold values that ``pd.to_datetime`` can parse (e.g. ISO strings).

    Returns:
        A new frame with every input column plus:
        ``engagement_rate_log``, ``subscriber_count_log``, ``publish_hour``,
        ``publish_dayofweek``, ``publish_hour_sin``, ``publish_hour_cos``,
        ``publish_dow_sin``, ``publish_dow_cos``, ``is_hd``, ``title_length``,
        ``title_words``, ``title_exclamation`` and ``title_question``.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    df = df.copy()

    # Target variable transformation.
    # log1p(views) is 0 for videos without views; dividing by it would give
    # +inf (or NaN for 0/0). Mask non-positive denominators so that every
    # undefined ratio is consistently NaN and can be handled with dropna().
    log_views = np.log1p(df['views'])
    log_interactions = np.log1p(df['likes']) + np.log1p(df['comments'])
    df['engagement_rate_log'] = log_interactions / log_views.where(log_views > 0)

    # Log transform subscriber count
    df['subscriber_count_log'] = np.log1p(df['subscriber_count'])

    # Extract temporal features.
    # Coerce into a local variable so string timestamps (e.g. straight from a
    # CSV) work too; the stored 'published_at' column is left as it was.
    published = pd.to_datetime(df['published_at'])
    df['publish_hour'] = published.dt.hour
    df['publish_dayofweek'] = published.dt.dayofweek

    # Cyclical encoding for hour of day (period 24) and day of week (period 7)
    hour_angle = 2 * np.pi * df['publish_hour'] / 24
    dow_angle = 2 * np.pi * df['publish_dayofweek'] / 7
    df['publish_hour_sin'] = np.sin(hour_angle)
    df['publish_hour_cos'] = np.cos(hour_angle)
    df['publish_dow_sin'] = np.sin(dow_angle)
    df['publish_dow_cos'] = np.cos(dow_angle)

    # Video definition encoding: 1 for "hd", 0 for anything else
    df['is_hd'] = (df['video_definition'] == "hd").astype(int)

    # Title text features
    title = df['title'].str
    df['title_length'] = title.len()
    df['title_words'] = title.split().str.len()
    df['title_exclamation'] = title.count('!')
    df['title_question'] = title.count(r'\?')

    return df


def drop_unused_columns(df):
    """Return ``df`` without the columns that are excluded from modeling.

    Column names from the removal list that are not present in ``df`` are
    skipped silently instead of raising.
    """
    unwanted = [
        'video_id',
        'topic',
        'title',
        'published_at',
        'channel_title',
        'duration_seconds',
        'views',
        'likes',
        'comments',
        'subscriber_count',
        'publish_hour',
        'publish_dayofweek',
        'video_definition',
    ]
    trimmed = df.drop(labels=unwanted, axis="columns", errors="ignore")
    return trimmed
