In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
import os
import csv
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib
import os
import pickle
from sklearn.model_selection import train_test_split
# Resolve the directory two levels above the current working directory once,
# then register it on sys.path so the config file stored there can be imported.
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../../"))
sys.path.append(base_dir)


In [3]:
%%capture
# Execute the data-loading notebook from this kernel; %%capture hides the output it produces.
# NOTE(review): later cells use `train_df`, `users` and `events`, none of which is defined in
# this notebook — presumably read_data.ipynb creates them; confirm before relying on it.
%run ../eda/read_data.ipynb

# --------------------------
# 0. Train Test Split
# --------------------------

In [24]:
# Hold out 20% of the rows of train_df for validation; the fixed seed makes the split repeatable.
train_X, val_X = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
)

# --------------------------
# 1. User Tower Feature Engineering
# --------------------------

In [15]:

def preprocess_user_features(users: pd.DataFrame, save_dir: str = "user_encoders", reference_time=None) -> pd.DataFrame:
    """Encode categorical and time-based user features, caching the fitted LabelEncoders.

    Args:
        users: Raw user frame. Must contain the columns 'platform', 'os_version',
            'model', 'networkType', 'district', 'language_selected',
            'last_active_at' and 'created_datetime'. The frame is not modified.
        save_dir: Directory holding one ``user_<col>_encoder.pkl`` file per
            categorical column. It is created when missing. An encoder file that
            already exists is loaded and reused as-is instead of being refitted,
            so delete the directory to refit on new data.
        reference_time: Point in time the recency features are measured from.
            Anything ``pd.Timestamp`` accepts; naive values are taken as UTC.
            ``None`` (the default) uses the current UTC time, which makes the
            recency columns differ from run to run.

    Returns:
        A copy of ``users`` where the categorical columns hold integer codes,
        the two timestamp columns are parsed as UTC datetimes (unparseable
        values are coerced to missing), and 'days_since_last_active' /
        'days_since_signup' are added. All remaining missing values in the
        frame are replaced with 0.

    Notes:
        Labels that a cached encoder was not fitted on are encoded as -1
        instead of raising ``ValueError`` from ``LabelEncoder.transform``.
    """
    user_features = users.copy()

    # Create directory to save encoders if not exist
    os.makedirs(save_dir, exist_ok=True)

    # Define categorical columns to encode
    categorical_user_cols = ['platform', 'os_version', 'model', 'networkType', 'district', 'language_selected']

    # Encode categorical user features
    for col in categorical_user_cols:
        encoder_path = os.path.join(save_dir, f"user_{col}_encoder.pkl")
        # Missing values become the literal string 'nan' and are encoded like any other label
        values = user_features[col].astype(str)
        if os.path.isfile(encoder_path):
            # Load existing encoder
            le = joblib.load(encoder_path)
        else:
            # Fit new encoder and save
            le = LabelEncoder()
            le.fit(values)
            joblib.dump(le, encoder_path)
        # Look codes up through classes_ (position == code) rather than le.transform,
        # which raises on labels the cached encoder has never seen.
        code_by_label = {str(label): code for code, label in enumerate(le.classes_)}
        user_features[col] = values.map(code_by_label).fillna(-1).astype('int64')

    # Convert timestamps to UTC
    user_features['last_active_at'] = pd.to_datetime(user_features['last_active_at'], utc=True, errors='coerce')
    user_features['created_datetime'] = pd.to_datetime(user_features['created_datetime'], utc=True, errors='coerce')

    # Reference instant for the recency features
    if reference_time is None:
        now = pd.Timestamp.now(tz='UTC')
    else:
        now = pd.Timestamp(reference_time)
        now = now.tz_localize('UTC') if now.tzinfo is None else now.tz_convert('UTC')

    # Create recency features
    user_features['days_since_last_active'] = (now - user_features['last_active_at']).dt.days
    user_features['days_since_signup'] = (now - user_features['created_datetime']).dt.days

    # Fill missing values with 0
    return user_features.fillna(0)

In [27]:
# Encode the raw users frame; encoders are cached in the default "user_encoders" directory.
processed_users = preprocess_user_features(users=users)

# --------------------------
# 2. Content Tower Feature Engineering
# --------------------------

In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import joblib
import os

def preprocess_content_features(train_df: pd.DataFrame, save_dir: str = "content_encoders", max_features: int = 5000, n_svd: int = 128):
    """Encode categorical and text features of content rows, caching every fitted artifact.

    Args:
        train_df: Content frame. Must contain 'newsType', 'newsLanguage',
            'sourceName', 'newsDistrict', 'title' and 'content'. Not modified.
        save_dir: Directory for the label encoders (``content_<col>_encoder.pkl``),
            ``tfidf_vectorizer.pkl`` and ``svd_model.pkl``. Created when missing.
            Any artifact already present is loaded and reused without refitting,
            in which case ``max_features`` / ``n_svd`` have no effect for it.
        max_features: Vocabulary cap passed to a newly fitted TfidfVectorizer.
        n_svd: Number of components of a newly fitted TruncatedSVD.

    Returns:
        A copy of ``train_df`` with a fresh 0..n-1 index, integer-coded
        categorical columns, a 'text' column (title repeated three times
        followed by the content) and one ``text_emb_<i>`` column per SVD
        component.

    Notes:
        * The title is repeated with a space between copies. Plain string
          repetition glues the last word of one copy to the first word of the
          next ("worldHello"), which yields junk tokens instead of up-weighting
          the title words.
        * Labels a cached encoder was not fitted on (e.g. categories that occur
          only in the validation split) are encoded as -1 instead of raising
          ``ValueError`` from ``LabelEncoder.transform``.
    """
    os.makedirs(save_dir, exist_ok=True)
    categorical_content_cols = ['newsType', 'newsLanguage', 'sourceName', 'newsDistrict']
    content_features = train_df.copy()

    # Encode categorical columns
    for col in categorical_content_cols:
        encoder_path = os.path.join(save_dir, f"content_{col}_encoder.pkl")
        # Missing values become the literal string 'nan' and are encoded like any other label
        values = content_features[col].astype(str)
        if os.path.isfile(encoder_path):
            le = joblib.load(encoder_path)
        else:
            le = LabelEncoder()
            le.fit(values)
            joblib.dump(le, encoder_path)
        # Position in classes_ is the code; a dict lookup lets unseen labels fall back to -1
        code_by_label = {str(label): code for code, label in enumerate(le.classes_)}
        content_features[col] = values.map(code_by_label).fillna(-1).astype('int64')

    # Prepare text: title (x3, space separated) + content
    titles = content_features['title'].fillna('').astype(str)
    bodies = content_features['content'].fillna('').astype(str)
    content_features['text'] = ((titles + " ").str.repeat(3) + bodies).str.strip()

    # TF-IDF Vectorizer
    tfidf_path = os.path.join(save_dir, "tfidf_vectorizer.pkl")
    if os.path.isfile(tfidf_path):
        tfidf = joblib.load(tfidf_path)
        text_features = tfidf.transform(content_features['text'])
    else:
        tfidf = TfidfVectorizer(max_features=max_features)
        text_features = tfidf.fit_transform(content_features['text'])
        joblib.dump(tfidf, tfidf_path)

    # Truncated SVD for dimensionality reduction
    svd_path = os.path.join(save_dir, "svd_model.pkl")
    if os.path.isfile(svd_path):
        svd = joblib.load(svd_path)
        text_emb = svd.transform(text_features)
    else:
        svd = TruncatedSVD(n_components=n_svd, random_state=42)
        text_emb = svd.fit_transform(text_features)
        joblib.dump(svd, svd_path)

    # Concatenate embeddings with original dataframe; both sides need the same
    # 0..n-1 index so rows line up positionally.
    text_emb_df = pd.DataFrame(text_emb, columns=[f'text_emb_{i}' for i in range(text_emb.shape[1])])
    content_features = pd.concat([content_features.reset_index(drop=True), text_emb_df], axis=1)

    return content_features

In [25]:
# Order matters: the training frame is processed first, so any encoder, TF-IDF or SVD
# artifact missing from the cache directory is fitted on it and then reused for validation.
# Re-running this cell feeds the already processed frames back in; re-run the split cell first.
train_X, val_X = [preprocess_content_features(split) for split in (train_X, val_X)]


# --------------------------
# 3. Interaction / Label Engineering
# --------------------------

In [None]:

def create_training_data(events: pd.DataFrame, user_features: pd.DataFrame, content_features: pd.DataFrame):
    """Assign engagement scores and join events with user and content features.

    Only events whose 'hashId' appears in ``content_features['hashid']`` are kept.
    Each kept event gets an 'engagement_score' from its 'event_type'
    (event types without a configured weight score 0), and is then left-joined
    to ``user_features`` on deviceId/deviceid and to ``content_features`` on
    hashId/hashid. Events with no matching user keep missing user features.

    Args:
        events: Interaction log with 'hashId', 'deviceId', 'event_type' and
            'eventTimestamp' (epoch milliseconds). It must also carry a
            'district' column: the user's district is read from 'district_user',
            the name the users' 'district' receives only when it collides with
            an events column of the same name. Not modified.
        user_features: Encoded user frame keyed by 'deviceid'.
        content_features: Encoded content frame keyed by 'hashid' that contains
            one or more ``text_emb_<i>`` columns.

    Returns:
        ``(X_user, X_content, y)``: user-tower features, content-tower features
        and the engagement score, all row-aligned with the filtered events.

    Raises:
        KeyError: If ``content_features`` has no ``text_emb_<i>`` column, or a
            required feature column is missing after the joins.

    Notes:
        The text-embedding columns are discovered from ``content_features``
        instead of assuming exactly 128 of them, so the function works for any
        embedding width the content preprocessing was run with.
    """
    selected_hashes = content_features['hashid'].unique()

    # Filter events to only include selected hashIds (copy so the caller's frame stays untouched)
    events = events[events['hashId'].isin(selected_hashes)].copy()

    # Assign engagement scores to event types
    event_weights = {
        'TimeSpent-Front': 0.3,
        'TimeSpent-Back': 0.5,
        'News Bookmarked': 1.0,
        'News Shared': 1.0
    }
    events['engagement_score'] = events['event_type'].map(event_weights).fillna(0)

    # Convert timestamp to UTC
    events['eventTimestamp'] = pd.to_datetime(events['eventTimestamp'], unit='ms', utc=True)

    # Merge with user features
    train_df = events.merge(user_features, left_on='deviceId', right_on='deviceid', how='left', suffixes=('', '_user'))
    # Merge with content features
    train_df = train_df.merge(content_features, left_on='hashId', right_on='hashid', how='left', suffixes=('', '_content'))

    # User and content tower columns
    user_tower_cols = ['platform', 'os_version', 'model', 'networkType', 'district_user', 'language_selected',
                       'days_since_last_active', 'days_since_signup']

    # Collect every text_emb_<i> column in numeric order, whatever the embedding width is
    emb_prefix = 'text_emb_'
    text_emb_cols = sorted(
        (
            name for name in content_features.columns
            if isinstance(name, str) and name.startswith(emb_prefix) and name[len(emb_prefix):].isdigit()
        ),
        key=lambda name: int(name[len(emb_prefix):]),
    )
    if not text_emb_cols:
        raise KeyError("content_features has no 'text_emb_<i>' columns")
    content_tower_cols = ['newsType', 'newsLanguage', 'sourceName', 'newsDistrict'] + text_emb_cols

    # Target
    target_col = 'engagement_score'

    # Split features and target
    X_user = train_df[user_tower_cols]
    X_content = train_df[content_tower_cols]
    y = train_df[target_col]

    return X_user, X_content, y

In [30]:
# Build (user features, content features, target) for each split from the same event log;
# the split is determined by which content rows are passed in.
tX_user, tX_content, ty = create_training_data(events, processed_users, train_X)
vX_user, vX_content, vy = create_training_data(events, processed_users, val_X)

In [None]:
# Persist the six train/validation objects together in one pickle file

data_to_save = dict(
    tX_user=tX_user,
    tX_content=tX_content,
    ty=ty,
    vX_user=vX_user,
    vX_content=vX_content,
    vy=vy,
)

with open("train_val_data.pkl", "wb") as pkl_out:
    pickle.dump(data_to_save, pkl_out)

print("✅ Training and validation data saved to train_val_data.pkl")

In [None]:
# Restore the six train/validation objects from the pickle file.
# Unpickling can execute arbitrary code, so only open files written by this notebook.

with open("train_val_data.pkl", "rb") as pkl_in:
    data = pickle.load(pkl_in)

tX_user, tX_content, ty = (data[key] for key in ("tX_user", "tX_content", "ty"))
vX_user, vX_content, vy = (data[key] for key in ("vX_user", "vX_content", "vy"))

print("✅ Data loaded successfully from pickle")