In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import kagglehub
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier

import torch
from torch import Tensor
import torch.nn as nn

# Helper Functions

In [2]:
def merge_data(feeds: pd.DataFrame, ads: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the news-feed log per user and left-join it onto the ads log.

    Parameters
    ----------
    feeds : pd.DataFrame
        Feed log. Must provide the columns ``u_userId``, ``u_refreshTimes``,
        ``i_dislikeTimes``, ``i_upTimes`` and ``u_newsCatInterestsST``; the
        last one is split on ``^`` into category tokens.
    ads : pd.DataFrame
        Ads log with a ``user_id`` column. Every ads row is kept (left join).

    Returns
    -------
    pd.DataFrame
        ``ads`` followed by ``total_clicks``, ``avg_refresh_times``,
        ``total_dislikes``, ``total_upvotes``, ``unique_news_categories``,
        ``most_common_category``, ``ctr_news`` and ``category_diversity``.
        For users without any feed row the count-like columns and the two
        derived ratios are 0, ``avg_refresh_times`` is the median over the
        merged rows, and ``most_common_category`` stays missing.
    """
    # Function-scope import so the helper cell does not depend on another cell.
    from collections import Counter

    def _category_tokens(values: pd.Series) -> list[str]:
        # Flatten every "a^b^c" entry of one user into a single token list.
        # Missing entries are skipped instead of breaking the string split.
        return [
            token
            for entry in values.dropna().astype(str)
            for token in entry.split("^")
        ]

    def _unique_categories(values: pd.Series) -> int:
        return len(set(_category_tokens(values)))

    def _most_common_category(values: pd.Series):
        # Counter counts in one pass and resolves ties by first occurrence,
        # so the result does not depend on set iteration order.
        counts = Counter(_category_tokens(values))
        if not counts:
            return None
        return counts.most_common(1)[0][0]

    # aggregate news feed info
    per_user = (
        feeds.groupby('u_userId')
        .agg(
            total_clicks=('u_userId', 'count'),
            avg_refresh_times=('u_refreshTimes', 'mean'),
            total_dislikes=('i_dislikeTimes', 'sum'),
            total_upvotes=('i_upTimes', 'sum'),
            unique_news_categories=('u_newsCatInterestsST', _unique_categories),
            most_common_category=('u_newsCatInterestsST', _most_common_category),
        )
        .reset_index()
        .rename(columns={'u_userId': 'user_id'})
    )

    merged = ads.merge(per_user, on='user_id', how='left')

    # Fill the aggregates first so the derived ratios below are defined for
    # users that never appear in the feed log.
    merged = merged.fillna({
        'total_clicks': 0,
        'avg_refresh_times': merged['avg_refresh_times'].median(),
        'total_dislikes': 0,
        'total_upvotes': 0,
        'unique_news_categories': 0,
    })

    merged['ctr_news'] = merged['total_clicks'] / (merged['total_clicks'] + 1)

    # 0 / 0 (no user has any category) yields NaN; report that as 0 diversity.
    max_categories = merged['unique_news_categories'].max()
    merged['category_diversity'] = (
        merged['unique_news_categories'] / max_categories
    ).fillna(0)

    return merged



def load_data(ads_path: Path, feeds_path: Path) -> pd.DataFrame:
  """Read the feeds and ads CSV files and return them combined by `merge_data`."""
  # Read in the same order as before (feeds, then ads) and hand both frames
  # to merge_data by keyword.
  frames = {
      name: pd.read_csv(csv_path)
      for name, csv_path in (("feeds", feeds_path), ("ads", ads_path))
  }
  return merge_data(**frames)


def split_data(data: pd.DataFrame, split: float=0.9, random_state: "int | None"=None) -> tuple[pd.DataFrame, pd.DataFrame]:
  """Randomly split `data` into two disjoint row subsets.

  Parameters
  ----------
  data : pd.DataFrame
      Frame to split. The index may contain duplicate labels.
  split : float, default 0.9
      Fraction of rows (between 0 and 1) placed in the first frame; the row
      count is truncated with ``int(len(data) * split)``.
  random_state : int or None, default None
      Seed for a dedicated generator. When None the global ``np.random``
      state is used, exactly as before this parameter existed.

  Returns
  -------
  tuple[pd.DataFrame, pd.DataFrame]
      ``(first, second)`` holding every row of `data` exactly once.

  Raises
  ------
  ValueError
      If `split` is outside ``[0, 1]``.
  """
  if not 0.0 <= split <= 1.0:
    raise ValueError(f"split must be between 0 and 1, got {split!r}")

  n_rows: int = len(data)
  # Shuffle row positions rather than index labels: selecting labels with
  # .loc would return every matching row again when labels repeat.
  if random_state is None:
    order = np.random.permutation(n_rows)
  else:
    order = np.random.default_rng(random_state).permutation(n_rows)

  split_i: int = int(n_rows * split)
  return data.iloc[order[:split_i]], data.iloc[order[split_i:]]

# Load Data

In [17]:
# Resolve the local directory of the Kaggle dataset via kagglehub.
data_path: Path = Path(kagglehub.dataset_download("xiaojiu1414/digix-global-ai-challenge"))
train_dir = data_path / "train"

# Load the full dataset (ads log joined with the per-user feed aggregates)
training_data = load_data(
    ads_path=train_dir / "train_data_ads.csv",
    feeds_path=train_dir / "train_data_feeds.csv",
)

# Continue with a seeded 1% sample of the merged rows, re-indexed from 0.
small_data = training_data.sample(frac=0.01, random_state=42).reset_index(drop=True)

# Hold out 20% of the sample as the test set, then 20% of the remainder as the
# validation set: 64% / 16% / 20% of the sample for train / validation / test.
train_val_data, test_data = train_test_split(small_data, test_size=0.2, random_state=42)
training_data, validation_data = train_test_split(train_val_data, test_size=0.2, random_state=42)

print(training_data.columns)

Index(['log_id', 'label', 'user_id', 'age', 'gender', 'residence', 'city',
       'city_rank', 'series_dev', 'series_group', 'emui_dev', 'device_name',
       'device_size', 'net_type', 'task_id', 'adv_id', 'creat_type_cd',
       'adv_prim_id', 'inter_type_cd', 'slot_id', 'site_id', 'spread_app_id',
       'hispace_app_tags', 'app_second_class', 'app_score',
       'ad_click_list_v001', 'ad_click_list_v002', 'ad_click_list_v003',
       'ad_close_list_v001', 'ad_close_list_v002', 'ad_close_list_v003',
       'pt_d', 'u_newsCatInterestsST', 'u_refreshTimes', 'u_feedLifeCycle',
       'total_clicks', 'avg_refresh_times', 'total_dislikes', 'total_upvotes',
       'unique_news_categories', 'most_common_category', 'ctr_news',
       'category_diversity'],
      dtype='object')


In [27]:
# Preview the first five training rows, columns 0-19.
training_data.head(5).iloc[:, 0:20]

Unnamed: 0,log_id,label,user_id,age,gender,residence,city,city_rank,series_dev,series_group,emui_dev,device_name,device_size,net_type,task_id,adv_id,creat_type_cd,adv_prim_id,inter_type_cd,slot_id
56561,92605,0,277180,2,2,13,191,4,34,7,21,115,2208,7,27317,13170,8,1611,4,12
14226,85291,0,193254,6,4,33,319,3,27,2,11,351,2117,7,30492,16454,8,1585,5,59
65732,264467,0,211766,8,2,20,170,4,30,3,13,265,2353,7,11209,11433,8,1041,5,50
51265,117391,0,109748,5,2,20,372,5,16,5,20,334,2117,7,23442,11310,5,1036,4,53
9802,827949,0,150768,2,2,20,372,5,16,5,20,334,2541,7,34382,11752,10,2066,4,16


In [24]:
# Preview the first five training rows, columns 20-39.
training_data.head(5).iloc[:, 20:40]

Unnamed: 0,site_id,spread_app_id,hispace_app_tags,app_second_class,app_score,ad_click_list_v001,ad_click_list_v002,ad_click_list_v003,ad_close_list_v001,ad_close_list_v002,ad_close_list_v003,pt_d,u_newsCatInterestsST,u_refreshTimes,u_feedLifeCycle,total_clicks,avg_refresh_times,total_dislikes,total_upvotes,unique_news_categories
56561,1,372,20,13,10.0,34164^22937^10548^20418^31476,1041^1518^1669^1296^1005,114^344^309^168^372,24107,1218,173,202206041136,98^8^44^78^109,9,17,82,9.0,207,351,26
14226,1,190,47,14,10.0,13726^31390^34382^30779^17008,1613^1080^2066^1549^1061,212^190^114^304^332,24107,1218,173,202206030101,199^173^7^8^0,7,17,34,7.0,63,171,27
65732,1,246,47,14,10.0,12999^10532^28594^28630^16770,1557^1875^1535^1585^1106,190^168^240^162,24107,1218,173,202206060241,98^17^98^78^98,6,17,43,6.0,135,297,16
51265,1,312,43,18,10.0,19502^16054^11911^14265^25802,1098^1036^1037,206^312^327,24107,1218,173,202206090644,157^39^169^62^199,2,11,6,2.0,13,17,6
9802,1,114,43,18,10.0,35368^14787^31470^28328^20641,2030^1112^1041^1612^2012,246^250,13997^35520^24055^14787^17020,1097^1913^1112^1036^1498,312^280^196^152^206,202206030724,27^27^95^27^48,9,17,135,9.0,50,84,16


In [26]:
# Preview the first five training rows, columns 40 onward (slice end 44).
training_data.head(5).iloc[:, 40:44]

Unnamed: 0,most_common_category,ctr_news,category_diversity
56561,98,0.987952,0.42623
14226,0,0.971429,0.442623
65732,98,0.977273,0.262295
51265,39,0.857143,0.098361
9802,27,0.992647,0.262295


In [29]:
# Columns with a numeric dtype in the training split; this list includes 'label'.
numeric_features = training_data.select_dtypes(include='number').columns.tolist()

# NOTE(review): identifier-like columns such as log_id and user_id appear to be
# numeric and would therefore be used as model features - confirm this is intended.

# For each split, build the feature matrix (numeric columns without 'label')
# and the target vector ('label').
(
    (X_train_numeric, y_train_numeric),
    (X_validation_numeric, y_validation_numeric),
    (X_test_numeric, y_test_numeric),
) = (
    (frame[numeric_features].drop(columns=['label']), frame[numeric_features]['label'])
    for frame in (training_data, validation_data, test_data)
)

# SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from sklearn.mixture import GaussianMixture

In [41]:
# 1. Vanilla SMOTE - uses KNN
# NOTE(review): the features are passed in unscaled; if distances are dominated
# by large-magnitude columns the neighbours may not be meaningful - confirm.
smote_knn = SMOTE(random_state=42, sampling_strategy='auto')
(
    X_train_resampled_knn,
    y_train_resampled_knn,
) = smote_knn.fit_resample(X_train_numeric, y_train_numeric)

# Class counts before and after oversampling.
print("Original Distribution:", np.bincount(y_train_numeric))
print("SMOTE-KNN Distribution:", np.bincount(y_train_resampled_knn))

Original Distribution: [48324   799]
SMOTE-KNN Distribution: [48324 48324]


In [43]:
# 2. SMOTE_SVM - uses SVM instead of KNN
smote_svm = SVMSMOTE(random_state=42, sampling_strategy='auto')
(
    X_train_resampled_svm,
    y_train_resampled_svm,
) = smote_svm.fit_resample(X_train_numeric, y_train_numeric)

# Class counts before and after oversampling.
print("Original Distribution:", np.bincount(y_train_numeric))
print("SMOTE-SVM Distribution:", np.bincount(y_train_resampled_svm))

Original Distribution: [48324   799]
SMOTE-SVM Distribution: [48324 26935]


In [45]:
# 3. SMOTE GMM - uses Gaussian Mixture Modeling

minority_label = 1
majority_label = 0

# Fit a GMM on minority class samples
minority_data = X_train_numeric[y_train_numeric == minority_label]
gmm = GaussianMixture(n_components=3, random_state=42)  # n_components may need tuning
gmm.fit(minority_data)

# Generate synthetic samples: as many as are needed to match the majority count
n_majority = (y_train_numeric == majority_label).sum()
n_minority = (y_train_numeric == minority_label).sum()
n_synthetic_samples = n_majority - n_minority
# NOTE(review): the recorded run shows a warning pointing at
# rng.multivariate_normal during sampling - presumably about the fitted
# covariance; confirm and consider scaling or regularisation.
synthetic_samples, _ = gmm.sample(n_synthetic_samples)

# Append synthetic samples to training data (both results are plain NumPy arrays)
X_train_resampled_gmm = np.concatenate(
    [np.asarray(X_train_numeric), synthetic_samples], axis=0
)
y_train_resampled_gmm = np.concatenate(
    [np.asarray(y_train_numeric), np.full(n_synthetic_samples, minority_label)]
)

print("Original Distribution:", np.bincount(y_train_numeric))
print("SMOTE-GMM Distribution:", np.bincount(y_train_resampled_gmm))

  rng.multivariate_normal(mean, covariance, int(sample))


Original Distribution: [48324   799]
SMOTE-GMM Distribution: [48324 48324]
