In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from scipy import stats

In [None]:

class FeatureEngineer:
    """Expand a tabular dataset into a scaled, enriched feature matrix.

    Processing order: pairwise interaction features -> standard scaling ->
    degree-2 polynomial expansion and PCA (both computed from the scaled
    frame) -> univariate scoring/selection with ``SelectKBest``.

    Every transformer is fitted in :meth:`fit_transform` and re-applied
    unchanged in :meth:`transform`, so the two methods produce frames with the
    same columns.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.poly = PolynomialFeatures(degree=2, include_bias=False)
        self.pca = PCA(n_components=0.95)
        # k='all' keeps every column; the selector is still fitted so the
        # retained column list is recorded in one place.
        self.selector = SelectKBest(f_classif, k='all')
        # Names of the columns kept by the selector; set by fit_transform().
        self.feature_names = None

    def create_interaction_features(self, X):
        """Return product and ratio features for each pair of numeric columns.

        Only ``float64`` and ``int64`` columns of ``X`` take part. For every
        pair ``(col1, col2)`` with ``col1`` appearing before ``col2`` two
        columns are produced: ``{col1}_{col2}_mult`` (the product) and
        ``{col1}_{col2}_div`` (``col1 / (col2 + 1e-8)``).

        Args:
            X: Input ``pandas.DataFrame``.

        Returns:
            A ``DataFrame`` indexed like ``X``. It has no columns when ``X``
            holds fewer than two eligible numeric columns.
        """
        num_cols = X.select_dtypes(include=['float64', 'int64']).columns

        # Collect the columns first and build the frame once: inserting
        # columns one at a time into a DataFrame fragments it and triggers
        # pandas' PerformanceWarning once many columns have been added.
        columns = {}
        for i, col1 in enumerate(num_cols):
            for col2 in num_cols[i + 1:]:
                columns[f'{col1}_{col2}_mult'] = X[col1] * X[col2]
                # The small offset avoids dividing by an exact zero.
                columns[f'{col1}_{col2}_div'] = X[col1] / (X[col2] + 1e-8)

        return pd.DataFrame(columns, index=X.index)

    def _build_feature_frame(self, X, fit):
        """Assemble scaled + polynomial + PCA features for ``X``.

        Args:
            X: Input ``pandas.DataFrame``.
            fit: When true the scaler, polynomial expander and PCA are fitted
                on ``X`` before being applied; otherwise the already fitted
                transformers are reused.

        Returns:
            A ``DataFrame`` that keeps ``X``'s index and contains the scaled
            columns followed by ``poly_*`` and ``pca_*`` columns.
        """
        interactions = self.create_interaction_features(X)
        X_combined = pd.concat([X, interactions], axis=1)
        # Keep the caller's row labels so the output stays aligned with the
        # target (and anything else) indexed like X.
        index = X_combined.index

        if fit:
            scaled = self.scaler.fit_transform(X_combined)
        else:
            scaled = self.scaler.transform(X_combined)
        X_scaled = pd.DataFrame(scaled, columns=X_combined.columns, index=index)

        X_poly = self.poly.fit_transform(X_scaled) if fit else self.poly.transform(X_scaled)
        poly_features = pd.DataFrame(
            X_poly,
            columns=[f'poly_{i}' for i in range(X_poly.shape[1])],
            index=index,
        )

        X_pca = self.pca.fit_transform(X_scaled) if fit else self.pca.transform(X_scaled)
        pca_features = pd.DataFrame(
            X_pca,
            columns=[f'pca_{i}' for i in range(X_pca.shape[1])],
            index=index,
        )

        return pd.concat([X_scaled, poly_features, pca_features], axis=1)

    def fit_transform(self, X, y):
        """Fit every transformer on ``X``/``y`` and return the selected features.

        Args:
            X: Training ``pandas.DataFrame``.
            y: Target values used to fit the feature selector.

        Returns:
            A ``DataFrame`` with the columns kept by the selector, indexed like
            ``X``. The kept column names are stored in ``self.feature_names``.
        """
        final_features = self._build_feature_frame(X, fit=True)

        self.selector.fit(final_features, y)
        selected_mask = self.selector.get_support()
        selected_features = final_features.iloc[:, selected_mask]
        self.feature_names = selected_features.columns.tolist()

        return selected_features

    def transform(self, X):
        """Apply the fitted transformers to ``X``.

        Args:
            X: ``pandas.DataFrame`` with the same columns used for fitting.

        Returns:
            A ``DataFrame`` indexed like ``X`` and restricted to
            ``self.feature_names``, in that order.
        """
        final_features = self._build_feature_frame(X, fit=False)
        return final_features[self.feature_names]




In [None]:
def evaluate_model(y_true, y_pred, y_prob):
    """Compute the classification scores reported for every model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, used for accuracy, precision, recall and F1.
        y_prob: Predicted scores, used only for ROC AUC.

    Returns:
        dict mapping ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``
        and ``'roc_auc'`` (in that order) to the corresponding score. Each
        scikit-learn metric is called with its default options.
    """
    label_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    scores = {name: scorer(y_true, y_pred) for name, scorer in label_scorers}
    # ROC AUC is the only metric computed from scores rather than labels.
    scores['roc_auc'] = roc_auc_score(y_true, y_prob)
    return scores

def train_evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model``, score it on the test split and print a short report.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Data the metrics are computed on.
        model_name: Label used in the printed report header.

    Returns:
        Tuple ``(model, metrics)`` where ``model`` is the fitted estimator and
        ``metrics`` is the dict returned by ``evaluate_model``.
    """
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is the score passed on for ROC AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]
    metrics = evaluate_model(y_test, predicted_labels, positive_scores)

    report_lines = [f"\n{model_name} Results:"]
    report_lines.extend(f"{metric}: {value:.4f}" for metric, value in metrics.items())
    print("\n".join(report_lines))

    return model, metrics

def fine_tune_lgbm(X_train, y_train, X_test, y_test):
    """Train a LightGBM classifier with AUC-monitored early stopping.

    Early stopping is configured through the ``early_stopping`` callback
    rather than the ``early_stopping_rounds`` / ``verbose`` keyword arguments
    of ``fit``: those keywords were deprecated in LightGBM 3.3 and removed in
    4.0, where passing them raises ``TypeError``.

    Args:
        X_train, y_train: Data the booster is trained on.
        X_test, y_test: Data used as the evaluation set that early stopping
            monitors.

    Returns:
        The fitted ``LGBMClassifier``.

    NOTE(review): the evaluation set is the test split, so the stopping point
    is chosen using test data; a separate validation split would avoid that
    leak, but it would require a different call signature.
    """
    # Imported here because the file-level import only exposes LGBMClassifier.
    from lightgbm import early_stopping

    model = LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        num_leaves=31,
        feature_fraction=0.8
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='auc',
        # Stop after 10 rounds without improvement; verbose=False keeps the
        # callback from printing its progress messages.
        callbacks=[early_stopping(stopping_rounds=10, verbose=False)],
    )

    return model


In [None]:
def run_ml_pipeline(X_train, X_test, y_train, y_test):
    """Engineer features, train three boosted-tree models and pick the best.

    Steps, in order:
      1. Fit a ``FeatureEngineer`` on the training data and apply it to both
         splits.
      2. CatBoost: print 5-fold cross-validation scores on the training
         features, then fit and score on the test features.
      3. XGBoost: run a grid search (scored by ROC AUC), then fit and score the
         best estimator.
      4. LightGBM: train via ``fine_tune_lgbm``, then fit and score through
         ``train_evaluate_model``.
      5. Print a comparison table and select the model with the highest test
         ROC AUC (after rounding to 4 decimals).

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Test features and target.

    Returns:
        Tuple ``(best_model, feature_engineer, metrics_df)`` where
        ``metrics_df`` has one column per model and one row per metric.
    """
    # Feature engineering
    engineer = FeatureEngineer()
    train_features = engineer.fit_transform(X_train, y_train)
    test_features = engineer.transform(X_test)

    # Filled in the order CatBoost, XGBoost, LightGBM; that order becomes the
    # column order of the comparison table.
    fitted_models = {}
    model_metrics = {}

    # 1. CatBoost with cross-validation
    catboost = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.1,
        depth=6,
        l2_leaf_reg=3,
        eval_metric='AUC',
        verbose=False,
    )
    cv_results = cross_validate(
        catboost, train_features, y_train,
        cv=5, scoring=['accuracy', 'precision', 'recall', 'f1'],
    )

    print("\nCatBoost Cross-Validation Results:")
    # cross_validate also returns timing entries; only the test_* scores are shown.
    held_out_scores = {
        key: values for key, values in cv_results.items() if key.startswith('test')
    }
    for key, values in held_out_scores.items():
        print(f"{key}: {values.mean():.4f} (+/- {values.std() * 2:.4f})")

    fitted_models['CatBoost'], model_metrics['CatBoost'] = train_evaluate_model(
        catboost, train_features, test_features, y_train, y_test, "CatBoost"
    )

    # 2. XGBoost with GridSearch
    xgb_grid = dict(
        max_depth=[3],
        learning_rate=[0.01],
        n_estimators=[100],
        subsample=[0.8],
    )
    searcher = GridSearchCV(
        XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        xgb_grid,
        cv=5,
        scoring='roc_auc',
    )
    searcher.fit(train_features, y_train)

    print("\nXGBoost Best Parameters:", searcher.best_params_)
    fitted_models['XGBoost'], model_metrics['XGBoost'] = train_evaluate_model(
        searcher.best_estimator_, train_features, test_features,
        y_train, y_test, "XGBoost"
    )

    # 3. LightGBM with custom fine-tuning
    tuned_lgbm = fine_tune_lgbm(train_features, y_train, test_features, y_test)
    fitted_models['LightGBM'], model_metrics['LightGBM'] = train_evaluate_model(
        tuned_lgbm, train_features, test_features, y_train, y_test, "LightGBM"
    )

    # Compare models
    metrics_df = pd.DataFrame(model_metrics).round(4)
    print("\nModel Comparison:")
    print(metrics_df)

    # Return best model
    best_model_name = metrics_df.loc['roc_auc'].idxmax()
    best_model = fitted_models[best_model_name]

    print(f"\nBest performing model: {best_model_name}")
    return best_model, engineer, metrics_df

In [2]:
# Load the training and test sets from their CSV files.
train, test = map(
    pd.read_csv,
    ('../data/train/train.csv', '../data/test/test.csv'),
)

In [3]:
# Separate the 'like' target column from the predictors in each split.
X_train, y_train = train.drop(columns=['like']), train['like']
X_test, y_test = test.drop(columns=['like']), test['like']



In [None]:

# Run the full pipeline; keeps the winning model, the fitted feature
# engineer and the per-model metric table for later cells.
best_model, feature_engineer, metrics_df = run_ml_pipeline(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)