In [3]:
%pip install -q dagshub mlflow

Note: you may need to restart the kernel to use updated packages.


# Import/setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from sklearn.base import BaseEstimator, TransformerMixin
import mlflow
import mlflow.sklearn
import os
import gc
from datetime import datetime
import warnings

warnings.filterwarnings('ignore')

os.environ['MLFLOW_TRACKING_URI'] = 'https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow'
os.environ['MLFLOW_TRACKING_USERNAME'] = 'g-kitiashvili'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '1c2227158cc19daf66bb3b241116a8e8c5f1cd20' 

model_name = "LogisticRegression" 
mlflow.set_experiment(f"{model_name}_Training")




# Custom Transformers for Pipeline

In [10]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, k=100):
        self.k = k
        self.selector = None
        self.selected_feature_indices = None
        
    def fit(self, X, y=None):
        self.selector = SelectKBest(f_classif, k=min(self.k, X.shape[1]))
        self.selector.fit(X, y)
        self.selected_feature_indices = self.selector.get_support()
        return self
        
    def transform(self, X):
        return self.selector.transform(X)

class MissingValueHandler(BaseEstimator, TransformerMixin):
    
    def __init__(self, threshold=50):
        self.threshold = threshold
        self.high_missing_cols = None
        self.constant_features = None
        self.medians = {}
        self.modes = {}
        
    def fit(self, X, y=None):
        # Identify columns with too many missing values
        missing_percent = (X.isnull().mean() * 100)
        self.high_missing_cols = missing_percent[missing_percent > self.threshold].index.tolist()
        
        # Identify constant features
        self.constant_features = [col for col in X.columns if X[col].nunique() <= 1]
        
        # Calculate median for numerical features
        numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
        for col in numeric_cols:
            if col not in self.high_missing_cols and col not in self.constant_features:
                self.medians[col] = X[col].median()
        
        # Calculate mode for categorical features
        categorical_cols = X.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            if col not in self.high_missing_cols and col not in self.constant_features:
                self.modes[col] = X[col].mode()[0] if not X[col].mode().empty else 'missing'
        
        return self
        
    def transform(self, X):
        X_new = X.copy()
        
        # Remove high missing columns
        X_new = X_new.drop(columns=self.high_missing_cols, errors='ignore')
        
        # Remove constant features
        X_new = X_new.drop(columns=self.constant_features, errors='ignore')
        
        # Fill missing values in numerical features
        for col, median in self.medians.items():
            if col in X_new.columns:
                X_new[col] = X_new[col].fillna(median)
        
        # Fill missing values in categorical features
        for col, mode in self.modes.items():
            if col in X_new.columns:
                X_new[col] = X_new[col].fillna(mode)
        
        return X_new

class OutlierHandler(BaseEstimator, TransformerMixin):
    
    def __init__(self, q_low=0.01, q_high=0.99):
        self.q_low = q_low
        self.q_high = q_high
        self.bounds = {}
        
    def fit(self, X, y=None):
        numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
        for col in numeric_cols:
            if 'TransactionDT' not in col and col in X.columns:
                q1 = X[col].quantile(self.q_low)
                q3 = X[col].quantile(self.q_high)
                self.bounds[col] = (q1, q3)
        return self
        
    def transform(self, X):
        X_new = X.copy()
        for col, (q1, q3) in self.bounds.items():
            if col in X_new.columns:
                X_new[col] = np.clip(X_new[col], q1, q3)
        return X_new

class DatetimeFeatureTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        X_new = X.copy()
        
        if 'TransactionDT' in X_new.columns:
            # Convert to days
            X_new['TransactionDay'] = X_new['TransactionDT'] / (24 * 60 * 60)
            
            # Create cyclical features for day of week
            X_new['DayOfWeek_sin'] = np.sin(2 * np.pi * (X_new['TransactionDay'] % 7) / 7)
            X_new['DayOfWeek_cos'] = np.cos(2 * np.pi * (X_new['TransactionDay'] % 7) / 7)
            
            # Create hour of day
            X_new['Hour'] = (X_new['TransactionDT'] % (24 * 60 * 60)) / 3600
            
            # Cyclical features for hour
            X_new['Hour_sin'] = np.sin(2 * np.pi * X_new['Hour'] / 24)
            X_new['Hour_cos'] = np.cos(2 * np.pi * X_new['Hour'] / 24)
            
            # Drop original
            X_new = X_new.drop(['TransactionDT', 'Hour'], axis=1)
        
        return X_new

class LogTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        X_new = X.copy()
        
        if 'TransactionAmt' in X_new.columns:
            X_new['TransactionAmt_Log'] = np.log1p(X_new['TransactionAmt'])
        
        return X_new

class CategoryEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, max_categories=20):
        self.max_categories = max_categories
        self.encodings = {}
        self.frequency_maps = {}
        
    def fit(self, X, y=None):
        categorical_cols = X.select_dtypes(include=['object']).columns
        
        for col in categorical_cols:
            if X[col].nunique() < self.max_categories:
                # Get dummies for low cardinality
                dummies = pd.get_dummies(X[col], prefix=col, drop_first=True)
                self.encodings[col] = dummies.columns.tolist()
            else:
                # Frequency encoding for high cardinality
                self.frequency_maps[col] = X[col].value_counts(normalize=True).to_dict()
        
        return self
        
    def transform(self, X):
        X_new = X.copy()
        
        # Apply encodings
        for col, dummy_cols in self.encodings.items():
            if col in X_new.columns:
                # Create one-hot encoding
                dummies = pd.get_dummies(X_new[col], prefix=col, drop_first=True)
                
                # Ensure all expected columns exist
                for dummy_col in dummy_cols:
                    if dummy_col not in dummies.columns:
                        dummies[dummy_col] = 0
                
                # Only keep columns from training
                dummies = dummies[dummy_cols]
                
                # Add to dataframe
                X_new = pd.concat([X_new, dummies], axis=1)
                
                # Drop original
                X_new = X_new.drop(columns=[col])
        
        # Apply frequency encoding
        for col, freq_map in self.frequency_maps.items():
            if col in X_new.columns:
                X_new[f'{col}_freq'] = X_new[col].map(freq_map).fillna(0)
                X_new = X_new.drop(columns=[col])
        
        return X_new

class InteractionFeatureCreator(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.interaction_features = []
        self.card_cols = []
        self.addr_cols = []
        self.email_cols = []
        
    def _clean_feature_name(self, name):
        if isinstance(name, str):
            # Replace spaces, special chars with underscores
            return name.replace(' ', '_').replace('-', '_').replace('/', '_')
        return str(name)
    
    def fit(self, X, y=None):
        # Store the original column lists for later use
        self.card_cols = [col for col in X.columns if 'card' in col.lower() 
                     and X[col].dtype != 'object' and not pd.api.types.is_bool_dtype(X[col])]
        
        self.addr_cols = [col for col in X.columns if 'addr' in col.lower() 
                     and X[col].dtype != 'object' and not pd.api.types.is_bool_dtype(X[col])]
        
        self.email_cols = [col for col in X.columns if 'email' in col.lower() 
                      and X[col].dtype != 'object' and not pd.api.types.is_bool_dtype(X[col])]
        
        # Pre-compute which interaction features will be created
        self.interaction_features = []
        
        # Card interactions
        if len(self.card_cols) >= 2:
            for i in range(min(3, len(self.card_cols)-1)):
                for j in range(i+1, min(i+3, len(self.card_cols))):
                    col_name = f"{self.card_cols[i]}_x_{self.card_cols[j]}"
                    self.interaction_features.append(col_name)
        
        # Address interactions
        if len(self.addr_cols) >= 2:
            for i in range(min(3, len(self.addr_cols)-1)):
                for j in range(i+1, min(i+3, len(self.addr_cols))):
                    col_name = f"{self.addr_cols[i]}_x_{self.addr_cols[j]}"
                    self.interaction_features.append(col_name)
        
        # Email interactions
        if len(self.email_cols) >= 2:
            for i in range(len(self.email_cols)-1):
                for j in range(i+1, len(self.email_cols)):
                    col_name = f"{self.email_cols[i]}_x_{self.email_cols[j]}"
                    self.interaction_features.append(col_name)
        
        return self
        
    def transform(self, X):
        X_new = X.copy()
        
        # Create card interactions
        if len(self.card_cols) >= 2:
            for i in range(min(3, len(self.card_cols)-1)):
                for j in range(i+1, min(i+3, len(self.card_cols))):
                    # Only create features that were in the training set
                    col_name = f"{self.card_cols[i]}_x_{self.card_cols[j]}"
                    if col_name in self.interaction_features:
                        # Ensure both columns exist in X
                        if self.card_cols[i] in X_new.columns and self.card_cols[j] in X_new.columns:
                            X_new[col_name] = X_new[self.card_cols[i]] * X_new[self.card_cols[j]]
        
        # Create addr interactions
        if len(self.addr_cols) >= 2:
            for i in range(min(3, len(self.addr_cols)-1)):
                for j in range(i+1, min(i+3, len(self.addr_cols))):
                    col_name = f"{self.addr_cols[i]}_x_{self.addr_cols[j]}"
                    if col_name in self.interaction_features:
                        if self.addr_cols[i] in X_new.columns and self.addr_cols[j] in X_new.columns:
                            X_new[col_name] = X_new[self.addr_cols[i]] * X_new[self.addr_cols[j]]
        
        # Create email domain interactions
        if len(self.email_cols) >= 2:
            for i in range(len(self.email_cols)-1):
                for j in range(i+1, len(self.email_cols)):
                    col_name = f"{self.email_cols[i]}_x_{self.email_cols[j]}"
                    if col_name in self.interaction_features:
                        if self.email_cols[i] in X_new.columns and self.email_cols[j] in X_new.columns:
                            X_new[col_name] = X_new[self.email_cols[i]] * X_new[self.email_cols[j]]
        
        return X_new

class CorrelationFilter(BaseEstimator, TransformerMixin):
    
    def __init__(self, threshold=0.9):
        self.threshold = threshold
        self.drop_cols = None
        
    def fit(self, X, y=None):
        # Calculate correlation matrix
        corr_matrix = X.corr().abs()
        
        # Get upper triangle
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        
        # Find features with correlation greater than threshold
        self.drop_cols = [column for column in upper.columns if any(upper[column] > self.threshold)]
        
        return self
        
    def transform(self, X):
        return X.drop(columns=self.drop_cols, errors='ignore')


 # Data Loading and Preparation


In [11]:
print("Loading data...")

train_transaction = pd.read_csv('./data/train_transaction.csv')
test_transaction = pd.read_csv('./data/test_transaction.csv')

train_identity = pd.read_csv('./data/train_identity.csv')
test_identity = pd.read_csv('./data/test_identity.csv')

print(f"Train transaction shape: {train_transaction.shape}")
print(f"Test transaction shape: {test_transaction.shape}")
print(f"Train identity shape: {train_identity.shape}")
print(f"Test identity shape: {test_identity.shape}")

with mlflow.start_run(run_name=f"{model_name}_Initial_Preparation") as run:
    print("Merging data...")
    
    train = train_transaction.merge(train_identity, on='TransactionID', how='left')
    test = test_transaction.merge(test_identity, on='TransactionID', how='left')
    
    mlflow.log_param("train_original_shape", train.shape)
    mlflow.log_param("test_original_shape", test.shape)
    
    del train_transaction, train_identity
    gc.collect()
    
    target = 'isFraud'
    y_train = train[target].copy()
    train_transaction_id = train['TransactionID'].copy()
    test_transaction_id = test['TransactionID'].copy()
    
    fraud_ratio = y_train.mean()
    mlflow.log_param("fraud_ratio", fraud_ratio)
    print(f"Fraud ratio: {fraud_ratio:.4f}")
    
    X_train = train.drop(['isFraud'], axis=1)
    
    del train
    gc.collect()

Loading data...
Train transaction shape: (590540, 394)
Test transaction shape: (506691, 393)
Train identity shape: (144233, 41)
Test identity shape: (141907, 41)
Merging data...
Fraud ratio: 0.0350
🏃 View run LogisticRegression_Initial_Preparation at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/1/runs/722b5878acff4c03ab3c6ce62a9a1a7f
🧪 View experiment at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/1


 
 # Data Exploration for Pipeline Development


In [12]:

with mlflow.start_run(run_name=f"{model_name}_Exploration") as run:
    categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    numerical_features.remove('TransactionID')  # Remove ID column
    
    mlflow.log_param("categorical_features_count", len(categorical_features))
    mlflow.log_param("numerical_features_count", len(numerical_features))
    
    missing_values = X_train.isnull().mean() * 100
    high_missing_cols = missing_values[missing_values > 50].index.tolist()
    
    mlflow.log_param("high_missing_cols_count", len(high_missing_cols))
    
    mlflow.log_param("transaction_amount_mean", X_train['TransactionAmt'].mean())
    mlflow.log_param("transaction_amount_std", X_train['TransactionAmt'].std())
    
    print(f"Categorical features: {len(categorical_features)}")
    print(f"Numerical features: {len(numerical_features)}")
    print(f"High missing columns: {len(high_missing_cols)}")

Categorical features: 31
Numerical features: 401
High missing columns: 214
🏃 View run LogisticRegression_Exploration at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/1/runs/5f8d83e032df43758700ce6e977f7f1a
🧪 View experiment at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/1


# Creating pipeline

In [13]:
# ----------------------------------------------------------------------------
# ----------------------------------------------------------------------------

with mlflow.start_run(run_name=f"{model_name}_Pipeline_Testing") as run:
    
    print("Testing MissingValueHandler...")
    missing_handler = MissingValueHandler(threshold=50)
    missing_handler.fit(X_train)
    X_sample = missing_handler.transform(X_train.iloc[:1000])
    mlflow.log_param("missing_handler_removed_cols", len(missing_handler.high_missing_cols) + len(missing_handler.constant_features))
    
    print("Testing DatetimeFeatureTransformer...")
    dt_transformer = DatetimeFeatureTransformer()
    X_sample = dt_transformer.transform(X_sample)
    
    print("Testing LogTransformer...")
    log_transformer = LogTransformer()
    X_sample = log_transformer.transform(X_sample)
    
    print("Testing OutlierHandler...")
    outlier_handler = OutlierHandler()
    outlier_handler.fit(X_train)
    X_sample = outlier_handler.transform(X_sample)
    
    print("Testing CategoryEncoder...")
    cat_encoder = CategoryEncoder()
    cat_encoder.fit(X_train)
    X_sample = cat_encoder.transform(X_sample)
    
    print("Testing InteractionFeatureCreator...")
    interaction_creator = InteractionFeatureCreator()
    X_sample = interaction_creator.transform(X_sample)
    
    print("Testing CorrelationFilter...")
    corr_filter = CorrelationFilter()
    try:
        corr_filter.fit(X_sample)
        X_sample = corr_filter.transform(X_sample)
    except Exception as e:
        print(f"CorrelationFilter test error: {e}")
    
    print("Testing FeatureSelector...")
    feature_selector = FeatureSelector(k=100)
    try:
        y_sample = y_train.iloc[:1000]
        feature_selector.fit(X_sample, y_sample)
        X_sample = feature_selector.transform(X_sample)
    except Exception as e:
        print(f"FeatureSelector test error: {e}")
    
    mlflow.log_param("pipeline_testing_successful", True)
    mlflow.log_param("sample_features_after_transforms", X_sample.shape[1])

    
def create_logistic_regression_pipeline(C=1.0, penalty='l2', class_weight='balanced', 
                                      solver='saga', l1_ratio=0.5, max_iter=1000):
    preprocessor = Pipeline([
        ('missing_handler', MissingValueHandler(threshold=50)),
        ('datetime_transformer', DatetimeFeatureTransformer()),
        ('log_transformer', LogTransformer()),
        ('outlier_handler', OutlierHandler(q_low=0.01, q_high=0.99)),
        ('category_encoder', CategoryEncoder(max_categories=20)),
        ('interaction_creator', InteractionFeatureCreator()),
        ('corr_filter', CorrelationFilter(threshold=0.9)),
    ])
    
    model = LogisticRegression(
        C=C,
        penalty=penalty,
        class_weight=class_weight,
        solver=solver,
        l1_ratio=l1_ratio if penalty == 'elasticnet' else None,
        max_iter=max_iter,
        random_state=42,
        n_jobs=-1
    )
    
    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('feature_selector', FeatureSelector(k=100)),
        ('classifier', model)
    ])
    
    return full_pipeline

Testing MissingValueHandler...
Testing DatetimeFeatureTransformer...
Testing LogTransformer...
Testing OutlierHandler...
Testing CategoryEncoder...
Testing InteractionFeatureCreator...
Testing CorrelationFilter...
Testing FeatureSelector...
🏃 View run LogisticRegression_Pipeline_Testing at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/1/runs/605b01dd54b3408c805346bcca98d49b
🧪 View experiment at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/1





# Data Preprocessing and training

In [14]:
def train(X_train, y_train, X_val=None, y_val=None, C=1.0, penalty='l2', max_iter=1000):
    
    preprocessor = Pipeline([
        ('missing_handler', MissingValueHandler(threshold=50)),
        ('datetime_transformer', DatetimeFeatureTransformer()),
        ('log_transformer', LogTransformer()),
        ('outlier_handler', OutlierHandler(q_low=0.01, q_high=0.99)),
        ('category_encoder', CategoryEncoder(max_categories=20)),
        ('interaction_creator', InteractionFeatureCreator()),
        ('corr_filter', CorrelationFilter(threshold=0.9)),
    ])
    
    print("Preprocessing data with sklearn pipeline...")
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_val_preprocessed = preprocessor.transform(X_val) if X_val is not None else None
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_preprocessed)
    X_val_scaled = scaler.transform(X_val_preprocessed) if X_val_preprocessed is not None else None
    
    selector = FeatureSelector(k=100)
    X_train_selected = selector.fit_transform(X_train_scaled, y_train)
    X_val_selected = selector.transform(X_val_scaled) if X_val_scaled is not None else None
    

        
    model = LogisticRegression(
            C=C,
            penalty=penalty if penalty != 'elasticnet' else 'l1',  # elasticnet not supported without saga
            solver='saga' if penalty == 'elasticnet' else 'liblinear',
            l1_ratio=0.5 if penalty == 'elasticnet' else None,
            max_iter=max_iter,
            random_state=42
        )
    model.fit(X_train_selected, y_train)
        
    if X_val_selected is not None:
        y_val_pred = model.predict_proba(X_val_selected)[:, 1]
    
    if X_val_selected is not None:
        val_auc = roc_auc_score(y_val, y_val_pred)
        precision, recall, _ = precision_recall_curve(y_val, y_val_pred)
        pr_auc = auc(recall, precision)
        
        print(f"Validation AUC: {val_auc:.4f}")
        print(f"Validation PR-AUC: {pr_auc:.4f}")
        
        return model, preprocessor, scaler, selector, val_auc, pr_auc
    
    return model, preprocessor, scaler, selector, None, None

# CrossValidation and tuning

In [15]:
with mlflow.start_run(run_name=f"{model_name}_Cross_Validation") as run:
    X_train_cv, X_val, y_train_cv, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )
    
    mlflow.log_param("validation_split", 0.2)
    mlflow.log_param("random_state", 42)
    mlflow.log_param("train_size", X_train_cv.shape[0])
    mlflow.log_param("validation_size", X_val.shape[0])
    
    model, preprocessor, scaler, selector, val_auc, pr_auc = train(
        X_train_cv, y_train_cv, X_val, y_val, C=1.0, penalty='l2', max_iter=1000
    )
    
    mlflow.log_metric("validation_auc", val_auc)
    mlflow.log_metric("validation_pr_auc", pr_auc)
    
    mlflow.log_param("model_type", "cuML_LogisticRegression")
    mlflow.log_param("C", 1.0)
    mlflow.log_param("penalty", "l2")
    mlflow.log_param("max_iter", 1000)
    
    print(f"GPU-accelerated training complete.")
    print(f"AUC: {val_auc:.4f}")
    print(f"PR AUC: {pr_auc:.4f}")
    


with mlflow.start_run(run_name=f"{model_name}_Hyperparameter_Tuning") as run:
    C_values = [0.01, 0.1, 1.0, 10.0]
    
    mlflow.log_param("tuning_C_values", C_values)
    
    X_tune, X_val_tune, y_tune, y_val_tune = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )
    
    best_auc = 0
    best_C = None
    best_components = None
    
    for C in C_values:
        print(f"Testing C={C}...")
        
        model, preprocessor, scaler, selector, val_auc, pr_auc = train(
            X_tune, y_tune, X_val_tune, y_val_tune, C=C, penalty='l2', max_iter=1000
        )
        
        mlflow.log_metric(f"C_{C}_auc", val_auc)
        mlflow.log_metric(f"C_{C}_pr_auc", pr_auc)
        
        if val_auc > best_auc:
            best_auc = val_auc
            best_C = C
            best_components = (model, preprocessor, scaler, selector)
    
    mlflow.log_param("best_C", best_C)
    mlflow.log_metric("best_cv_auc", best_auc)
    best_params = {
    'classifier__C': best_C,
    'classifier__penalty': 'l2', 
    'classifier__class_weight': 'balanced'  
    
}

    print(f"Best parameters: {best_params}")
    print(f"Best C value: {best_C}")
    print(f"Best CV AUC: {best_auc:.4f}")


Preprocessing data with sklearn pipeline...
Validation AUC: 0.8232
Validation PR-AUC: 0.3216
GPU-accelerated training complete.
AUC: 0.8232
PR AUC: 0.3216
🏃 View run LogisticRegression_Cross_Validation at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/1/runs/b5029fd46f074b4d811eea076579a08d
🧪 View experiment at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/1
Testing C=0.01...
Preprocessing data with sklearn pipeline...
Validation AUC: 0.8232
Validation PR-AUC: 0.3214
Testing C=0.1...
Preprocessing data with sklearn pipeline...
Validation AUC: 0.8232
Validation PR-AUC: 0.3215
Testing C=1.0...
Preprocessing data with sklearn pipeline...
Validation AUC: 0.8232
Validation PR-AUC: 0.3216
Testing C=10.0...
Preprocessing data with sklearn pipeline...
Validation AUC: 0.8232
Validation PR-AUC: 0.3216
Best parameters: {'classifier__C': 1.0, 'classifier__penalty': 'l2', 'classifier__class_weight': 'balanced'}
Best C value: 1.0
Best CV AUC: 0.8232
🏃

# Final Training with Full Dataset


In [16]:
with mlflow.start_run(run_name=f"{model_name}_Final_Training") as run:
    try:
        final_params = {param.replace('classifier__', ''): value 
                       for param, value in best_params.items()}
    except NameError:
        print("best_params not found. Using default parameters.")
        final_params = {
            'C': 1.0,
            'penalty': 'l2',
            'class_weight': 'balanced'
        }
    
    mlflow.log_params(final_params)
    
  
    final_pipeline = create_logistic_regression_pipeline(
        C=final_params.get('C', 1.0),
        penalty=final_params.get('penalty', 'l2'),
        class_weight=final_params.get('class_weight', 'balanced'),
        solver='saga',
        l1_ratio=0.5 if final_params.get('penalty') == 'elasticnet' else None,
        max_iter=1000
    )
    
    print("Training final model on full dataset...")
    final_pipeline.fit(X_train, y_train)
    
    try:
        print("Testing pipeline on test data...")
        
        original_test_columns = test.columns.tolist()
        
        columns_to_keep = [col for col in original_test_columns if col in X_train.columns]
        test_matched = test[columns_to_keep]
        
        for col in X_train.columns:
            if col not in test_matched.columns:
                if X_train[col].dtype in ['int64', 'float64']:
                    test_matched[col] = X_train[col].median()
                else:
                    test_matched[col] = X_train[col].mode()[0]
        
        test_probs = final_pipeline.predict_proba(test_matched)[:, 1]
        
    except Exception as e:
        print(f"Error during prediction: {e}")
        print("Falling back to basic prediction approach")
        simple_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('classifier', LogisticRegression(C=1.0, max_iter=1000))
        ])
        
        numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
        X_train_simple = X_train[numeric_cols].copy()
        test_simple = test[numeric_cols.intersection(test.columns)].copy()
        
        for col in numeric_cols:
            if col not in test_simple.columns:
                test_simple[col] = 0
        
        test_simple = test_simple[X_train_simple.columns]
        
        X_train_simple.fillna(0, inplace=True)
        test_simple.fillna(0, inplace=True)
        
        simple_pipeline.fit(X_train_simple, y_train)
        test_probs = simple_pipeline.predict_proba(test_simple)[:, 1]
    
    submission = pd.DataFrame({
        'TransactionID': test_transaction_id,
        'isFraud': test_probs
    })
    
    submission_file = f"logistic_regression_submission_{datetime.now().strftime('%Y%m%d_%H%M')}.csv"
    submission.to_csv(submission_file, index=False)
    
    mlflow.log_artifact(submission_file)
    
    mlflow.sklearn.log_model(final_pipeline, "final_pipeline")
    
    model_registry_name = f"{model_name}_Pipeline"
    model_description = f"Full {model_name} pipeline including all preprocessing steps"
    
    try:
        mlflow.register_model(
            f"runs:/{run.info.run_id}/final_pipeline",
            model_registry_name,
            tags={"description": model_description}
        )
        print(f"Final model registered as: {model_registry_name}")
    except Exception as e:
        print(f"Error registering model: {e}")
    
    print(f"Submission file saved as: {submission_file}")

Training final model on full dataset...
Testing pipeline on test data...


Registered model 'LogisticRegression_Pipeline' already exists. Creating a new version of this model...
2025/04/23 15:47:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression_Pipeline, version 7
Created version '7' of model 'LogisticRegression_Pipeline'.


Final model registered as: LogisticRegression_Pipeline
Submission file saved as: logistic_regression_submission_20250423_1547.csv
🏃 View run LogisticRegression_Final_Training at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/1/runs/801ad630fecf4378848290d423e050f0
🧪 View experiment at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/1


# Feature analyisis

In [17]:
with mlflow.start_run(run_name=f"{model_name}_Feature_Analysis") as run:
    try:
        lr_model = final_pipeline.named_steps['classifier']
        
        if hasattr(lr_model, 'coef_'):
            preprocessor = final_pipeline.named_steps['preprocessor']
            X_sample = preprocessor.transform(X_train.head(1))
            
            try:
                feature_names = preprocessor.get_feature_names_out()
            except:
                feature_names = [f"feature_{i}" for i in range(X_sample.shape[1])]
            
            coefficients = np.abs(lr_model.coef_[0])
            
            importance_df = pd.DataFrame({
                'Feature': feature_names,
                'Importance': coefficients
            }).sort_values('Importance', ascending=False)
            
            plt.figure(figsize=(12, 8))
            sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))
            plt.title('Top 20 Features by Importance')
            plt.tight_layout()
            
            importance_plot = "logistic_regression_feature_importance.png"
            plt.savefig(importance_plot)
            mlflow.log_artifact(importance_plot)
            
            mlflow.log_param("top_features", importance_df['Feature'].head(20).tolist())
            mlflow.log_param("top_feature_importances", importance_df['Importance'].head(20).tolist())
            
            print("Feature importance analysis complete.")
        else:
            print("Model does not have coefficient attributes.")
            mlflow.log_param("feature_importance_analysis", "model_has_no_coefficients")
    except Exception as e:
        print(f"Feature importance analysis failed: {e}")
        mlflow.log_param("feature_importance_analysis", "failed")

Feature importance analysis failed: All arrays must be of the same length
🏃 View run LogisticRegression_Feature_Analysis at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/1/runs/9ddc65cc62814ac0a531a898a91875df
🧪 View experiment at: https://dagshub.com/g-kitiashvili/ML-assignment2.mlflow/#/experiments/1
