Regression Spline Exploration

In [3]:
# Imports
!pip install pygam
import pandas as pd
import numpy as np
import statsmodels as sm
import operator
from functools import reduce
from pygam import LogisticGAM, s
from patsy import dmatrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, SplineTransformer, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, make_scorer, f1_score



In [4]:
# Load data
df = pd.read_csv("train.csv")
X = df.drop(columns=["Y"])
y = df["Y"]
test = pd.read_csv("test.csv")

# Bootstrap Sample Creation 
X_0 = X[y == 0] 
X_1 = X[y == 1] 
num = len(X_0)
X_1_boot = X_1.sample(num, replace = True, random_state = 32)
X_boot = pd.concat([X_0, X_1_boot], axis = 0)
y_boot = np.concatenate([np.zeros(num), np.ones(num)])


In [9]:
# B-Spline Logistic Regression 
spline = SplineTransformer(degree=4, n_knots=4, include_bias=False)
model = Pipeline([('spline', spline), ('logistic', LogisticRegression(penalty='l2', C=8, max_iter = 1000))])

scores = cross_val_score(model, X, y, cv=5, scoring = 'f1')
print("B-Spline Logistic Regression F1 scores:", scores)

# With Bootstrap 
scores = cross_val_score(model, X_boot, y_boot, cv=5, scoring = 'f1')
print("Bootstrapped B-Spline Logistic Regression F1 scores:", scores)

Logistic Regression F1 scores: [0.3030303  0.32106038 0.31065089 0.29411765 0.33576642]
Bootstrapped Logistic Regression F1 scores: [0.92321065 0.9258455  0.92319562 0.92446798 0.92692798]


In [14]:
# # Polynomial Logistic Regression 
# spline = PolynomialFeatures(degree=3, include_bias=False)
# model = Pipeline([('poly', spline), ('logistic', LogisticRegression(penalty='l2', C=11, max_iter = 1000))])

# scores = cross_val_score(model, X, y, cv=5, scoring = 'f1')
# print("Cross-validation F1 scores:", scores)

# # With Bootstrap
# scores = cross_val_score(model, X_boot, y_boot, cv=5, scoring = 'f1')
# print("Bootstrapped Cross-validation F1 scores:", scores)

# Submission Creation 
model.fit(X_boot, y_boot)
test = pd.read_csv("test.csv") 
y_pred = model.predict(test[X_boot.columns]) 

submission = pd.DataFrame({
    'ID': test['ID'] , 
    'Y': y_pred })
submission.to_csv("solution.csv", index = False)


In [5]:
# GAM with Natural Splines
numeric_cols = X.columns.tolist()
spline = SplineTransformer(degree = 3, n_knots = 5, include_bias = False) 
pre = ColumnTransformer([('spline_all', spline, numeric_cols)], remainder = 'drop')
pipe = Pipeline([
    ('spline', pre), 
    ('scaler', StandardScaler()), 
    ('clf', LogisticRegression(penalty= 'l2', C = 1, solver = 'lbfgs', max_iter = 1000, class_weight = 'balanced'))])

param_grid = {
    'spline__spline_all__n_knots': [2,3,4,5,6], 
    'clf__C': [0.01, 0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = 'f1', n_jobs = -1) 
grid.fit(X,y)
print("Best mean CV F1: ", grid.best_score_)

Best mean CV F1:  0.3537714142675027


In [7]:
# Natural Spline GAM With Bootstrap 
numeric_cols = X_boot.columns.tolist()
spline = SplineTransformer(degree = 3, n_knots = 5, include_bias = False) 
pre = ColumnTransformer([('spline_all', spline, numeric_cols)], remainder = 'drop')
pipe = Pipeline([
    ('spline', pre), 
    ('scaler', StandardScaler()), 
    ('clf', LogisticRegression(penalty= 'l2', C = 1, solver = 'lbfgs', max_iter = 1000, class_weight = 'balanced'))])

param_grid = {
    'spline__spline_all__n_knots': [2,3,4,5,6], 
    'clf__C': [0.01, 0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, cv = 5, scoring = 'f1', n_jobs = -1) 
grid.fit(X_boot,y_boot)
print("Best mean CV F1: ", grid.best_score_)

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [7]:
# Logistic GAM 
terms = [s(i, n_splines = 10) for i in range(X.shape[1])]
gam = LogisticGAM(reduce(operator.add, terms))

def gam_cv(model, X, y, cv = 5): 
    kf = KFold(n_splits = cv, shuffle = True, random_state = 42) 
    scores = []
    for train_idx, test_idx in kf.split(X): 
        model.fit(X[train_idx], y[train_idx]) 
        preds = model.predict(X[test_idx])
        scores.append(f1_score(y[test_idx], preds))
    return np.mean(scores)

print("Logistic Gam F1 score:", gam_cv(gam, X.values, y.values))

# With Bootstrap 


Logistic Gam F1 score: 0.35505980939946086


In [5]:
# Logistic GAM with lambda tuning 
terms = [s(i, n_splines = 10) for i in range(X.shape[1])]
gam = LogisticGAM(reduce(operator.add, terms))

gam.gridsearch(X.values, y.values) 
preds = gam.predict(X.values) 
f1 = f1_score(y.values, preds) 
print("Best F1 Score with Tuning:", f1)

[38;2;211;255;0m 72%[39m [38;2;211;255;0m(8 of 11)[39m |##################       | Elapsed Time: 0:16:03 ETA:   0:06:01

KeyboardInterrupt: 