In [7]:
from collections import Counter as cnt

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV
from sklearn.metrics import classification_report,confusion_matrix,roc_curve,auc
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures

from preprocessing import transform_variables

In [8]:
# Load the training data and preview its first rows.
train = pd.read_csv("data/synth/train.csv")
train.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [9]:
# Keep the target column aside, then build the model-input frame with the project helper.
# NOTE(review): transform_variables lives in preprocessing.py (not shown here); it presumably
# creates the derived columns named in the feature lists used later — confirm.
y = train["loan_status"]
train_pp = transform_variables(train)

In [10]:
# Column groups, one list per kind of feature.
num_features = [
    "person_age",
    "log_income",
    "person_emp_length",
    "cb_person_cred_hist_length",
    "loan_amnt",
    "loan_percent_income",
    "loan_grade_num",
]
cat_features = [
    "person_home_ownership",
    "loan_intent",
]
bool_features = [
    "cb_person_default_on_file_bool",
]

In [13]:
# Numeric columns: standardise.
numeric_preprocessor = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical columns: one-hot encode; handle_unknown="ignore" keeps transform from
# failing on categories that were not present when the encoder was fitted.
categorical_preprocessor = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each column group to its own transformer; boolean columns are forwarded unchanged.
# No `remainder` is given, so columns outside the three groups get ColumnTransformer's default handling.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", numeric_preprocessor, num_features),
        ("categorical", categorical_preprocessor, cat_features),
        ("boolean", "passthrough", bool_features),
    ]
)


## Training linear models

In [6]:
from sklearn.utils.extmath import softmax


class RidgeClassifierCVwithProba(RidgeClassifierCV):
    """RidgeClassifierCV extended with a ``predict_proba`` built from the decision function."""

    def predict_proba(self, X):
        """Return softmax-normalised scores with one row per sample.

        The decision values ``d`` are stacked as the two columns ``[-d, d]`` and passed
        through a softmax, so column 1 grows with the decision value. These are
        score-derived pseudo-probabilities, not calibrated probabilities.
        """
        margin = self.decision_function(X)
        stacked = np.column_stack((-margin, margin))
        return softmax(stacked)

In [11]:
# Each pipeline gets its OWN copy of the preprocessor: pipeline steps are fitted in place,
# so sharing one ColumnTransformer instance would let the most recently fitted pipeline
# overwrite the scaler/encoder state that every other pipeline transforms with.

# Elastic-net logistic regression, unweighted and class-balanced.
pipe_lrcv = make_pipeline(clone(preprocessor), PolynomialFeatures(2),
                          LogisticRegressionCV(n_jobs=-1, penalty='elasticnet', solver='saga', max_iter=1000,
                                               l1_ratios=[0, 0.01, 0.25, 0.5, 0.75, 0.99, 1]))
pipe_lrcv_weighted = make_pipeline(clone(preprocessor), PolynomialFeatures(2),
                                   LogisticRegressionCV(n_jobs=-1, penalty='elasticnet', solver='saga', max_iter=1000,
                                                        l1_ratios=[0, 0.01, 0.25, 0.5, 0.75, 0.99, 1],
                                                        class_weight="balanced"))

# Ridge classifier (with the predict_proba wrapper), unweighted and class-balanced.
pipe_rccv = make_pipeline(clone(preprocessor), PolynomialFeatures(2), RidgeClassifierCVwithProba())
pipe_rccv_weighted = make_pipeline(clone(preprocessor), PolynomialFeatures(2),
                                   RidgeClassifierCVwithProba(class_weight="balanced"))

NameError: name 'preprocessor' is not defined

### One-off validation

In [39]:
# Single hold-out check of one chosen pipeline: 80/20 split with a fixed seed.
pipe = pipe_rccv_weighted

X_train, X_test, y_train, y_test = train_test_split(
    train_pp,
    y,
    test_size=0.2,
    random_state=42,
)

# Train on the 80% portion only.
pipe.fit(X_train, y_train)

# Score the held-out rows; column 1 of predict_proba is the score for target 1.
y_hat = pipe.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_hat[:, 1])
auc(fpr, tpr)

np.float64(0.9248266935155015)

### Stratified cross-validation

In [8]:
# Pair every pipeline with the short label used in the logs and in submission file names,
# then expose them as the two parallel lists the loops below iterate over.
labelled_pipes = [
    ("LR", pipe_lrcv),
    ("LRw", pipe_lrcv_weighted),
    ("RC", pipe_rccv),
    ("RCw", pipe_rccv_weighted),
]
pipe_list = [model for _, model in labelled_pipes]
names = [label for label, _ in labelled_pipes]

In [43]:
# Stratified k-fold comparison of all pipelines; scores[i, fold] is the ROC AUC of
# pipeline i on the validation part of that fold.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=52)
splt = skf.split(train_pp, y)

scores = np.zeros((len(pipe_list), n_splits))

for fold, (train_idx, valid_idx) in enumerate(splt):
    print(f'### Fold {fold+1} Training ###')
    # split() yields POSITIONAL row numbers, so select with .iloc. Label-based
    # selection (.loc / y[...]) only coincides with positions while the index is a
    # default RangeIndex and would pick wrong or missing rows otherwise.
    X_train = train_pp.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_valid = train_pp.iloc[valid_idx]
    y_valid = y.iloc[valid_idx]

    for i, pipe in enumerate(pipe_list):
        print(f'Predicting with model {names[i]} ###')
        pipe.fit(X_train, y_train)
        y_hat = pipe.predict_proba(X_valid)
        fpr,tpr,threshold = roc_curve(y_valid, y_hat[:,1]) #target 1
        score = auc(fpr,tpr)
        print(f'Score: {score}')
        scores[i, fold] = score

print(f'Scores: {scores}')

### Fold 1 Training ###
Predicting with model LR ###




Score: 0.916582820044373
Predicting with model LRw ###




Score: 0.8774970488489171
Predicting with model RC ###
Score: 0.9086729017360449
Predicting with model RCw ###
Score: 0.9163596457547178
### Fold 2 Training ###
Predicting with model LR ###




Score: 0.9192853779467606
Predicting with model LRw ###




Score: 0.8758986649427063
Predicting with model RC ###
Score: 0.9114246901365773
Predicting with model RCw ###
Score: 0.9200194302715773
### Fold 3 Training ###
Predicting with model LR ###




Score: 0.9168004581353248
Predicting with model LRw ###




Score: 0.8800310205714428
Predicting with model RC ###
Score: 0.9075686384463402
Predicting with model RCw ###
Score: 0.9152129382749562
### Fold 4 Training ###
Predicting with model LR ###




Score: 0.9191977214672951
Predicting with model LRw ###




Score: 0.8750894572322698
Predicting with model RC ###
Score: 0.9125419605167835
Predicting with model RCw ###
Score: 0.9191877205922185
### Fold 5 Training ###
Predicting with model LR ###




Score: 0.9164063165050751
Predicting with model LRw ###




Score: 0.8770549863589254
Predicting with model RC ###
Score: 0.9086760270095062
Predicting with model RCw ###
Score: 0.9157878695338224
Scores: [[0.91658282 0.91928538 0.91680046 0.91919772 0.91640632]
 [0.87749705 0.87589866 0.88003102 0.87508946 0.87705499]
 [0.9086729  0.91142469 0.90756864 0.91254196 0.90867603]
 [0.91635965 0.92001943 0.91521294 0.91918772 0.91578787]]


### Training on full data

In [17]:
# Load the test data and run it through the same project helper that produced train_pp.
test = pd.read_csv("data/synth/test.csv")
test_pp = transform_variables(test)

In [10]:
# Refit every pipeline on the full training data and write one submission file per model.
for i, pipe in enumerate(pipe_list):
    label = names[i]
    print(f'Predicting with model {label} ###')
    pipe.fit(train_pp, y)
    y_hat = pipe.predict_proba(test_pp)
    # Column 1 holds the score for loan_status == 1.
    submission = pd.DataFrame({"id": test.id, "loan_status": y_hat[:, 1]})
    out_path = f"data/submissions/{label}.csv"
    submission.to_csv(out_path, index=False)

Predicting with model LR ###




Predicting with model LRw ###




Predicting with model RC ###
Predicting with model RCw ###


### Creating an ensemble

In [18]:
# Read back each model's submission file, indexed by "id" so the frames align row-wise.
ensemble = [
    pd.read_csv(f"data/submissions/{name}.csv", index_col="id")
    for name in names
]

In [28]:
# Put the per-model predictions side by side (aligned on the index) and average them per row.
df_concat = pd.concat(ensemble, axis=1)
df_ens = df_concat.mean(axis=1)

In [35]:
# Write the row-wise average of the model predictions as the ensemble submission.
# Only df_ens is needed here, so no model is re-evaluated in this cell.
submission = pd.DataFrame({"id": df_ens.index, "loan_status": df_ens.values})
submission.to_csv("data/submissions/ensemble.csv", index=False)

## Using GridSearchCV for PAC, Perceptron, SGD and co.

In [71]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron, PassiveAggressiveClassifier, Perceptron, SGDClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV

### Grid search for PAC

In [29]:
from sklearn.utils.extmath import softmax


class PassiveAggressiveClassifierwithProba(PassiveAggressiveClassifier):
    """PassiveAggressiveClassifier extended with a ``predict_proba`` built from the decision function."""

    def predict_proba(self, X):
        """Return softmax-normalised scores with one row per sample.

        The decision values ``d`` are stacked as the two columns ``[-d, d]`` and passed
        through a softmax, so column 1 grows with the decision value. These are
        score-derived pseudo-probabilities, not calibrated probabilities.
        """
        margin = self.decision_function(X)
        stacked = np.column_stack((-margin, margin))
        return softmax(stacked)

In [30]:
# Hyper-parameter grid for the passive-aggressive classifier.
params_grid_pac = {
    'C': [1e-5, 1e-3, 0.1, 1, 10, 25, 100, 500, 1000],
    'loss': ['hinge', 'squared_hinge'],
    'class_weight': [None, 'balanced']
}
# NOTE(review): no `scoring` is passed, so the search ranks candidates with the estimator's
# default scorer, whereas the cross-validation section above reports ROC AUC — confirm this
# is intended before comparing the numbers.
grid_pac = GridSearchCV(PassiveAggressiveClassifierwithProba(), n_jobs=-1, param_grid=params_grid_pac, cv=5, refit=True)
# Use a private copy of the preprocessor: pipeline steps are fitted in place, so a shared
# instance would be overwritten by whichever pipeline is fitted last.
pipe_grid_pac = make_pipeline(clone(preprocessor), PolynomialFeatures(2), grid_pac)

In [31]:
# Run the PAC grid search on the full training data (refit=True keeps the best model fitted).
pipe_grid_pac.fit(train_pp, y)

In [66]:
def extract_cv_scores(pipe):
    """Summarise the cross-validation score of the top-ranked grid-search candidate.

    Parameters
    ----------
    pipe
        Object indexable by step name whose ``"gridsearchcv"`` entry exposes a fitted
        ``cv_results_`` mapping (e.g. a pipeline ending in a GridSearchCV step).

    Returns
    -------
    dict
        ``{"mean": ..., "std": ...}`` — mean and standard deviation of the test score
        across folds for the candidate with the lowest ``rank_test_score``.
    """
    results = pipe["gridsearchcv"].cv_results_
    winner = results["rank_test_score"].argmin()
    return {
        "mean": results["mean_test_score"][winner],
        "std": results["std_test_score"][winner],
    }

In [67]:
# Mean/std of the CV score for the best PAC parameter combination.
extract_cv_scores(pipe_grid_pac)

{'mean': np.float64(0.9291329184073664),
 'std': np.float64(0.0017550883913283015)}

### Grid Search for Perceptron

In [63]:
class PerseptronwithProba(Perceptron):
    """Perceptron extended with a ``predict_proba`` built from the decision function."""

    def predict_proba(self, X):
        """Return softmax-normalised scores with one row per sample.

        The decision values ``d`` are stacked as the two columns ``[-d, d]`` and passed
        through a softmax, so column 1 grows with the decision value. These are
        score-derived pseudo-probabilities, not calibrated probabilities.
        """
        margin = self.decision_function(X)
        stacked = np.column_stack((-margin, margin))
        return softmax(stacked)

In [64]:
# Hyper-parameter grid for the perceptron, split in two so that `l1_ratio` is only varied
# where it has an effect: it is used solely with penalty='elasticnet'. Crossing it with the
# other penalties would refit identical models 8 times each (576 candidates instead of 198).
params_grid_per = [
    {
        'penalty': ['l2', 'l1', None],
        'alpha': [1e-5, 1e-3, 0.1, 1, 10, 25, 100, 500, 1000],
        'class_weight': [None, 'balanced']
    },
    {
        'penalty': ['elasticnet'],
        'alpha': [1e-5, 1e-3, 0.1, 1, 10, 25, 100, 500, 1000],
        'l1_ratio': [0, 0.01, 0.1, 0.25, 0.5, 0.75, 0.99, 1],
        'class_weight': [None, 'balanced']
    },
]
grid_per = GridSearchCV(PerseptronwithProba(), n_jobs=-1, param_grid=params_grid_per, cv=5, refit=True)
# Use a private copy of the preprocessor: pipeline steps are fitted in place, so a shared
# instance would be overwritten by whichever pipeline is fitted last.
pipe_grid_per = make_pipeline(clone(preprocessor), PolynomialFeatures(2), grid_per)

In [65]:
# Run the perceptron grid search on the full training data.
pipe_grid_per.fit(train_pp, y)

In [70]:
# Show the perceptron refit with the best parameter combination found by the search.
pipe_grid_per["gridsearchcv"].best_estimator_

In [68]:
# Mean/std of the CV score for the best perceptron parameter combination.
extract_cv_scores(pipe_grid_per)

{'mean': np.float64(0.9052604655128315),
 'std': np.float64(0.0049811550587862755)}

### Grid Search for SGDClassifier

In [78]:
# Hyper-parameter grid for SGDClassifier, split in two so that `l1_ratio` is only varied
# where it has an effect: it is used solely with penalty='elasticnet'. Crossing it with the
# other penalties would refit identical configurations 8 times each
# (5184 candidates instead of 1782).
params_grid_sgd = [
    {
        'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron',
                 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'penalty': ['l2', 'l1', None],
        'alpha': [1e-5, 1e-3, 0.1, 1, 10, 25, 100, 500, 1000],
        'class_weight': [None, 'balanced']
    },
    {
        'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron',
                 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        'penalty': ['elasticnet'],
        'alpha': [1e-5, 1e-3, 0.1, 1, 10, 25, 100, 500, 1000],
        'l1_ratio': [0, 0.01, 0.1, 0.25, 0.5, 0.75, 0.99, 1],
        'class_weight': [None, 'balanced']
    },
]
grid_sgd = GridSearchCV(SGDClassifier(), n_jobs=-1, param_grid=params_grid_sgd, cv=5, refit=True)
# Use a private copy of the preprocessor: pipeline steps are fitted in place, so a shared
# instance would be overwritten by whichever pipeline is fitted last.
pipe_grid_sgd = make_pipeline(clone(preprocessor), PolynomialFeatures(2), grid_sgd)

In [79]:
# Run the SGD grid search on the full training data.
pipe_grid_sgd.fit(train_pp, y)

In [80]:
# Show the SGD classifier refit with the best parameter combination found by the search.
pipe_grid_sgd["gridsearchcv"].best_estimator_

In [96]:
# Mean/std of the CV score for the best SGD parameter combination.
# The fitted pipeline must be passed here: the SGDClassifier class itself has no
# "gridsearchcv" step to look up.
extract_cv_scores(pipe_grid_sgd)

{'mean': np.float64(0.9312473356637394),
 'std': np.float64(0.001364779456305825)}

In [98]:
# Hard class predictions on the training data itself (in-sample look, not a validation estimate).
pipe_grid_sgd.predict(train_pp)

array([0, 0, 0, ..., 0, 0, 0])

### Grid search for KNN

In [90]:
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter grid for k-nearest neighbours.
# `leaf_size` is deliberately not searched: it only tunes the speed/memory of the tree-based
# neighbour lookup and does not change which neighbours are found, so varying it would
# multiply the number of fits by 6 without changing any score.
params_grid_knn = {
    'n_neighbors': [1, 5, 10, 25, 50, 100],
    'p': [1, 2],
    'weights':['uniform', 'distance']
}
grid_knn = GridSearchCV(KNeighborsClassifier(), n_jobs=-1, param_grid=params_grid_knn, cv=5, refit=True)
# Use a private copy of the preprocessor: pipeline steps are fitted in place, so a shared
# instance would be overwritten by whichever pipeline is fitted last.
pipe_grid_knn = make_pipeline(clone(preprocessor), PolynomialFeatures(2), grid_knn)

In [91]:
# Run the KNN grid search on the full training data.
pipe_grid_knn.fit(train_pp, y)

  _data = np.array(data, dtype=dtype, copy=copy,


In [92]:
# Show the KNN model refit with the best parameter combination found by the search.
pipe_grid_knn["gridsearchcv"].best_estimator_

In [93]:
# Mean/std of the CV score for the best KNN parameter combination.
extract_cv_scores(pipe_grid_knn)

{'mean': np.float64(0.9374200699121834),
 'std': np.float64(0.001679402813845358)}

## Prediction

In [99]:
# Models to export; these pipelines were already fitted on the full training data by
# their grid-search cells, so no refit happens here.
pipe_list = [pipe_grid_pac]
names = ["pac"]

for i, pipe in enumerate(pipe_list):
    label = names[i]
    print(f'Predicting with model {label} ###')
    y_hat = pipe.predict_proba(test_pp)
    # Column 1 holds the score for loan_status == 1.
    submission = pd.DataFrame({"id": test.id, "loan_status": y_hat[:, 1]})
    out_path = f"data/submissions/{label}.csv"
    submission.to_csv(out_path, index=False)

Predicting with model pac ###
