In [1]:
# Have Data Call import Data
%run import_data.py #import data into dataframe, drop ID column, and clean up PAY_0, PAY_X, MARRIAGE, and EDUCATION columns

In [2]:
#### Step 1) Preprocess Data

# We will train our classifier with the following features:
# Numeric features to be scaled: LIMIT_BAL, AGE, PAY_X, BIL_AMTX, and PAY_AMTX
# Categorical features: SEX, EDUCATION, MARRIAGE

# We create the preprocessing pipelines for both numeric and categorical data
numeric_features = ['LIMIT_BAL', 'AGE', 
                     'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 
                     'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'
                   ]



data['PAY_1'] = data.PAY_1.astype('float64')
data['PAY_2'] = data.PAY_2.astype('float64')
data['PAY_3'] = data.PAY_3.astype('float64')
data['PAY_4'] = data.PAY_4.astype('float64')
data['PAY_5'] = data.PAY_5.astype('float64')
data['PAY_6'] = data.PAY_6.astype('float64')
data['AGE'] = data.AGE.astype('float64')

numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())
])

categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(categories='auto'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
        #,('lab', label_transformer, label_features)
    ])

#### Step 2) Split Data into Training and Test Sets

y = data['default']#.values
X = data.drop(['default'], axis=1)#.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)

#### Step 3: Instantiate the Estimator

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
from sklearn.linear_model import LogisticRegression
lr = Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', LogisticRegression(solver='liblinear'))])

#### Step 4: Specify the Hyperparameter Space

param_grid_lr = {
    
    'classifier__C': np.logspace(-5, 8, 15),
    'classifier__penalty': ['l1', 'l2']
}

#### Step 5: Instantiate the CV Object

lr_cv = GridSearchCV(lr, param_grid_lr, cv=5, iid=False)

#### Step 6: Fit on Training

t0 = time.time()
lr_cv.fit(X_train, y_train)
print("It takes ", time.time() - t0, " seconds for LR fitting")

#### Step 7: Predict on Test

y_pred_lr = lr_cv.predict(X_test)

#### Step 8: Scoring

##### Accuracy

print("Mean cross-validated score of the best estimator: %.3f" % lr_cv.best_score_)
print("Accuracy with LR on testing set is: %.3f" % lr_cv.score(X_test, y_test))
print("Accuracy with LR on training set is: %.3f" % lr_cv.score(X_train, y_train))

y_pred_prob = lr_cv.predict_proba(X_test)[:,1]
print("ROC AUC score is: %.3f" % roc_auc_score(y_test, y_pred_prob))

It takes  67.18677973747253  seconds for LR fitting
Mean cross-validated score of the best estimator: 0.779
Accuracy with LR on testing set is: 0.779
Accuracy with LR on training set is: 0.779
ROC AUC score is: 0.500
