In [11]:
'''import the required packages and read the file.'''

import pandas as pd
import numpy as np


print('reading file')

data = pd.read_csv('input_file_1_subs.csv', sep = ',', index_col=0)

print('file shape', data.shape)

'''parse column to date format'''

print('date encoding')

data['issue_d'] = pd.to_datetime(data['issue_d'])

'''check for and remove datapoints with null values.'''

print(data['issue_d'].isnull().any(), data['purpose'].isnull().any())

print('remove null datapoints to see if it helps...')

data = data.loc[data['purpose'].isnull() == False]

'''eliminate purpose categories with low count.'''

print('eliminating small count categories')

threshold = 190

counts = data['purpose'].value_counts()

keep_list = counts[counts > threshold].index

data = data[data['purpose'].isin(keep_list)]

'''replace the existing labels so that they can be called easily from pandas and TensorFlow'''

print('replacing labels')

to_replace = {
    'Debt consolidation': 'debt_consolidation',
    'Home improvement': 'home_improvement',
    'Credit card refinancing': 'credit_card',
    'Other': 'other',
    'Vacation': 'vacation',
    'Medical expenses': 'medical',
    'Car financing': 'car',
    'Major purchase': 'major_purchase',
    'Moving and relocation': 'moving',
    'Home buying': 'house'
}

data['purpose'] = data['purpose'].replace(to_replace)

print(data['purpose'].value_counts())

'''Create one-hot encoded dummy columns for categorical variables.'''

print('hot encoding')

data = pd.get_dummies(data, columns=['purpose'], drop_first=False)

print('data columns AFTER hot encoding      ', data.columns)

'''split training and test data by date quantile.'''

data_train = data.loc[data['issue_d'] < data['issue_d'].quantile(0.9)]
data_test = data.loc[data['issue_d'] >= data['issue_d'].quantile(0.9)]

print('Number of loans in the partition:   ', data_train.shape[0] + data_test.shape[0])
print('Number of loans in the full dataset:', data.shape[0])

'''Drop the date column as not needed for the model.'''

data_train.drop('issue_d', axis=1, inplace=True)
data_test.drop('issue_d', axis=1, inplace=True)

'''Split features and labels'''

y_train = data_train['rejected']
y_test = data_test['rejected']
X_train = data_train.drop('rejected', axis=1)
X_test = data_test.drop('rejected', axis=1)

# Check if y_train contains more than one unique class label
if len(np.unique(y_train)) > 1:
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler, PolynomialFeatures
    from sklearn.impute import SimpleImputer
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import SGDClassifier

    '''Build a pipeline for preprocessing and training'''
    pipeline_sgdlogreg = Pipeline([
        ('imputer',  SimpleImputer(copy=False)), # Mean imputation by default
        ('scaler', StandardScaler(copy=False)),
        ('model', SGDClassifier(
            class_weight='balanced',
            loss='log',
            max_iter=10,
            tol = 1e-3,
            random_state=1,
            n_jobs=10,
            warm_start=True
            )
        )
    ])

    param_grid_sgdlogreg = {
        'model__alpha': [10**-3, 10**-2, 10**1],
        'model__penalty': ['l1', 'l2']
    }

    '''Set up a grid search.'''
    grid_sgdlogreg = GridSearchCV(
        estimator=pipeline_sgdlogreg,
        param_grid=param_grid_sgdlogreg,
        scoring='roc_auc',
        pre_dispatch=3,
        n_jobs=5,
        cv=5,
        verbose=5,
        return_train_score=False
    )

    '''Fit the model.'''
    print('fitting')
    grid_sgdlogreg.fit(X_train, y_train)

    '''Print model parameters, best parameters and best score.'''
    print('parameters       ', grid_sgdlogreg.get_params())
    print(grid_sgdlogreg.best_params_, grid_sgdlogreg.best_score_)

    from sklearn.metrics import roc_auc_score, recall_score

    '''Make predictions on test dataset.'''
    y_score = grid_sgdlogreg.predict_proba(X_test)[:,1]
    y_score_flag = [int(round(i)) for i in y_score]

    print('LOOK FOR DISCREPANCIES HERE...')
    print(roc_auc_score(y_test, y_score), recall_score(y_test, y_score_flag, pos_label=1), recall_score(y_test, y_score_flag, pos_label=0))

    y_score_flag = grid_sgdlogreg.predict(X_test)
else:
    print("y_train contains only one unique class label. Unable to fit the model.")

reading file
file shape (55515, 9)
date encoding
False False
remove null datapoints to see if it helps...
eliminating small count categories
replacing labels
debt_consolidation    26255
other                  9130
credit_card            6020
home_improvement       3159
major_purchase         2145
car                    2126
moving                 1594
medical                1443
small_business         1002
vacation                883
house                   869
Name: purpose, dtype: int64
hot encoding
data columns AFTER hot encoding       Index(['dti', 'emp_length', 'issue_d', 'loan_amnt', 'rejected', 'Unnamed: 7',
       'Unnamed: 8', 'Unnamed: 9', 'purpose_car', 'purpose_credit_card',
       'purpose_debt_consolidation', 'purpose_home_improvement',
       'purpose_house', 'purpose_major_purchase', 'purpose_medical',
       'purpose_moving', 'purpose_other', 'purpose_small_business',
       'purpose_vacation'],
      dtype='object')
Number of loans in the partition:    54626
Number of

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train.drop('issue_d', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test.drop('issue_d', axis=1, inplace=True)


parameters        {'cv': 5, 'error_score': nan, 'estimator__memory': None, 'estimator__steps': [('imputer', SimpleImputer(copy=False)), ('scaler', StandardScaler(copy=False)), ('model', SGDClassifier(class_weight='balanced', loss='log', max_iter=10, n_jobs=10,
              random_state=1, warm_start=True))], 'estimator__verbose': False, 'estimator__imputer': SimpleImputer(copy=False), 'estimator__scaler': StandardScaler(copy=False), 'estimator__model': SGDClassifier(class_weight='balanced', loss='log', max_iter=10, n_jobs=10,
              random_state=1, warm_start=True), 'estimator__imputer__add_indicator': False, 'estimator__imputer__copy': False, 'estimator__imputer__fill_value': None, 'estimator__imputer__keep_empty_features': False, 'estimator__imputer__missing_values': nan, 'estimator__imputer__strategy': 'mean', 'estimator__imputer__verbose': 'deprecated', 'estimator__scaler__copy': False, 'estimator__scaler__with_mean': True, 'estimator__scaler__with_std': True, 'estimator__m