In [1]:
import pandas as pd 
import numpy as np 
import matplotlib as mpl 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, learning_curve, cross_val_predict
from sklearn.metrics import precision_recall_curve

In [3]:
# Read the feature table, then separate the label column from the predictors.
poly = pd.read_csv('poly_new.csv')
y = poly.loc[:, 'target']
train = poly.drop(columns=['target'])

# Standardise every predictor column. `train` itself stays unscaled and is
# what later cells use for column names and column selection.
std_ = StandardScaler()
std_.fit(train)
train_data = std_.transform(train)

train.head()

Unnamed: 0,NumberOfTime30-59DaysPastDueNotWorse,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,1,RevolvingUtilizationOfUnsecuredLines,DebtRatio,MonthlyIncome,...,DebtRatio^3,DebtRatio^2 MonthlyIncome,DebtRatio^2 age,DebtRatio MonthlyIncome^2,DebtRatio MonthlyIncome age,DebtRatio age^2,MonthlyIncome^3,MonthlyIncome^2 age,MonthlyIncome age^2,age^3
0,2.0,13.0,0.0,6.0,0.0,2.0,1.0,0.766127,0.802982,9120.0,...,0.517747,5880.396331,29.015113,66787560.0,329543.865742,1626.038811,758550500000.0,3742848000.0,18468000.0,91125.0
1,0.0,4.0,0.0,0.0,0.0,1.0,1.0,0.957151,0.121876,2600.0,...,0.00181,38.619902,0.594152,823883.1,12675.124904,195.001922,17576000000.0,270400000.0,4160000.0,64000.0
2,1.0,2.0,1.0,0.0,0.0,0.0,1.0,0.65818,0.085113,3042.0,...,0.000617,22.03712,0.275283,787619.1,9838.765697,122.903713,28149950000.0,351643000.0,4392648.0,54872.0
3,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.23381,0.03605,3300.0,...,4.7e-05,4.288613,0.038987,392581.0,3568.918518,32.444714,35937000000.0,326700000.0,2970000.0,27000.0
4,1.0,7.0,0.0,1.0,0.0,0.0,1.0,0.907239,0.024926,6107.322992,...,1.5e-05,3.79442,0.030443,929713.3,7459.234238,59.846594,227799400000.0,1827670000.0,14663680.0,117649.0


In [None]:
# Fit a 300-tree random forest on the standardised predictors and build a
# table of columns ordered from most to least important.
rd = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
rd.fit(train_data, y)

columns = train.columns.tolist()
fea_imp = (
    pd.DataFrame({'features': columns,
                  'Feature_importance': rd.feature_importances_})
    .sort_values('Feature_importance', ascending=False)
    .reset_index(drop=True)
)
fea_imp.head(20)

In [None]:
# Keep the 20 highest-ranked columns of the unscaled frame and save them.
# to_csv is called with its defaults, so the row index is written as the
# first CSV column.
fe_data = train.loc[:, fea_imp['features'].iloc[:20].tolist()]
fe_data.to_csv('fe_data.csv')

In [None]:
# Grid-search the regularisation strength C of a newton-cg logistic regression
# on the standardised feature subset, scored by cross-validated ROC AUC.
# NOTE: this refits the shared scaler `std_` on fe_data.
fe = std_.fit_transform(fe_data)

lg = LogisticRegression(solver='newton-cg', random_state=42)
best_clf = GridSearchCV(
    lg,
    param_grid={'C': [0.001, 0.01, 0.1, 1]},
    scoring='roc_auc',
    n_jobs=-1,
)
best_clf.fit(fe, y)
print(best_clf.best_params_, best_clf.best_score_)

In [None]:
# Same C search as for the feature subset, but on the full standardised
# feature matrix and with the default solver.
lg = LogisticRegression(random_state=42)
best_clf = GridSearchCV(
    lg,
    param_grid={'C': [0.001, 0.01, 0.1, 1]},
    scoring='roc_auc',
    n_jobs=-1,
)
best_clf.fit(train_data, y)
print(best_clf.best_params_, best_clf.best_score_)

In [None]:
# Learning curve for the default logistic regression: ROC AUC on the training
# and held-out folds at 20 training-set fractions between 10% and 100%.
lg = LogisticRegression(random_state=42)
train_size, train_score, test_score = learning_curve(
    lg, train_data, y,
    train_sizes=np.linspace(0.1, 1, 20),
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)

# Each score array has one column per CV fold; average across folds.
plt.figure(dpi=300, figsize=(8, 4))
plt.plot(train_size, train_score.mean(axis=1), color='blue', label='train score')
plt.plot(train_size, test_score.mean(axis=1), color='red', label='test score')
plt.legend()
plt.title('Learning Curve')

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw precision and recall against the decision threshold.

    The last entry of ``precisions`` and ``recalls`` is dropped before
    plotting so both line up with ``thresholds``. Everything is drawn on the
    current pyplot figure.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, fmt, label in curves:
        plt.plot(thresholds, values[:-1], fmt, label=label, linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper right", fontsize=16)
    plt.title('Precision and Recall curve under model Logistic Regression')
    plt.ylim([0, 1])


# Out-of-fold decision scores for the logistic regression (3-fold CV).
y_scores = cross_val_predict(
    lg, train_data, y,
    cv=3,
    method="decision_function",
)
# If a score matrix comes back, keep its second column.
y_scores = y_scores[:, 1] if y_scores.ndim == 2 else y_scores
precisions, recalls, thresholds = precision_recall_curve(y, y_scores)

plt.figure(figsize=(8, 4), dpi=300)
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.ylim([0, 1])
plt.show()

In [None]:
def plot_precision_recall_vs_threshold(
        precisions, recalls, thresholds,
        title='Precision and Recall curve under model Random Forest'):
    """Plot precision and recall as functions of the decision threshold.

    Parameters
    ----------
    precisions, recalls : array-like
        Curves as returned by ``precision_recall_curve``. The last entry of
        each is dropped when plotting so they line up with ``thresholds``.
    thresholds : array-like
        Score thresholds used for the x axis.
    title : str, optional
        Figure title. Defaults to the Random Forest title used by this cell.

    Notes
    -----
    Draws on the current pyplot figure. This definition rebinds the name, so
    it replaces any same-named function defined earlier in the kernel.
    """
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper right", fontsize=16)
    plt.title(title)
    plt.ylim([0, 1])


rd = RandomForestClassifier(random_state=42, n_estimators=300, n_jobs=-1)

# A precision/recall-vs-threshold curve needs a continuous score per sample.
# method="predict" returns hard class labels, which leaves only two distinct
# score values and a degenerate curve. Use the out-of-fold class probabilities
# and keep the column for the second class (label 1 for a 0/1 target).
y_scores = cross_val_predict(rd, train_data, y, cv=3,
                             method="predict_proba")[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y, y_scores)

plt.figure(figsize=(8, 4), dpi=300)
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.ylim([0, 1])
plt.show()

In [4]:
# Reload the saved feature subset and take its first 20,000 rows together with
# the matching labels. Column 0 is skipped; presumably it is the row index
# that to_csv wrote when the file was created -- verify against the CSV.
fe_data = pd.read_csv('fe_data.csv', index_col=False)
fe_train = fe_data.iloc[:20000, 1:]
fe_y = y.iloc[:20000]

In [5]:
# Attach the labels with assign(), which builds a new frame instead of writing
# a column into the iloc slice of fe_data (avoids SettingWithCopyWarning).
fe_train = fe_train.assign(y=fe_y)

# Split the rows by class.
fe_train_pos = fe_train.loc[fe_train['y'] == 1]
fe_train_neg = fe_train.loc[fe_train['y'] == 0]

# Oversample the positive class to 6000 rows, drawing with replacement.
# The fixed seed makes the draw, and every model fitted on it, repeatable
# across kernel restarts.
fe_train_pos = fe_train_pos.sample(6000, replace=True, random_state=42)
print(fe_train_pos.shape)
print(fe_train_neg.shape)

(6000, 21)
(18716, 21)


In [6]:
# Stack the resampled positives on top of the negatives, peel off the label
# column, and standardise the remaining predictors.
# NOTE: `y`, `fe_train` and `std_` are all rebound here. After this cell `y`
# is the label column of the stacked frame and `fe_train` is the scaled array.
fe_train = pd.concat([fe_train_pos, fe_train_neg])
print(fe_train.shape)

y = fe_train['y']
fe_train = fe_train.drop(columns='y')
print(fe_train.shape)
print(len(y))

std_ = StandardScaler()
fe_train = std_.fit(fe_train).transform(fe_train)

(24716, 21)
(24716, 20)
24716


In [16]:
# Exhaustive search over 4 * 3 * 2 * 3 = 72 MLP configurations, each scored by
# 5-fold cross-validated ROC AUC.
# NOTE(review): the positive rows were drawn with replacement before this
# search, so identical rows can land in both the training and validation part
# of a fold. The reported AUC may be optimistic -- TODO confirm on held-out data.
clf = MLPClassifier(random_state=42)
par = {
    'alpha': [0.001, 0.01, 0.1, 1],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'adam'],
    'hidden_layer_sizes': [(50, 20), (40, 30), (100,)],
}
best_clf = GridSearchCV(clf, param_grid=par, scoring='roc_auc', cv=5, n_jobs=-1)
best_clf.fit(fe_train, y)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


GridSearchCV(cv=5, error_score=nan,
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_fun=15000,
                                     max_iter=200, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state...e,
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'activation': ['

In [19]:
# Display the estimator (with its full parameter set) selected by the search.
best_clf.best_estimator_

MLPClassifier(activation='tanh', alpha=0.01, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(40, 30), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=42, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [17]:
# Display the best mean cross-validated score found by the search.
best_clf.best_score_

0.9261560358895009

In [18]:
# Average of the mean test scores over every parameter combination tried,
# i.e. how the grid performs overall rather than only at its best point.
best_clf.cv_results_['mean_test_score'].mean()

0.8832637962734484

In [20]:
# Refit an MLP with the configuration reported by the grid search (tanh,
# alpha=0.01, lbfgs, hidden layers 40 and 30), with max_iter raised to 300.
# The recorded output still shows the lbfgs iteration-limit warning at 300.
clf = MLPClassifier(
    hidden_layer_sizes=(40, 30),
    activation='tanh',
    solver='lbfgs',
    alpha=0.01,
    max_iter=300,
    random_state=42,
)
clf.fit(fe_train, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(activation='tanh', alpha=0.01, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(40, 30), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=300,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=42, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)