## Multi-layer Perceptron Classifier

In [3]:
import sys
import numpy as np
import pandas as pd
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder,PowerTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
import itertools

In [4]:
# Report the interpreter and core library versions, one "label value" line each.
print(*(
    f'{label} {value}'
    for label, value in (
        ('Python       :', sys.version.split('\n')[0]),
        ('Numpy        :', np.__version__),
        ('Pandas       :', pd.__version__),
        ('Sklearn      :', sklearn.__version__),
    )
), sep='\n')

Python       : 3.7.10 | packaged by conda-forge | (default, Feb 19 2021, 16:07:37) 
Numpy        : 1.19.5
Pandas       : 1.2.3
Sklearn      : 0.24.1


In [5]:
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(0)

In [6]:
# Write the package versions installed in the current environment to requirements.txt.
!pip freeze > requirements.txt

In [7]:
class AutoMLPClassifier:
    """Randomized hyper-parameter search over a preprocessing + MLP pipeline.

    The pipeline imputes and scales numerical columns, imputes and one-hot
    encodes categorical columns, selects the k best features (ANOVA F-test)
    and trains an ``MLPClassifier``.  ``RandomizedSearchCV`` samples
    ``n_iter`` configurations and keeps the best one.

    Parameters
    ----------
    scoring_function : str or callable, default 'accuracy'
        Passed as ``scoring`` to ``RandomizedSearchCV``.
    n_iter : int, default 50
        Number of sampled parameter settings.
    cv : int or CV splitter, default 5
        Cross-validation strategy passed to ``RandomizedSearchCV``.
    random_state : int or None, default 0
        Seed used both for the parameter sampling and for the MLP itself.
    n_jobs : int, default -1
        Number of parallel jobs for the search (-1 uses all processors).
    verbose : int, default 3
        Verbosity level of the search.

    Attributes (set by ``fit``)
    ---------------------------
    best_estimator_ : fitted pipeline with the best parameters.
    best_pipeline : dict of the best parameters (``search.best_params_``).
    best_score : best mean cross-validated score (``search.best_score_``).
    cv_results : full search log (``search.cv_results_``).
    """

    # Column dtypes routed to the categorical branch; everything else is
    # treated as numerical.
    _CATEGORICAL_DTYPES = ('object', 'category', 'bool')

    def __init__(self,
                 scoring_function = 'accuracy',
                 n_iter = 50,
                 cv = 5,
                 random_state = 0,
                 n_jobs = -1,
                 verbose = 3):
        self.scoring_function = scoring_function
        self.n_iter = n_iter
        self.cv = cv
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.verbose = verbose

    @classmethod
    def _categorical_levels(cls, X):
        """Return, per categorical column of ``X``, the list of its distinct non-null values."""
        cat_subset = X.select_dtypes(include = list(cls._CATEGORICAL_DTYPES))
        return [
            list(cat_subset.iloc[:, position].dropna().unique())
            for position in range(cat_subset.shape[1])
        ]

    def _require_fitted(self):
        """Raise a descriptive AttributeError when ``fit`` has not been called yet."""
        if not hasattr(self, 'best_estimator_'):
            raise AttributeError(
                "This AutoMLPClassifier instance is not fitted yet; "
                "call fit(X, y) before predicting."
            )

    def fit(self, X, y):
        """Search for the best pipeline configuration on ``(X, y)``.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature table; column dtypes decide the numerical/categorical split.
        y : array-like
            Target labels, forwarded unchanged to the search.

        Returns
        -------
        self : AutoMLPClassifier
            The fitted instance, so calls can be chained.
        """
        # The category levels are taken from the full training frame so that
        # every CV fold encodes to the same set of columns.
        categorical_values = self._categorical_levels(X)

        num_pipeline = Pipeline([
            ('cleaner', SimpleImputer()),
            ('scaler', StandardScaler())
            ])

        cat_pipeline = Pipeline([
            ('cleaner', SimpleImputer(strategy = 'most_frequent')),
            # handle_unknown='ignore': levels that were not present in the
            # training frame are encoded as all zeros at predict time instead
            # of raising.
            ('encoder', OneHotEncoder(sparse = False,
                                      categories = categorical_values,
                                      handle_unknown = 'ignore'))
            ])

        categorical_dtypes = list(self._CATEGORICAL_DTYPES)
        preprocessor = ColumnTransformer([
            ('numerical', num_pipeline, make_column_selector(dtype_exclude = categorical_dtypes)),
            ('categorical', cat_pipeline, make_column_selector(dtype_include = categorical_dtypes))
            ])

        model_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('feature_selector', SelectKBest(f_classif, k = 'all')),
            ('estimator', MLPClassifier())
            ])

        # Number of columns after preprocessing; bounds the candidate values of k.
        total_features = preprocessor.fit_transform(X).shape[1]

        optimization_grid = [{
            'preprocessor__numerical__scaler': [RobustScaler(), StandardScaler(), MinMaxScaler()],
            'preprocessor__numerical__cleaner__strategy': ['mean', 'median'],
            # Plain Python ints: 1, 6, 11, ... below total_features, plus 'all'.
            'feature_selector__k': list(range(1, total_features, 5)) + ['all'],
            'estimator': [MLPClassifier(random_state = self.random_state)],
            # Every 3-layer architecture built from the listed layer widths.
            'estimator__hidden_layer_sizes': list(itertools.product((1, 5, 10, 20, 30, 40, 50, 100), repeat = 3)),
            'estimator__max_iter': np.arange(1000, 10000000, 10000),
            'estimator__activation': ['identity', 'logistic', 'tanh', 'relu'],
            'estimator__alpha': np.linspace(0.0000001, 0.01, 100)
            }]

        search = RandomizedSearchCV(
            model_pipeline,
            optimization_grid,
            n_iter = self.n_iter,
            scoring = self.scoring_function,
            n_jobs = self.n_jobs,
            random_state = self.random_state,
            verbose = self.verbose,
            cv = self.cv
            )

        search.fit(X, y)

        self.best_estimator_ = search.best_estimator_
        self.best_pipeline = search.best_params_
        self.best_score = search.best_score_
        self.cv_results = search.cv_results_
        return self

    def predict(self, X, y = None):
        """Predict class labels for ``X`` with the best pipeline; ``y`` is ignored."""
        self._require_fitted()
        return self.best_estimator_.predict(X)

    def predict_proba(self, X, y = None):
        """Predict class probabilities for ``X`` with the best pipeline; ``y`` is ignored."""
        self._require_fitted()
        return self.best_estimator_.predict_proba(X)

In [8]:
DATA_DIR = '../input/findata-creditscoring/'

# Read the train / test tables and the sample submission from DATA_DIR.
train = pd.read_csv(DATA_DIR + 'credit_train.csv')
test = pd.read_csv(DATA_DIR + 'credit_test.csv')
sample_submission = pd.read_csv(DATA_DIR + 'credit__sample.csv')

# Remember the IDs of two groups of test loans while the 'Loan ID' column still exists.
# NOTE(review): the variable names suggest these rows are presumed "Fully Paid" (1)
# and "Charged Off" (0) respectively — confirm against where they are used.
loan_status_1 = test.loc[test['Current Loan Amount'] == 99999999, 'Loan ID']
loan_status_0 = test.loc[test['Credit Score'] > 800, 'Loan ID']

# Training frame: drop duplicate rows, encode the target as 1 / 0, remove identifier columns.
train = (
    train
    .drop_duplicates(keep='first')
    .assign(**{
        'Loan Status': lambda frame: frame['Loan Status'].map({"Fully Paid": 1, "Charged Off": 0})
    })
    .drop(columns=['Loan ID', 'Customer ID'])
)

# Test frame: only the identifier columns are removed.
test = test.drop(columns=['Loan ID', 'Customer ID'])

In [9]:
# Separate the target vector from the feature table.
y = train['Loan Status'].to_numpy()
X = train.drop(columns='Loan Status')

In [10]:
# Run the randomized search with the class defaults (accuracy scoring, 50 sampled candidates).
model = AutoMLPClassifier()
model.fit(X, y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [11]:
# Parameters of the best candidate, as stored by fit() from the search's best_params_.
model.best_pipeline

{'preprocessor__numerical__scaler': RobustScaler(),
 'preprocessor__numerical__cleaner__strategy': 'median',
 'feature_selector__k': 31,
 'estimator__max_iter': 5521000,
 'estimator__hidden_layer_sizes': (40, 30, 5),
 'estimator__alpha': 0.005353581818181819,
 'estimator__activation': 'logistic',
 'estimator': MLPClassifier(activation='logistic', alpha=0.005353581818181819,
               hidden_layer_sizes=(40, 30, 5), max_iter=5521000, random_state=0)}

In [12]:
# Score of the best candidate, as stored by fit() from the search's best_score_.
model.best_score

0.7989976053906555

### Выводы

1. На тренировочных данных обучен многослойный перцептрон.
2. Построен пайплайн, включающий заполнение пропущенных значений признаков, one-hot-кодирование категориальных признаков, масштабирование числовых признаков, отбор значимых признаков и непосредственно обучение модели.
3. Подбор гиперпараметров осуществлён с помощью случайного поиска (RandomizedSearchCV) с кросс-валидацией на 5 фолдах.
4. Значение метрики (accuracy ≈ 0.799 на кросс-валидации) не превзошло результат, достигнутый ранее с помощью «случайного леса».