## Pipeline

Поработаем с датасетом про кредиты: нам нужно решить, давать кредит человеку или нет. Попробуем отмасштабировать данные и заодно собрать все в пайплайн, чтобы было удобнее.

# 1. Импортируем нужные библиотеки

In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# 2. Загружаем наш датасет

In [49]:
loan_sanction_df = pd.read_csv('loan_sanction_train.csv')
loan_sanction_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# 3. Оценка данных

In [50]:
# выведем размерность нашего датасета:
loan_sanction_df.shape

(614, 13)

In [51]:
# выведем типы данных:
loan_sanction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [52]:
loan_sanction_df.Gender.describe()

count      601
unique       2
top       Male
freq       489
Name: Gender, dtype: object

In [53]:
# выведем основные статистические показатели:
loan_sanction_df.describe().round(3)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459,1621.246,146.412,342.0,0.842
std,6109.042,2926.248,85.587,65.12,0.365
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


# 4. Feature Engineering:

## 4.1. Удалим ненужные столбцы

In [54]:
# Удалим столбцы Loan_ID, Married:
loan_sanction_df.drop(['Loan_ID', 'Married'], axis=1, inplace=True)
loan_sanction_df

Unnamed: 0,Gender,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...
609,Female,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


## 4.2. Обработка категориальных признаков и создание новых признаков из уже существующих

In [55]:
print('Gender: ', loan_sanction_df.Gender.unique(), '\n',
      'Dependents: ', loan_sanction_df.Dependents.unique(), '\n',
      'Education: ', loan_sanction_df.Education.unique(), '\n',
      'Self_Employed: ', loan_sanction_df.Self_Employed.unique(), '\n',
      'Loan_Amount_Term: ', loan_sanction_df.Loan_Amount_Term.unique(), '\n',
      'Credit_History: ', loan_sanction_df.Credit_History.unique(), '\n',
      'Property_Area: ', loan_sanction_df.Property_Area.unique(), '\n',
      'Loan_Status: ', loan_sanction_df.Loan_Status.unique()
      )

Gender:  ['Male' 'Female' nan] 
 Dependents:  ['0' '1' '2' '3+' nan] 
 Education:  ['Graduate' 'Not Graduate'] 
 Self_Employed:  ['No' 'Yes' nan] 
 Loan_Amount_Term:  [360. 120. 240.  nan 180.  60. 300. 480.  36.  84.  12.] 
 Credit_History:  [ 1.  0. nan] 
 Property_Area:  ['Urban' 'Rural' 'Semiurban'] 
 Loan_Status:  ['Y' 'N']


In [44]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

def LabelEncoder (data, feature):
  data[feature] = label_encoder.fit_transform(data[feature])

  return(data)

In [56]:
from sklearn.preprocessing import OneHotEncoder

def One_Hot_Encoder (data, feature):
  encoded_columns = pd.get_dummies(data[feature])
  return(encoded_columns)

In [57]:
def change_type(data, feature):
  data[feature] = data[feature].astype(int)
  return(data)

In [58]:
# Поработаем с признаком 'Gender' (пол):
loan_sanction_df['Gender'] = loan_sanction_df['Gender'].fillna('Male') # заполним пустые значения 'Male'
loan_sanction_df = loan_sanction_df.join(One_Hot_Encoder(loan_sanction_df, 'Gender')) # применим кодирование OneHotEncoder
loan_sanction_df.rename(columns = {0:'female', 1: 'male'}, inplace=True)
# change_type(loan_sanction_df, 'Female') # переведём значения в числовой вид (x -> int(x))
# change_type(loan_sanction_df, 'Male')


# Поработаем с признаком 'Education' (образование):
loan_sanction_df.Education.replace(['Not Graduate', 'Graduate'], [0, 1], inplace=True) # применим кодирование через метод replace()


# Поработаем с признаком 'Dependents' (виды занятости):
def map_stay(value):
    if value == '3+':
      return 3
    else:
      return int(value)

loan_sanction_df['Dependents'] = loan_sanction_df['Dependents'].fillna('4') # заполним пустые значения '4'
loan_sanction_df['Dependents'] = loan_sanction_df['Dependents'].apply(map_stay) # заменим '3+' на '3', а все остальные значения переведём в числовой вид (x -> int(x))


# Поработаем с признаком 'Self_Employed' (самозанятость):
loan_sanction_df['Self_Employed'] = loan_sanction_df['Self_Employed'].fillna('S') # заполним пустые значения 'S'
loan_sanction_df.Self_Employed.replace(['No', 'Yes', 'S'], [0, 1, 2], inplace=True) # применим кодирование через метод replace()


# Поработаем с признаком 'LoanAmount' (величина кредита):
loan_sanction_df['LoanAmount'] = loan_sanction_df['LoanAmount'].fillna(0) # заполним пустые значения '0'

# Поработаем с признаком 'Loan_Amount_Term' (кредит_сумма_срок):
loan_sanction_df['Loan_Amount_Term'] = loan_sanction_df['Loan_Amount_Term'].fillna(0) # заполним пустые значения '0'



# Поработаем с признаком 'Credit_History' (кредитная история):
loan_sanction_df['Credit_History'] = loan_sanction_df['Credit_History'].fillna(2) # заполним пустые значения '2'


# Поработаем с признаком 'Property_Area' (площадь недвижимости):
loan_sanction_df = loan_sanction_df.join(One_Hot_Encoder(loan_sanction_df, 'Property_Area')) # применим кодирование OneHotEncoder
change_type(loan_sanction_df, 'Rural')
change_type(loan_sanction_df, 'Semiurban')
change_type(loan_sanction_df, 'Urban')


# Поработаем с признаком 'Loan_Status' (статус кредита):
loan_sanction_df.Loan_Status.replace(['N', 'Y'], [0, 1], inplace=True) # применим кодирование через метод replace()



drop_elements = ['Gender', 'Property_Area']

loan_sanction_df = loan_sanction_df.drop(drop_elements, axis = 1)

loan_sanction_df

Unnamed: 0,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Female,Male,Rural,Semiurban,Urban
0,0,1,0,5849,0.0,0.0,360.0,1.0,1,0,1,0,0,1
1,1,1,0,4583,1508.0,128.0,360.0,1.0,0,0,1,1,0,0
2,0,1,1,3000,0.0,66.0,360.0,1.0,1,0,1,0,0,1
3,0,0,0,2583,2358.0,120.0,360.0,1.0,1,0,1,0,0,1
4,0,1,0,6000,0.0,141.0,360.0,1.0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,1,0,2900,0.0,71.0,360.0,1.0,1,1,0,1,0,0
610,3,1,0,4106,0.0,40.0,180.0,1.0,1,0,1,1,0,0
611,1,1,0,8072,240.0,253.0,360.0,1.0,1,0,1,0,0,1
612,2,1,0,7583,0.0,187.0,360.0,1.0,1,0,1,0,0,1


In [59]:
loan_sanction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dependents         614 non-null    int64  
 1   Education          614 non-null    int64  
 2   Self_Employed      614 non-null    int64  
 3   ApplicantIncome    614 non-null    int64  
 4   CoapplicantIncome  614 non-null    float64
 5   LoanAmount         614 non-null    float64
 6   Loan_Amount_Term   614 non-null    float64
 7   Credit_History     614 non-null    float64
 8   Loan_Status        614 non-null    int64  
 9   Female             614 non-null    uint8  
 10  Male               614 non-null    uint8  
 11  Rural              614 non-null    int64  
 12  Semiurban          614 non-null    int64  
 13  Urban              614 non-null    int64  
dtypes: float64(4), int64(8), uint8(2)
memory usage: 58.9 KB


# 5. Построим модель Логистической регрессии


In [60]:
X = loan_sanction_df.drop('Loan_Status', axis=1)
y = loan_sanction_df.Loan_Status
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [62]:
# Проверим распределение классов:
y.value_counts()

1    422
0    192
Name: Loan_Status, dtype: int64

Соберем пайплайн: комбайн, который будет внутри себя сразу гонять и масштабирование, и модель

In [63]:
pipe = Pipeline([('scaler', StandardScaler()), ('model', LogisticRegression())])

Альтернативный вариант:

pipe = make_pipeline(StandardScaler(), LogisticRegression())

В чем между ними разница? Во-первых, второе - упрощенный синтаксис, вы не прописываете вручную ярлычки для своих шагов пайплайна. Во-вторых, получается, эти ярлычки приписываются автоматически (по правилу: название класса строчными буквами, например, у StandardScaler автоматически будет ярлычок standardscaler). Это сакральное знание пригодится, когда будем гридсерчить параметры.

In [64]:
pipe.fit(X_train, y_train)
ypred_train = pipe.predict(X_train)
ypred_test = pipe.predict(X_test)
print(classification_report(ypred_train, y_train), classification_report(ypred_test, y_test))

              precision    recall  f1-score   support

           0       0.49      0.93      0.64        74
           1       0.98      0.81      0.89       386

    accuracy                           0.83       460
   macro avg       0.74      0.87      0.76       460
weighted avg       0.90      0.83      0.85       460
               precision    recall  f1-score   support

           0       0.30      0.83      0.44        18
           1       0.97      0.74      0.84       136

    accuracy                           0.75       154
   macro avg       0.64      0.79      0.64       154
weighted avg       0.89      0.75      0.79       154



In [70]:
pipe2 = Pipeline([('scaler', StandardScaler()), ('model', SVC())])

In [71]:
pipe2.fit(X_train, y_train)
ypred_train = pipe2.predict(X_train)
ypred_test = pipe2.predict(X_test)
print(classification_report(ypred_train, y_train), classification_report(ypred_test, y_test))

              precision    recall  f1-score   support

           0       0.51      0.94      0.66        78
           1       0.98      0.82      0.89       382

    accuracy                           0.84       460
   macro avg       0.75      0.88      0.78       460
weighted avg       0.90      0.84      0.86       460
               precision    recall  f1-score   support

           0       0.26      0.93      0.41        14
           1       0.99      0.74      0.84       140

    accuracy                           0.75       154
   macro avg       0.63      0.83      0.63       154
weighted avg       0.92      0.75      0.80       154

