In [0]:
# Mount the user's Google Drive inside the Colab VM at /content/drive.
# The call is interactive: it asks for an authorization code before mounting.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Install the category_encoders package (imported further down as `ce`).
# NOTE(review): no version is pinned, so the installed release can differ between runs.
!pip install category_encoders

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator

from sklearn.externals import joblib

Cargamos los datos que creamos en el paso anterior

In [0]:
# Load the loans dataset produced in the previous step.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning and gives
# each column a single, deterministic dtype.
Loans = pd.read_csv('/content/drive/My Drive/TFM/data/loan.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
# Hold out 30% of the rows for testing. Stratifying on loan_status keeps the
# proportion of each target class the same in both partitions.
Loans_train, Loans_test = train_test_split(
    Loans,
    test_size=.3,
    random_state=42,
    stratify=Loans['loan_status'],
)

In [0]:
# Report the size of the full dataset and of each partition.
for template, frame in (
    ('Loans.shape = {}', Loans),
    ('Loans_train.shape = {}', Loans_train),
    ('Loans_test.shape = {}', Loans_test),
):
    print(template.format(frame.shape))

Loans.shape = (887379, 74)
Loans_train.shape = (621165, 74)
Loans_test.shape = (266214, 74)


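Hacemos el preprocesado creando una clase basada en `BaseEstimator` de sklearn, con la interfaz habitual `fit` / `transform`. Lo hacemos así por si podemos añadirla, en un momento futuro, al pipeline que usaremos en el siguiente script para hacer un OneHotEncoder y usar un método de predicción. Ojo: `transform` devuelve la tupla `(features, target)`, así que para integrarla en un `Pipeline` de sklearn habría que adaptarla antes.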

In [0]:
class LoansTransformer(BaseEstimator):
    """Clean the raw loans DataFrame and encode it as numeric features plus a binary target.

    ``fit`` records the input columns and builds the lookup tables used to
    encode ``sub_grade`` and the four month columns. ``transform`` drops the
    unused columns and incomplete rows, encodes the remaining columns and
    returns ``(features, target)``.

    The columns listed in ``categorical_features`` are left untouched by
    ``transform``; an ordinal encoder for them is fitted and stored in
    ``self.le`` but is not applied here.
    """

    def __init__(self):
        # Columns removed by transform() before any encoding takes place.
        self.del_columns = ['id', 'member_id', 'funded_amnt', 'funded_amnt_inv',
                            'installment', 'grade', 'emp_title', 'pymnt_plan', 'url',
                            'desc', 'zip_code', 'mths_since_last_delinq', 'mths_since_last_record',
                            'out_prncp_inv', 'total_pymnt_inv', 'next_pymnt_d',
                            'mths_since_last_major_derog', 'policy_code', 'annual_inc_joint',
                            'dti_joint', 'verification_status_joint', 'tot_coll_amt',
                            'tot_cur_bal', 'open_acc_6m', 'open_il_6m', 'open_il_12m',
                            'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util',
                            'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util',
                            'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
                            'title', 'application_type']

        # Nominal columns handed to the category encoder fitted in fit().
        self.categorical_features = ['home_ownership', 'verification_status', 'purpose', 'addr_state']

    @staticmethod
    def _month_index(first_month, last_month):
        """Return {month start Timestamp: 0-based position} for every month in the closed range."""
        months = pd.date_range(pd.to_datetime(first_month), pd.to_datetime(last_month), freq='MS')
        return dict(zip(months, np.arange(0, len(months))))

    @staticmethod
    def _lookup(values, mapping, column):
        """Replace every element of ``values`` by ``mapping[element]``.

        Raises:
            KeyError: if any element is not a key of ``mapping``; the message
                names the column and the offending values.
        """
        encoded = values.map(mapping)
        not_found = encoded.isna()
        if not_found.any():
            unknown = sorted(set(str(value) for value in values[not_found]))
            raise KeyError('Valores de {} no vistos en el fit: {}'.format(column, unknown))
        return encoded.astype('int64')

    def fit(self, X, y=None):
        """Record the input columns and build the encoding tables.

        Args:
            X: DataFrame with the raw loan columns.
            y: Ignored; accepted only for compatibility with the sklearn
                ``fit(X, y)`` calling convention.

        Returns:
            self
        """
        # X is assumed to be a DataFrame.
        self._columns = X.columns.values

        # sub_grade: 'A1' < 'A2' < ... < 'G5' mapped to 0..34.
        grades = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
        sub_grades = ['1', '2', '3', '4', '5']
        cat_sub_grade = [grade + level for grade in grades for level in sub_grades]
        self.dict_grades = dict(zip(cat_sub_grade, np.arange(0, len(cat_sub_grade))))

        # Month columns: each calendar is a fixed, hard-coded date range, so a
        # month outside the range makes transform() raise KeyError.
        # Rethink these ranges before taking this to production.
        self.dict_issue = self._month_index('2007-06-01 00:00:00', '2015-12-01 00:00:00')
        self.dict_earliest = self._month_index('1944-01-01 00:00:00', '2012-11-01 00:00:00')
        self.dict_last = self._month_index('2007-12-01 00:00:00', '2016-01-01 00:00:00')
        self.dict_credit = self._month_index('2007-05-01 00:00:00', '2016-01-01 00:00:00')

        # An ordinal encoder is fitted (instead of a one-hot encoder) so that
        # its mapping can later be turned into the dictionary needed by Lime.
        # It is stored on the instance but not applied by transform().
        self.le = ce.OrdinalEncoder(cols=self.categorical_features, return_df=True, handle_unknown="ignore")
        self.le.fit(X)

        return self

    def transform(self, X):
        """Clean and encode ``X``.

        Args:
            X: DataFrame with exactly the columns seen by ``fit``.

        Returns:
            Tuple ``(features, target)``: ``features`` is the encoded
            DataFrame without ``loan_status``; ``target`` is a Series that is
            0 for 'Current' / 'Fully Paid' loans and 1 otherwise.

        Raises:
            ValueError: if the columns of ``X`` differ from those seen in fit.
            KeyError: if ``sub_grade`` or a month column holds a value that is
                missing from the tables built in fit.
        """
        # The frame must have the same columns as the one used in fit.
        if set(self._columns) != set(X.columns):
            raise ValueError('Las columnas de este DataFrame son distintas de las que se hicieron en el fit')
        elif len(self._columns) != len(X.columns):
            raise ValueError('El número de columnas de este DataFrame es distinto del número con el que se hizo el fit')

        # Drop unused columns and incomplete rows, then rows with dti >= 300.
        # The explicit copy lets the assignments below write to an independent
        # frame rather than to a slice of another one.
        df = X.drop(self.del_columns, axis=1).dropna()
        df = df[df['dti'] < 300].copy()

        # Every encoding below reads from the cleaned frame, so rows that were
        # dropped above can no longer make a lookup fail.

        # term: 0 for ' 36 months', 1 for anything else.
        df['term'] = (df['term'] != ' 36 months').astype('int64')

        # sub_grade: the categories are ordered, so encode them as integers.
        df['sub_grade'] = self._lookup(df['sub_grade'], self.dict_grades, 'sub_grade')

        # emp_length: '< 1 year' counts as 0; otherwise keep the number of years.
        emp_length = df['emp_length'].replace('< 1 year', '0 years')
        df['emp_length'] = emp_length.str.extract(r'(\d+)', expand=False).astype('int64')

        # initial_list_status: 0 for 'w', 1 for anything else.
        df['initial_list_status'] = (df['initial_list_status'] != 'w').astype('int64')

        # Month columns: parse 'Mon-YYYY' and replace by the month's position
        # in the corresponding calendar built in fit.
        month_tables = (
            ('issue_d', self.dict_issue),
            ('earliest_cr_line', self.dict_earliest),
            ('last_pymnt_d', self.dict_last),
            ('last_credit_pull_d', self.dict_credit),
        )
        for column, table in month_tables:
            months = pd.to_datetime(df[column], format='%b-%Y')
            df[column] = self._lookup(months, table, column)

        # Split into X and y. 'Current' and 'Fully Paid' form class 0; every
        # other status is treated as default (class 1).
        target = (~df['loan_status'].isin(['Current', 'Fully Paid'])).astype('int64')
        features = df.drop('loan_status', axis=1)
        return features, target

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return ``transform(X)``; ``y`` is ignored."""
        return self.fit(X).transform(X)

In [0]:
# Fit the transformer on the training partition only and preprocess it,
# then preview the first rows of the encoded features.
lt = LoansTransformer()
(x_train, y_train) = lt.fit_transform(Loans_train)
x_train.head(n=5)

Unnamed: 0,loan_amnt,term,int_rate,sub_grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,purpose,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,total_pymnt,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,acc_now_delinq
504302,16200.0,1,18.49,21,3,RENT,45000.0,Source Verified,102,debt_consolidation,GA,23.44,0.0,752,0.0,25.0,0.0,19531.0,57.6,48.0,1,16033.91,465.63,166.09,299.54,0.0,0.0,0.0,97,482.27,104,0.0,0.0
110291,10000.0,0,19.52,18,10,RENT,40000.0,Not Verified,73,credit_card,FL,12.09,0.0,719,1.0,11.0,0.0,7169.0,62.9,14.0,1,0.0,8861.98,5625.67,3236.31,0.0,0.0,0.0,95,370.0,104,0.0,0.0
717712,8300.0,0,16.99,17,10,RENT,32500.0,Not Verified,96,debt_consolidation,NJ,26.99,0.0,697,1.0,11.0,0.0,14660.0,57.0,19.0,1,6997.14,2063.33,1302.86,760.47,0.0,0.0,0.0,97,295.88,104,0.0,0.0
764505,12000.0,1,9.99,7,7,OWN,70000.0,Source Verified,95,debt_consolidation,CA,27.14,0.0,370,0.0,11.0,1.0,6026.0,42.7,30.0,0,10723.18,2032.62,1276.82,755.8,0.0,0.0,0.0,97,254.91,104,0.0,0.0
595541,4000.0,0,9.99,7,10,MORTGAGE,64000.0,Verified,100,major_purchase,NY,27.79,2.0,489,0.0,19.0,0.0,50000.0,27.2,36.0,0,3710.35,382.71,289.65,93.06,0.0,0.0,0.0,97,129.05,104,0.0,0.0


In [0]:
# Preprocess the test partition with the transformer fitted on the training
# data, then preview the first rows of the encoded features.
(x_test, y_test) = lt.transform(Loans_test)
x_test.head(n=5)

Unnamed: 0,loan_amnt,term,int_rate,sub_grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,purpose,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,total_pymnt,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,collections_12_mths_ex_med,acc_now_delinq
663191,12000.0,1,18.55,21,9,MORTGAGE,103000.0,Not Verified,98,debt_consolidation,TX,23.23,0.0,610,1.0,12.0,0.0,12790.0,57.4,19.0,1,11366.57,1529.28,633.43,895.85,0.0,0.0,0.0,97,308.33,104,0.0,0.0
431676,24000.0,0,8.9,4,9,OWN,110000.0,Source Verified,81,credit_card,FL,8.42,0.0,671,1.0,18.0,0.0,25346.0,22.9,33.0,0,0.0,24351.67,24000.0,351.67,0.0,0.0,0.0,77,23589.59,95,0.0,0.0
792651,3125.0,0,17.57,18,8,OWN,40000.0,Not Verified,94,credit_card,FL,22.69,0.0,736,0.0,6.0,0.0,659.0,50.7,23.0,1,2489.7,1007.74,635.3,372.44,0.0,0.0,0.0,97,112.31,104,0.0,0.0
405862,16400.0,1,21.18,22,6,MORTGAGE,56000.0,Not Verified,82,debt_consolidation,FL,15.84,0.0,710,1.0,10.0,0.0,16171.0,69.1,32.0,1,12478.71,9352.14,3921.29,5430.85,0.0,0.0,0.0,97,445.34,104,0.0,0.0
865700,10000.0,0,14.99,14,2,OWN,82400.0,Verified,91,other,PA,4.92,0.0,702,0.0,4.0,0.0,1442.0,7.5,9.0,1,7406.2,3804.38,2593.8,1210.58,0.0,0.0,0.0,97,346.61,104,0.0,0.0


In [0]:
# Report how many rows and columns remain after preprocessing.
for template, frame in (
    ('x_train.shape = {}', x_train),
    ('x_test.shape = {}', x_test),
):
    print(template.format(frame.shape))

x_train.shape = (577834, 33)
x_test.shape = (247647, 33)


In [0]:
# Persist the fitted transformer and the four train/test objects with joblib.
# The file names are relative, so the files are written to the kernel's
# current working directory.
# NOTE(review): presumably these files are loaded by the next script — confirm
# that it reads them from this same directory.
joblib.dump(lt, 'lt_transformer.pkl')
joblib.dump(x_train, 'x_train.pkl')
joblib.dump(x_test, 'x_test.pkl')
joblib.dump(y_train, 'y_train.pkl')
# Last expression of the cell: its return value is what the notebook displays.
joblib.dump(y_test, 'y_test.pkl')

['y_test.pkl']