In [182]:
import joblib
import pandas as pd
from datetime import timedelta

def test_load_dfs(df_1,df_2):
    template_1 = joblib.load('tests/loans_template.pkl').dtypes
    template_3 = joblib.load('tests/recharges_template.pkl').dtypes
    assert (df_1.dtypes == template_1).all()
    assert (df_2.dtypes == template_3).all()

    return df_1,df_2


def load_dfs(paths=[
    'Brazil_DS_loans_2019-11-10_2019-12-05.csv',
    'Brazil_DS_prev_loans.csv',
    'Brazil_DS_recharges_2019-08-10_2019-12-05.csv']):
    
    loans_actual = pd.read_csv(paths[0],date_parser=lambda x: pd.to_datetime(x, format='%Y-%m-%d'),parse_dates=['created_at','paid_at'])
    loans_prev = pd.read_csv(paths[1],date_parser=lambda x: pd.to_datetime(x, format='%Y-%m-%d'),parse_dates=['created_at','paid_at'])
    loans = pd.concat([loans_actual,loans_prev],ignore_index=True)
    recharges = pd.read_csv(paths[2],date_parser=lambda x: pd.to_datetime(x, format='%Y-%m-%d'),parse_dates=['recharge_timestamp'])
    #return  test_load_dfs(loans, recharges)
    return  loans, recharges


loans, recharges = load_dfs()

# Falta fazer

* Otimização dos parametros com k-fold
* Criar os cenários para escolher qual melhor modelo
* Criar os mocks de teste do feature eng
* Relátorio final
* Se der tempo fazer um CLI básico

# Exploratory data analysis

É interessante observar que os emprestimos não pagos não varia muito da mediana populacional, 5, entretanto utilizar essa informação pode gerar bias no modelo preditivo.


LEMBRETES:
    Criar intervalos de loans permitidos para cada usuário
    Fazes dois mocks para as funções de feature eng

Notas:
    Distribuição acumulado inad. 
        target_sum
        0    7044
        1    2463
        2       5
        4       1

#t_loans


In [22]:
#print('General Loans stats \n',t_loans['amount'].describe(),'\n',f"Median : {t_loans['amount'].median()}")
#print('Paid stats \n',t_loans.query('target == 0').groupby('uuid').sum('amount').sort_values('amount',ascending=False)['amount'].describe())
#print('Not Paid stats \n',t_loans.query('target == 1').groupby('uuid').sum('amount').sort_values('amount',ascending=False)['amount'].describe())

In [179]:
loans['created_at'].max()

Timestamp('2019-12-04 23:54:57.280521')

In [181]:
loans.columns

Index(['uuid', 'loan_id', 'amount', 'created_at', 'paid_at',
       'paid_days_interval', 'target'],
      dtype='object')

In [218]:
loans['created_at'].max()

Timestamp('2019-12-04 23:54:57.280521')

In [239]:
from dataclasses import dataclass
from typing import List
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
from datetime import datetime


@dataclass
class FeatureEngineering:
    train_loans: pd.DataFrame
    test_loans: pd.DataFrame
    train_recharges: pd.DataFrame
    days_to_default:int = 60
    

    def __post_init__(self):

        self.train_loans = self.__feature_eng_loans(self.train_loans.copy())
        self.test_loans = self.__feature_eng_loans(self.test_loans.copy())
        self.train_recharges = self.__feature_eng_recharges(self.train_recharges.copy())
        

    def __feature_eng_loans(self,df_l: pd.DataFrame) -> pd.DataFrame:
            '''
            Feature engineering for loans dataframe using historical data
            already_default: 0 if not defaulted, 1 if defaulted
            sum_amoun: sum amount of all previous loans paid
            count_loans: number of previous loans paid

            :param df_l: loans dataframe
            :return: feature engineered loans dataframe
            '''
            #calc days interval between dates
            df_l['paid_days_interval'] = (df_l['paid_at'] - df_l['created_at'])
            df_l['target'] = df_l['paid_days_interval'].apply(lambda x: 1 if x.days > self.days_to_default else 0)

            already_default = df_l.groupby('uuid').sum()['target'].apply(lambda x: 1 if x > 0 else 0)
            sum_amount = df_l.query('target == 0').groupby('uuid').sum()['amount']
            count_loans = df_l.query('target == 0').groupby('uuid').count()['amount']

            out_df = pd.DataFrame(df_l.groupby('uuid').count().index)
            out_df = out_df.join(already_default,on='uuid',how='left')
            out_df = out_df.join(sum_amount,on='uuid',how='left',rsuffix='_sum')
            out_df = out_df.join(count_loans,on='uuid',how='left',rsuffix='_count')
            out_df.columns = ['uuid','target','sum_amount','count_loans']
            return out_df
        
    def __feature_eng_recharges(self,df_r: pd.DataFrame) -> pd.DataFrame:
        '''
        Feature engineering for recharges dataframe using historical data
        freq_recharges_weekly: mean frequency recharges per week
        recharges_weekly: median frequency recharges per week
        delta_after_recharge: difference between balance after recharge and recharge value

        :param df_r: recharges dataframe
        :return: feature engineered recharges dataframe
        '''
        df_r['delta_after_recharge'] = df_r['balance_after_recharge'] - df_r['recharge_value']
        df_r['back_recharge_timestamp'] = pd.to_datetime(df_r['recharge_timestamp']) - pd.to_timedelta(7, unit='d')
        max_date = df_r['back_recharge_timestamp'].max()
        min_date = df_r['back_recharge_timestamp'].min()
        count_weeks = (max_date - min_date).days // 7
        weekly_df = df_r \
            .groupby(['uuid', pd.Grouper(key='recharge_timestamp', freq='W-MON')]) \
            .count() \
            .groupby('uuid')
        
        freq_recharges_weekly = weekly_df.sum()['recharge_value']/count_weeks
        recharges_weekly = weekly_df.median()['recharge_value']/count_weeks
        delta_after_recharges = df_r.groupby('uuid').median()['delta_after_recharge']
        
        out_df = pd.DataFrame(df_r.groupby('uuid').count().index)
        out_df = out_df.join(freq_recharges_weekly,on='uuid',how='left',rsuffix='_median')
        out_df = out_df.join(recharges_weekly,on='uuid',how='left',rsuffix='_median')
        out_df = out_df.join(delta_after_recharges,on='uuid',how='left')
        out_df.columns = ['uuid','freq_recharges_weekly','recharges_weekly','delta_after_recharges']
        return out_df

    @staticmethod
    def remove_perfect_correlation(df: pd.DataFrame,max_cor = 0.95) -> pd.DataFrame:
        ''''
        Remove columns with perfect correlation with other columns
        :param df: dataframe
        :return: dataframe without columns with perfect correlation
        
        corr_matrix = df.corr()
        corr_matrix.loc[:, :] = np.tril(corr_matrix.values, k=-1)
        cols_to_drop = corr_matrix.loc[:,(corr_matrix.abs() > max_cor).any()]
        print((corr_matrix.abs() > max_cor).any())
        df = df.drop(cols_to_drop,axis=1)
        TODO
        '''
        df = df.drop('count_loans',axis=1)
        return df

@dataclass
class ForecastDefault:
    '''
    docstring
    '''
    loans_hist: pd.DataFrame
    recharges_hist: pd.DataFrame
    estimators_list: List
    inicial_date: str = '2000-01-01'
    limit_date: str = '2019-12-05'
    days_to_default: int = 60
    fill_na:bool = True

    def __post_init__(self):
        date_format = '%Y-%m-%d'
        self.inicial_date = datetime.strptime(self.inicial_date,date_format)
        self.limit_date = datetime.strptime(self.limit_date,date_format)
        self.loans_hist['paid_at'] = self.loans_hist['paid_at'].fillna(self.limit_date + timedelta(days=1))
        last_date = self.limit_date - timedelta(days=self.days_to_default)
        

        self.loans = self.loans_hist[self.loans_hist['created_at'] > self.inicial_date].copy()
        self.recharges = self.recharges_hist[self.recharges_hist['recharge_timestamp'] > self.inicial_date].copy()
        

        train_loans = self.loans[self.loans['created_at'] < last_date]
        test_loans = self.loans[((self.loans['created_at'] > last_date) & (self.loans['created_at'] < self.limit_date))]
        train_recharges = self.recharges[self.recharges['recharge_timestamp'] < last_date]
        
        print('last_date',last_date)
        print('limit_date',test_loans['created_at'].max())

        
        print('train_loans: ',train_loans.shape)
        print('test_loans: ',test_loans.shape)


        fe = FeatureEngineering(train_loans,test_loans,train_recharges,days_to_default=self.days_to_default)

        train_df = fe.train_loans.merge(fe.train_recharges, on='uuid', how='left')
        train_df = fe.remove_perfect_correlation(train_df)

        if self.fill_na:
            train_df.fillna(0,inplace=True)

        self.model = FitModel(train_df,fe.test_loans[['uuid','target']],self.estimators_list)



from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

@dataclass
class FitModel:
    '''
    Class to fit the model
    :param train_df: dataframe with training data
    :param fe_test_df: dataframe containing forward targets to evaluate performance metrics
    :param estimator_list: list of estimators to use in the model
    '''
    train_df: pd.DataFrame
    fe_test_df: pd.DataFrame
    estimators_list: List


    def __post_init__(self):
        self.train_X, self.train_y = self.__under_sampling(self.train_df.copy())
        self.test_df = self.train_X.merge(self.fe_test_df,on='uuid',how='inner')
        

        self.train_X.drop('uuid',axis=1,inplace=True)
        self.test_df.drop('uuid',axis=1,inplace=True)

        self.test_X = self.test_df.drop('target',axis=1)
        self.test_y = self.test_df['target']

        print('train_X: ',self.train_X.shape)
        print('train_y: ',self.train_y.shape)
                

        #Vai mudar para aceitar varios estimadores mas agora vai sobreescrever o modelo
        for est in self.estimators_list:
            self.est = self.__fit(est)


        self.evaluate_model()
        
    def evaluate_model(self):
        print('Confusion Matrix: \n',confusion_matrix(self.test_y, self.predict(self.test_X)))
        print('Classification Report: \n',classification_report(self.test_y, self.predict(self.test_X)))
    
    def predict(self,X: pd.DataFrame) -> pd.Series:
        '''
        Predict using the fitted model
        :param X: dataframe with features
        :return: predictions
        '''
        #pred_df = pd.DataFrame(X.index)
        #pred_df['default_probability'] = self.est.predict(X)
        pred_df = self.est.predict(X)
        return pred_df


    def __fit(self,estimator):
        '''
        docstring
        '''
        est = estimator()
        est.fit(self.train_X, self.train_y)
        return est

    
    def __under_sampling(self,df: pd.DataFrame) -> pd.DataFrame:
        '''
        Under sampling of dataframe
        :param df: dataframe
        :return: undersampled dataframe
        '''
        rus = RandomUnderSampler(random_state=42)
        train_X, train_y = rus.fit_resample(
            train_df.drop(['target'],axis=1),train_df['target'])
        return train_X, train_y



s = ForecastDefault(loans_hist=loans, recharges_hist=recharges, estimators_list=[RandomForestClassifier],inicial_date='2019-01-01',days_to_default=1,limit_date='2019-12-05')
    

last_date 2019-12-04 00:00:00
limit_date 2019-12-04 23:54:57.280521
train_loans:  (18336, 5)
test_loans:  (625, 5)
train_X:  (238, 5)
train_y:  (238,)
Confusion Matrix: 
 [[0 0]
 [8 8]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.50      0.67        16

    accuracy                           0.50        16
   macro avg       0.50      0.25      0.33        16
weighted avg       1.00      0.50      0.67        16



  _warn_prf(average, modifier, msg_start, len(result))


In [64]:
PREDICT_FORWARD_DAYS = 60
last_date = loans['created_at'].max() - timedelta(days=60)
recharges['delta_after_recharge'] = recharges['balance_after_recharge'] - recharges['recharge_value']
loans['paid_days_interval'] = (loans['paid_at'] - loans['created_at']).dt.days.fillna(61)
loans['target'] = loans['paid_days_interval'].apply(lambda x: 1 if x > 60 else 0)

train_loans = loans[loans['created_at'] < last_date].copy()
test_loans = loans[loans['created_at'] > last_date].copy()


train_recharges = recharges[recharges['recharge_timestamp'] < last_date].copy()
test_recharges = recharges[recharges['recharge_timestamp'] > last_date].copy()


def feature_eng_loans(loans_df: pd.DataFrame) -> pd.DataFrame:
    '''
    Feature engineering for loans dataframe using historical data
    already_default: 0 if not defaulted, 1 if defaulted
    sum_amoun: sum amount of all previous loans paid
    count_loans: number of previous loans paid
    '''
    already_default = loans_df.groupby('uuid').sum()['target'].apply(lambda x: 1 if x > 0 else 0)
    sum_amount = loans_df.query('target == 0').groupby('uuid').sum()['amount']
    count_loans = loans_df.query('target == 0').groupby('uuid').count()['amount']

    out_df = pd.DataFrame(loans_df.groupby('uuid').count().index)
    out_df = out_df.join(already_default,on='uuid',how='left')
    out_df = out_df.join(sum_amount,on='uuid',how='left',rsuffix='_sum')
    out_df = out_df.join(count_loans,on='uuid',how='left',rsuffix='_count')
    out_df.columns = ['uuid','target','sum_amount','count_loans']
    return out_df

def feature_eng_recharges(recharges_df: pd.DataFrame) -> pd.DataFrame:
    '''
    Feature engineering for recharges dataframe using historical data
    freq_recharges_weekly: mean frequency recharges per week
    recharges_weekly: median frequency recharges per week
    delta_after_recharge: difference between balance after recharge and recharge value
    '''
    recharges_df['back_recharge_timestamp'] = pd.to_datetime(recharges_df['recharge_timestamp']) - pd.to_timedelta(7, unit='d')
    max_date = recharges_df['back_recharge_timestamp'].max()
    min_date = recharges_df['back_recharge_timestamp'].min()
    count_weeks = (max_date - min_date).days // 7
    weekly_df = recharges_df \
        .groupby(['uuid', pd.Grouper(key='recharge_timestamp', freq='W-MON')]) \
        .count() \
        .groupby('uuid')
    
    freq_recharges_weekly = weekly_df.sum()['recharge_value']/count_weeks
    recharges_weekly = weekly_df.median()['recharge_value']/count_weeks
    delta_after_recharges = recharges_df.groupby('uuid').median()['delta_after_recharge']
    
    out_df = pd.DataFrame(recharges_df.groupby('uuid').count().index)
    out_df = out_df.join(freq_recharges_weekly,on='uuid',how='left',rsuffix='_median')
    out_df = out_df.join(recharges_weekly,on='uuid',how='left',rsuffix='_median')
    out_df = out_df.join(delta_after_recharges,on='uuid',how='left')
    out_df.columns = ['uuid','freq_recharges_weekly','recharges_weekly','delta_after_recharges']
    return out_df

test_loans_fe = feature_eng_loans(test_loans)

train_loans_fe = feature_eng_loans(train_loans)
train_recharges_fe = feature_eng_recharges(train_recharges)
train_df = train_loans_fe.merge(train_recharges_fe,on='uuid',how='left')

#check nulls
print('NULLS :',train_df.isnull().sum())
train_df.fillna(0,inplace=True)
print('NA FILLED :',train_df.isnull().sum())

NULLS : uuid                      0
target                    0
sum_amount               79
count_loans              79
freq_recharges_weekly    34
recharges_weekly         34
delta_after_recharges    34
dtype: int64
NA FILLED : uuid                     0
target                   0
sum_amount               0
count_loans              0
freq_recharges_weekly    0
recharges_weekly         0
delta_after_recharges    0
dtype: int64


In [24]:
train_df.describe()

Unnamed: 0,target,sum_amount,count_loans,freq_recharges_weekly,recharges_weekly,delta_after_recharges
count,1033.0,1033.0,1033.0,1033.0,1033.0,1033.0
mean,0.115198,12.662149,2.32333,0.956559,0.170196,0.541723
std,0.319416,14.9216,2.280692,0.824489,0.087977,0.96441
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,5.0,1.0,0.375,0.125,0.075
50%,0.0,10.0,2.0,0.75,0.125,0.285
75%,0.0,15.0,3.0,1.25,0.25,0.67
max,1.0,150.0,20.0,5.5,0.8125,20.05


In [25]:
train_df.corr()

Unnamed: 0,target,sum_amount,count_loans,freq_recharges_weekly,recharges_weekly,delta_after_recharges
target,1.0,-0.231116,-0.269322,-0.29051,-0.211319,-0.088737
sum_amount,-0.231116,1.0,0.975378,0.64719,0.485158,0.100453
count_loans,-0.269322,0.975378,1.0,0.638603,0.464965,0.101892
freq_recharges_weekly,-0.29051,0.64719,0.638603,1.0,0.712777,0.085745
recharges_weekly,-0.211319,0.485158,0.464965,0.712777,1.0,0.127432
delta_after_recharges,-0.088737,0.100453,0.101892,0.085745,0.127432,1.0


# SMOTE oversampling

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_tmp = train_df.reset_index()
df_tmp['index'] = le.fit_transform(df_tmp['uuid'])
train_X,train_y = oversample.fit_resample(
    df_tmp.drop(['uuid','target'],axis=1),
    df_tmp[['target']])
train_X['uuid'] = le.inverse_transform(train_X['index'])
train_X.set_index('uuid')
train_X.drop('index',axis=1,inplace=True)




test_df = train_X.merge(test_loans_fe[['uuid','target']],on='uuid',how='inner')
train_X.drop('uuid',axis=1,inplace=True)
test_df.drop('uuid',axis=1,inplace=True)

# Undersampling

In [26]:
from imblearn.under_sampling import RandomUnderSampler
under_sampler = RandomUnderSampler(random_state=2022)
train_X, train_y = under_sampler.fit_resample(
    train_df.drop(['target'],axis=1),
    train_df[['target']])

test_df = train_X.merge(test_loans_fe[['uuid','target']],on='uuid',how='inner')
train_X.drop('uuid',axis=1,inplace=True)
test_df.drop('uuid',axis=1,inplace=True)

In [27]:
len(s.sample_indices_)

238

# Lembrar de fazer o a otimização dos parametros

# Com multicolinearidade

In [28]:
#fit classification model on t_df random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report


rfc = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=2022)
rfc.fit(train_X, train_y)
y_pred = rfc.predict(test_df.drop(['target'],axis=1))
print('Accuracy: ',accuracy_score(test_df['target'], y_pred))
print('Confusion Matrix: \n',confusion_matrix(test_df['target'], y_pred))
print('Classification Report: \n',classification_report(test_df['target'], y_pred))


Accuracy:  0.5966386554621849
Confusion Matrix: 
 [[118  66]
 [ 30  24]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.80      0.64      0.71       184
           1       0.27      0.44      0.33        54

    accuracy                           0.60       238
   macro avg       0.53      0.54      0.52       238
weighted avg       0.68      0.60      0.63       238



  rfc.fit(train_X, train_y)


# Sem multicolinearidade

In [29]:
train_X.corr()

Unnamed: 0,sum_amount,count_loans,freq_recharges_weekly,recharges_weekly,delta_after_recharges
sum_amount,1.0,0.966692,0.65263,0.477287,0.219949
count_loans,0.966692,1.0,0.653344,0.466286,0.24284
freq_recharges_weekly,0.65263,0.653344,1.0,0.7316,0.215462
recharges_weekly,0.477287,0.466286,0.7316,1.0,0.266321
delta_after_recharges,0.219949,0.24284,0.215462,0.266321,1.0


In [97]:
#fit classification model on t_df random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline


#Ajust the model to pipeline
rfc = RandomForestClassifier()
rfc.fit(train_X.drop('count_loans',axis=1), train_y)
y_pred = rfc.predict(test_df.drop(['count_loans','target'],axis=1))
print('Accuracy: ',accuracy_score(test_df['target'], y_pred))
print('Confusion Matrix: \n',confusion_matrix(test_df['target'], y_pred))
print('Classification Report: \n',classification_report(test_df['target'], y_pred))

Accuracy:  0.5504201680672269
Confusion Matrix: 
 [[97 87]
 [20 34]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.83      0.53      0.64       184
           1       0.28      0.63      0.39        54

    accuracy                           0.55       238
   macro avg       0.56      0.58      0.52       238
weighted avg       0.70      0.55      0.59       238



  rfc.fit(train_X.drop('count_loans',axis=1), train_y)


In [31]:
under_sampler = RandomUnderSampler(random_state=2022)
train_X, train_y = under_sampler.fit_resample(
train_df.drop(['target'],axis=1),
train_df[['target']])

test_df = train_X.merge(test_loans_fe[['uuid','target']],on='uuid',how='inner')
train_X.drop('uuid',axis=1,inplace=True)
test_df.drop('uuid',axis=1,inplace=True)

pipeline = Pipeline([
('rfc', RandomForestClassifier(n_estimators=100, max_depth=3, random_state=2022)),
('under_sampler', RandomUnderSampler(random_state=2022)),
])
pipeline.fit(train_X, train_y)

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RandomForestClassifier(max_depth=3, random_state=2022)' (type <class 'sklearn.ensemble._forest.RandomForestClassifier'>) doesn't

# Pycaret

In [None]:
from pycaret.classification import *
s = setup(data=pd.concat([train_X, train_y], axis=1), target = 'target')

Unnamed: 0,Description,Value
0,session_id,6788
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(238, 6)"
5,Missing Values,False
6,Numeric Features,5
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8559,0.8986,0.8278,0.8806,0.843,0.7106,0.7237,0.045
et,Extra Trees Classifier,0.832,0.8635,0.7903,0.8806,0.8168,0.6624,0.6839,0.04
gbc,Gradient Boosting Classifier,0.8316,0.8971,0.7903,0.8594,0.8126,0.6615,0.6754,0.013
ada,Ada Boost Classifier,0.8202,0.8924,0.7528,0.8649,0.7977,0.6385,0.6521,0.017
dt,Decision Tree Classifier,0.8085,0.8179,0.8153,0.8125,0.802,0.6174,0.6373,0.003
knn,K Neighbors Classifier,0.8018,0.893,0.6903,0.9038,0.755,0.5985,0.6303,0.006
lightgbm,Light Gradient Boosting Machine,0.7952,0.9053,0.7778,0.8172,0.7833,0.5875,0.6052,0.006
svm,SVM - Linear Kernel,0.789,0.0,0.7264,0.8841,0.7561,0.5722,0.6198,0.003
lr,Logistic Regression,0.7353,0.8776,0.7764,0.7168,0.7359,0.4704,0.4853,0.241
ridge,Ridge Classifier,0.729,0.0,0.8264,0.6921,0.7475,0.4605,0.4789,0.003


In [None]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…