In [82]:
#to handle datasets
import pandas as pd
import numpy as np

#for plotting
#import matplotlib.pyplot as plt
#%matplotlib inline

#to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import MinMaxScaler

# for imbalanced datasets
from imblearn.over_sampling import RandomOverSampler

# show all the columns when a dataframe is displayed
# (uses the documented top-level `pd.set_option` instead of the accidental
# `pd.pandas` self-reference)
pd.set_option('display.max_columns', None)

# to ignore warnings
import warnings
warnings.simplefilter('ignore')

In [169]:
# Load the dataset from the Excel workbook; the path has no directory part,
# so it is resolved relative to the current working directory.
data = pd.read_excel(r'20190109123920.xlsx')
# Print (rows, columns) and then display the first rows as the cell output.
print(data.shape)
data.head()

(1016, 13)


Unnamed: 0,# Ordem,Estado,Anos atividade,Região,Receitas,Ativos,Montante,BR,Prazo,# pmts pagas,Taxa ind.,Taxa med,# ofertas
0,1,Pago,22,Estremadura e Ribatejo,"100,000 - 350,000","100,000 - 350,000",2500,A,6,6.0,0.032,0.03463,31
1,2,Incobrável,25,Estremadura e Ribatejo,"100,000 - 350,000","500,000-1,500,000",20000,B+,36,25.0,0.049,0.049287,33
2,3,Pago,15,Estremadura e Ribatejo,"100,000 - 350,000","100,000 - 350,000",10000,B+,24,24.0,0.046,0.048462,46
3,4,Pago,13,Estremadura e Ribatejo,"100,000 - 350,000","100,000 - 350,000",20000,A,48,48.0,0.042,0.044945,85
4,5,Pago,27,Estremadura e Ribatejo,"100,000 - 350,000","100,000 - 350,000",5000,C,6,6.0,0.087,0.088742,39


### Create binary target variable 

In [170]:
def target_var(df, var):
    """Add a binary ``target`` column to ``df`` in place and return ``df``.

    Rows whose value in column ``var`` is 'Incobrável' or 'Em recuperação'
    are flagged with 1; every other row (missing values included) gets 0.
    """
    flagged_states = ['Incobrável', 'Em recuperação']
    is_flagged = df[var].isin(flagged_states)
    df['target'] = is_flagged.astype('int64')
    return df

# Derive the binary `target` column from the loan status column 'Estado'.
data = target_var(data, 'Estado')

In [171]:
def balanced_data_imputer(*, df: pd.DataFrame, target: str = 'target') -> pd.DataFrame:
    """Randomly oversample ``df`` on ``target`` while keeping columns with NaNs.

    The sampler is fitted on row *positions* rather than on the feature values.
    A random over-sampler only repeats existing rows, so resampling positions
    and then selecting those rows from the full frame keeps every column
    (including the ones with missing values) attached to its own row.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; must contain the ``target`` column. It is not modified.
    target : str, default 'target'
        Name of the class-label column used to drive the oversampling.

    Returns
    -------
    pd.DataFrame
        Resampled frame with a fresh 0..n-1 index. Column order is: columns
        without missing values, then columns with missing values, then
        ``target``.
    """
    df = df.copy()

    # Columns that contain at least one missing value (possibly none at all).
    has_null = df.isnull().any()
    vars_ = [col for col in df.columns[has_null.values] if col != target]
    complete_cols = [col for col in df.columns
                     if col not in vars_ and col != target]

    y = df[target]

    # for handle imbalanced dataset by Oversampling
    ros = RandomOverSampler(random_state=0)

    # Newer imbalanced-learn releases expose `fit_resample`; fall back to the
    # older `fit_sample` name when it is the only one available.
    resample = getattr(ros, 'fit_resample', None) or ros.fit_sample

    # Resample integer row positions; NaNs in the data never reach the sampler.
    positions = np.arange(len(df)).reshape(-1, 1)
    sampled_positions, _ = resample(positions, y)
    sampled_positions = np.asarray(sampled_positions).ravel()

    ordered_cols = complete_cols + vars_ + [target]
    return df.iloc[sampled_positions][ordered_cols].reset_index(drop=True)

In [172]:
# Oversample the minority class of the full dataset. The function defined in
# this notebook is `balanced_data_imputer`; the previous CamelCase name does
# not exist on a fresh kernel.
data = balanced_data_imputer(df=data)
data.shape

(1986, 14)

In [173]:
# Display the first rows of the resampled frame.
data.head()

Unnamed: 0,# Ordem,Estado,Anos atividade,Região,Receitas,Ativos,Montante,BR,Prazo,Taxa ind.,Taxa med,# ofertas,# pmts pagas,target
0,1,Pago,22,Estremadura e Ribatejo,"100,000 - 350,000","100,000 - 350,000",2500,A,6,0.032,0.0346304,31,6.0,0
1,2,Incobrável,25,Estremadura e Ribatejo,"100,000 - 350,000","500,000-1,500,000",20000,B+,36,0.049,0.0492874,33,25.0,1
2,3,Pago,15,Estremadura e Ribatejo,"100,000 - 350,000","100,000 - 350,000",10000,B+,24,0.046,0.048462,46,24.0,0
3,4,Pago,13,Estremadura e Ribatejo,"100,000 - 350,000","100,000 - 350,000",20000,A,48,0.042,0.044945,85,48.0,0
4,5,Pago,27,Estremadura e Ribatejo,"100,000 - 350,000","100,000 - 350,000",5000,C,6,0.087,0.0887419,39,6.0,0


### Drop variables

In [85]:
# Columns excluded from the modelling frame; dropped in place.
variables = [
    '# Ordem',
    'Estado',
    '# ofertas',
]
data.drop(columns=variables, inplace=True)

### Separate dataset into train and test

In [174]:
# Stratified 80/20 split on the target. The frame passed as X still contains
# the 'target' column, which later cells read from X_train / X_test.
# NOTE(review): `data` was oversampled before this split, so copies of the
# same minority row can end up in both train and test -- confirm this is
# intended.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data['target'],
    test_size=0.2,
    stratify=data['target'],
    random_state=0,
)

X_train.shape, X_test.shape

((1588, 14), (398, 14))

### Missing values

In [87]:
# Object-dtype columns that have at least one missing value in the training split.
vars_with_na = [
    col for col in data.columns
    if X_train[col].dtypes == 'O' and X_train[col].isnull().any()
]

# The printed number is the fraction of missing rows (result of .mean()),
# rounded to 3 decimals.
for var in vars_with_na:
    missing_share = np.round(X_train[var].isnull().mean(), 3)
    print(var, missing_share, ' % missing values')

In [88]:
# Fill missing values with the mode learned on the training split only, and
# apply that same value to the test split.
for var in vars_with_na:

    # calculate the mode:
    mode_val = X_train[var].mode()[0]

    # Assign the filled column back instead of calling
    # `X_train[var].fillna(..., inplace=True)`: the in-place call works on an
    # intermediate Series and is not guaranteed to update the frame
    # (it is a no-op under pandas Copy-on-Write).
    X_train[var] = X_train[var].fillna(mode_val)
    X_test[var] = X_test[var].fillna(mode_val)

# check if we have no more missing values
[var for var in vars_with_na if X_test[var].isnull().sum() > 0]

[]

In [89]:
# Non-object (numeric) columns that have at least one missing value in the
# training split.
vars_with_na = [
    col for col in data.columns
    if X_train[col].dtypes != 'O' and X_train[col].isnull().any()
]

# The printed number is the fraction of missing rows (result of .mean()),
# rounded to 3 decimals.
for var in vars_with_na:
    missing_share = np.round(X_train[var].isnull().mean(), 3)
    print(var, missing_share, ' % missing values')

# pmts pagas 0.001  % missing values


In [90]:
# Fill missing values with the mode learned on the training split only, and
# apply that same value to the test split.
for var in vars_with_na:

    # calculate the mode:
    mode_val = X_train[var].mode()[0]

    # Assign the filled column back instead of calling
    # `X_train[var].fillna(..., inplace=True)`: the in-place call works on an
    # intermediate Series and is not guaranteed to update the frame
    # (it is a no-op under pandas Copy-on-Write).
    X_train[var] = X_train[var].fillna(mode_val)
    X_test[var] = X_test[var].fillna(mode_val)

# check if we have no more missing values
[var for var in vars_with_na if X_test[var].isnull().sum() > 0]

[]

### Misidentified column types

In [91]:
def _parse_rate(raw):
    """Turn one raw 'Taxa ind.' entry into a float rate.

    Steps: stringify, strip trailing '%' signs, use '.' as the decimal
    separator, and divide by 100 when the number is greater than or equal to 1.
    """
    text = str(raw).rstrip("%").replace(',', '.')
    rate = float(text)
    if rate >= 1:
        return rate / 100
    return rate


# apply the same parsing to both splits and store the result as float64
for frame in (X_train, X_test):
    frame['Taxa ind.'] = frame['Taxa ind.'].map(_parse_rate).astype('float64')

### Categorical Variables

#### Rare Values

In [92]:
def find_frequent_labels(df, var, rare_perc):
    """Return the labels of ``var`` whose share of rows exceeds ``rare_perc``.

    The share of a label is the number of rows in its group with a non-null
    'target' divided by the total number of rows in ``df``. The comparison is
    strict (``>``). The result is the pandas Index of the qualifying labels.
    """
    rows_per_label = df.groupby(var)['target'].count()
    share_per_label = rows_per_label / len(df)
    is_frequent = share_per_label > rare_perc
    return share_per_label.loc[is_frequent].index

In [93]:
# Categorical columns whose infrequent labels are collapsed into 'Rare'.
cat_vars = ['Região', 'Receitas', 'Ativos', 'BR']

for var in cat_vars:
    # frequent labels are learned on the training split only
    frequent_ls = find_frequent_labels(X_train, var, 0.01)
    for frame in (X_train, X_test):
        keep_label = frame[var].isin(frequent_ls)
        frame[var] = np.where(keep_label, frame[var], 'Rare')

#### Convert str to numbers

In [94]:
def replace_categories(train, test, var, target):
    """Ordinal-encode column ``var`` of ``train`` and ``test`` in place.

    Labels are ranked by the mean of ``target`` within ``train`` (lowest mean
    gets 0) and both frames are mapped with that ranking. Labels that do not
    occur in ``train`` become NaN after mapping. Nothing is returned.
    """
    mean_by_label = train.groupby([var])[target].mean()
    ranked_labels = mean_by_label.sort_values().index
    codes = dict(zip(ranked_labels, range(len(ranked_labels))))
    for frame in (train, test):
        frame[var] = frame[var].map(codes)

In [95]:
# Encode every categorical column in both splits; the ordering is derived
# from the 'target' column of X_train.
for var in cat_vars:
    replace_categories(X_train, X_test, var, 'target')

### Imbalanced data

def imbalanced_data(df):
    """Randomly oversample ``df`` on its 'target' column and report class counts.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the feature columns plus a 'target' column. It is not
        modified.

    Returns
    -------
    pd.DataFrame
        Resampled frame with a fresh 0..n-1 index: the feature columns in
        their original order followed by 'target'.

    The class counts before and after resampling are printed.
    """
    # dependent and independent variables
    X = df.drop(columns=['target'])
    y = df['target']

    # for handle imbalanced dataset by Oversampling
    ros = RandomOverSampler(random_state=0)

    # Newer imbalanced-learn releases expose `fit_resample`; fall back to the
    # older `fit_sample` name when it is the only one available.
    resample = getattr(ros, 'fit_resample', None) or ros.fit_sample
    X_resampled, y_resampled = resample(X, y)

    # before balancing
    yvals, counts = np.unique(y, return_counts=True)
    print('Classes in test set:', dict(zip(yvals, counts)))

    # after balancing
    yvals_ros, counts_ros = np.unique(y_resampled, return_counts=True)
    print('Classes in rebalanced test set:', dict(zip(yvals_ros, counts_ros)))

    # Rebuild both pieces on the same fresh index so the column-wise concat
    # cannot misalign rows, whatever container type the sampler returned.
    X_out = pd.DataFrame(X_resampled, columns=X.columns).reset_index(drop=True)
    y_out = pd.Series(np.asarray(y_resampled), name='target')
    return pd.concat([X_out, y_out], axis=1)

# Oversample each split independently; note that the test split is
# rebalanced as well, not only the training split.
X_train = imbalanced_data(X_train)
X_test = imbalanced_data(X_test)

### Final data

In [96]:
# Display the first rows of the processed training frame.
X_train.head()

Unnamed: 0,Anos atividade,Região,Receitas,Ativos,Montante,BR,Prazo,# pmts pagas,Taxa ind.,Taxa med,target
740,41,4,1,3,20000,5,36,7.0,0.0549,0.0549,0
581,23,7,3,1,10000,6,36,11.0,0.0748,0.0748,0
31,4,8,1,3,10000,5,12,12.0,0.0417,0.047348,0
856,4,7,2,3,15000,7,36,4.0,0.0856,0.0856,0
489,4,6,2,1,20000,1,24,13.0,0.0609,0.060685,0


In [97]:
# Write both processed splits to CSV without the index column.
for frame, csv_path in ((X_train, 'xtrain.csv'), (X_test, 'xtest.csv')):
    frame.to_csv(csv_path, index=False)