In [1]:
import pandas as pd
import numpy as np
import sklearn as ska

In [2]:
#importando a base para o dataframe, com separador por ";"
df = pd.read_csv("credit_train_label.csv", sep=';', low_memory=False)

In [3]:
#quantidade de linhas e coluna do dataframe
df.shape

(50000, 13)

In [4]:
#cabeçalho do dataframe
df.head()

Unnamed: 0,X,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30.59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60.89DaysPastDueNotWorse,NumberOfDependents,Gasto com dependente,SeriousDlqin2yrs
0,1,0.766127,45,2,0.802982,9120,13,0,6,0,2,4560,1
1,2,0.957151,40,0,0.121876,2600,4,0,0,0,1,2600,0
2,3,0.65818,38,1,0.085113,3042,2,1,0,0,0,3042,0
3,4,0.23381,30,0,0.03605,3300,5,0,0,0,0,3300,0
4,5,0.907239,49,1,0.024926,63588,7,0,1,0,0,63588,0


In [5]:
#importando biblioteca que divide a base em treino e teste
from sklearn.model_selection import train_test_split

In [6]:
#definindo as colunas X e a variável target Y
X = df.iloc[:,0:11]
y = df.iloc[:,12]
#dividindo a base em treino e teste. Treino 70% e teste 30%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
#analisando a divisão das bases
print (X_train.shape)
print (X_test.shape)

(35000, 11)
(15000, 11)


In [8]:
#importando bibliotecas para fazer o RandonForest, probabilidade e o GridSerchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [18]:
#fazendo um chute inicial com 100 árvores, profundidade 7 e utilizando 5 variáveis
rf = RandomForestClassifier(n_estimators=200, max_depth=5, max_features=9)
#aplicando o modelo
rf.fit(X_train, y_train)
#dando predict na base de teste e atribuindo a predictions
predictions = rf.predict(X_test)
#calculando a probablilidade
auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])
#imprimindo a probabilidade
print("AUC: %.4f" % auc)

AUC: 0.8475


In [10]:
#carregando novo dataframe onde irei aplicar o modelo gerado
newdf = pd.read_csv("credit_test_features.csv", sep=";")

In [11]:
#quantidade de linhas e coluna do novo dataframe
newdf.shape

(100000, 11)

In [12]:
#dando predict na nova base e atribuindo a predictions
predictions = rf.predict(newdf)

In [13]:
#calculando a probabilidade da nova base
probabilidade = rf.predict_proba(newdf)

In [14]:
#criando uma coluna no dataframe da novabase e acrescentando a probabilidade de dar 1
newdf["pred"] = probabilidade[:,1]

In [15]:
#Salvando novo dataframe em csv separado por "," e com cabeçalho
newdf.to_csv('novabase.csv', sep=',', header=True)

In [16]:
#Tunando o modelo alterando a quantidade de árvores(n_estimators), profundidade(max_depht) e a quantidade de variáveis (max_features)
#Pode demorar um pouco de acordo com a quantidade de hiperparamentros escolhidos
tuned_parameters = [{'n_estimators': [100,200],
                     'max_depth': [1,3,5,7],
                     'max_features': [5,7,11]}]

clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=3, scoring='accuracy')
#gerando modelo tunado com os melhores hiperparâmetros
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [100, 200], 'max_depth': [1, 3, 5, 7], 'max_features': [5, 7, 11]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [17]:
# imprindo descritivo do tunning com os melhores hiperparametros
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
from sklearn.metrics import accuracy_score
auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
print("Probabilidade: %.4f" % auc)
print()

Best parameters set found on development set:

{'max_depth': 7, 'max_features': 5, 'n_estimators': 100}

Grid scores on development set:

0.933 (+/-0.000) for {'max_depth': 1, 'max_features': 5, 'n_estimators': 100}
0.933 (+/-0.000) for {'max_depth': 1, 'max_features': 5, 'n_estimators': 200}
0.933 (+/-0.000) for {'max_depth': 1, 'max_features': 7, 'n_estimators': 100}
0.933 (+/-0.000) for {'max_depth': 1, 'max_features': 7, 'n_estimators': 200}
0.933 (+/-0.000) for {'max_depth': 1, 'max_features': 11, 'n_estimators': 100}
0.933 (+/-0.000) for {'max_depth': 1, 'max_features': 11, 'n_estimators': 200}
0.934 (+/-0.001) for {'max_depth': 3, 'max_features': 5, 'n_estimators': 100}
0.935 (+/-0.001) for {'max_depth': 3, 'max_features': 5, 'n_estimators': 200}
0.935 (+/-0.000) for {'max_depth': 3, 'max_features': 7, 'n_estimators': 100}
0.935 (+/-0.001) for {'max_depth': 3, 'max_features': 7, 'n_estimators': 200}
0.935 (+/-0.001) for {'max_depth': 3, 'max_features': 11, 'n_estimators': 100}
0

In [20]:
#carregando novo dataframe onde irei aplicar o modelo gerado
newdf = pd.read_csv("credit_test_features.csv", sep=";")

In [21]:
#dando o predict do modelo tunado na nova base e atribuindo a predictions
predictions = clf.predict(newdf)

In [22]:
#calculando a probabilidade com o modelo tunado
probabilidade = clf.predict_proba(newdf)

In [23]:
#criando uma coluna no dataframe da novabase e acrescentando a probabilidade de dar 1
newdf["pred"] = probabilidade[:,1]

In [24]:
#Salvando novo dataframe em csv separado por "," e com cabeçalho
newdf.to_csv('novabasetunada.csv', sep=',', header=True)