# 1. Carregamento dos dados

In [55]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.externals import joblib
import sklearn
import math
from azureml.core import Experiment

# O azureml-core da versão 1.0.72 ou superior é requerido
# é necessário azureml-dataprep[pandas] na versão 1.1.34 ou superior
from azureml.core import Workspace, Dataset

In [None]:
#Trocar os códigos abaixo pelos da sua instância!
subscription_id = '4efa8ecf-981b-4481-87c1-05c138e80dcb'
resource_group = 'Aulas-FIAP'
workspace_name = 'WorkspaceMLAula'

workspace = Workspace(subscription_id, resource_group, workspace_name)

In [7]:
dataset = Dataset.get_by_name(workspace, name='Risco-de-Credito')
df = dataset.to_pandas_dataframe()
df.head()

Unnamed: 0,nome,renda,idade,etnia,sexo,casapropria,outrasrendas,estadocivil,escolaridade,default
0,"Simon, Rodriguez",4472.190323,42.036031,0,0,1,0,0,3,0
1,"Daniel, Castro",4592.774312,48.230662,1,0,1,0,1,2,0
2,"Myhue, Lin",2486.538807,56.881709,0,1,0,0,0,0,1
3,"Destiny, Richardson-Pacheco",2852.340117,51.684021,1,1,0,0,0,2,1
4,"Brittany, Cohen-Wilson",4703.782812,50.729078,1,1,1,0,1,2,0


# 2. Treinamento

## 2.1 - Separa dados em treino e teste

In [33]:
x_df = df.loc[:,["renda", "idade", "etnia", "sexo", "casapropria", "outrasrendas", "estadocivil", "escolaridade"] ]
y_df = df.loc[:,"default"]

In [34]:
X_train, X_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=66)

## 2.2 Random Forest 

Prepara ambiente na nuvem para logs

In [56]:
experiment = Experiment(workspace=workspace, name="risco-credito-experimentos-RF")
run = experiment.start_logging()
run.log("Valor Parametro", 1.0)

Treina

In [57]:
clf = rfc()
clf.fit(X=X_train, y=y_train)
clf.independentcols = x_df.columns



In [58]:
clf_acuracia = clf.score(X=x_df, y=y_df)
print("Modelo 01 (classificador), criado com acurácia de: [{0}]".format(clf_acuracia))
run.log("acuracia", clf_acuracia)

Modelo 01 (classificador), criado com acurácia de: [0.96047]


Demais logs

In [59]:
run.log("Versao sklearn", sklearn.__version__)
run.log("criterion", clf.criterion)
run.log("n_estimators", clf.n_estimators)
run.log("min_samples_leaf", clf.min_samples_leaf)
run.log("max_depth", clf.max_depth)
run.log_list("Inputs", list(x_df.columns) )

Salva modelo e encerra Logger

In [60]:
model_name = "model_risco_RF01.pkl"
filename = "outputs/" + model_name

joblib.dump(value=clf, filename=filename)
run.upload_file(name=model_name, path_or_stream=filename)
run.complete()

## 2.3 Linerar Model Ridge

Prepara ambiente na nuvem para logs

In [47]:
experiment = Experiment(workspace=workspace, name="risco-credito-experimentos-Ridge")

In [48]:
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

for alpha in alphas:
    print("alpha_value: {0}".format(alpha ) )
    run = experiment.start_logging()
    run.log("alpha_value", alpha)

    model = Ridge(alpha=alpha)
    model.fit(X=X_train, y=y_train)
    model.independentcols = x_df.columns
    y_pred = model.predict(X=X_test)
    acuracia = model.score(X=x_df, y=y_df)
    run.log("acuracia", acuracia)
    run.log("Versao sklearn", sklearn.__version__)
    rmse = math.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
    run.log("rmse", rmse)

    model_name = "model_alpha_" + str(alpha) + ".pkl"
    filename = "outputs/" + model_name

    joblib.dump(value=model, filename=filename)
    run.upload_file(name=model_name, path_or_stream=filename)
    run.complete()

alpha_value: 0.1
alpha_value: 0.2
alpha_value: 0.3
alpha_value: 0.4
alpha_value: 0.5
alpha_value: 0.6
alpha_value: 0.7
alpha_value: 0.8
alpha_value: 0.9
alpha_value: 1.0


-----

# FIM