In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pickle
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the preprocessed modelling dataset (relative path) and show its dimensions.
base = pd.read_csv(
    '../data/modeling/02_preprocessed.csv',
    encoding='unicode_escape',
)
base.shape

In [None]:
# Columns treated as numeric model inputs
# (original author's note: "filter3", without imputer).
variables_numericas = [
    'MTO_CASH_DEPO',
    'CTD_CASH_DEPO',
    'MTO_CASH_RET',
    'CTD_CASH_RET',
    'CTD_TRXS_SINCTAAGENTE',
    'ANTIGUEDAD',
    'FLG_ACTECO_NODEF',
    'FLG_PERFIL_CASH_DEPO_3DS',
    'CTD_TRXSFUERAHORARIO',
    'PROM_DEPODIARIOS',
    'CTD_DIASDEPO',
    'CTD_AN_NP_LSB',
    'MTO_AN_NP_LSB',
    'CTD_EVALS_PROP',
]

# Columns listed separately as flags.
columnas_flags = ['FLG_ZAED']

In [None]:
# Full feature list used by the model: numeric columns first, then the flag columns.
variables_modelo = [*variables_numericas, *columnas_flags]


In [None]:
# Time-based split on PERIODO: 201907..202001 (both inclusive) for training,
# 202002 held out for testing.
# .copy() makes each subset an independent frame, so the columns added to
# base_train / base_test further down are plain assignments instead of writes
# to a slice of `base` (which pandas reports as SettingWithCopyWarning).
base_train = base[(base.PERIODO >= 201907) & (base.PERIODO <= 202001)].copy()
base_test = base[base.PERIODO == 202002].copy()

In [None]:
# Fit the min-max scaler on the training window and scale the model features.
minmax = MinMaxScaler()
train_scalado = pd.DataFrame(
    minmax.fit_transform(base_train[variables_modelo]),
    # Pass the flat list of names: wrapping it in another list
    # (columns=[variables_modelo]) makes pandas build MultiIndex columns,
    # turning every label into a 1-tuple instead of a plain string.
    columns=variables_modelo,
    # Keep base_train's row labels so scaled rows can be matched back to it.
    index=base_train.index,
)
train_scalado.head()

In [None]:
# First DBSCAN pass (eps=0.25, min_samples=20) over the scaled training data.
# `db` is reassigned by a later fit with eps=0.4; the labels of this pass are kept in `clusters`.
db = DBSCAN(
    eps=0.25,
    min_samples=20,
    metric='euclidean',
    metric_params=None,
    algorithm='auto',
).fit(train_scalado)
clusters = db.labels_

#### Calculamos la sensibilidad de los clusters ante cambios en EPS

In [None]:
# Draw a reproducible sample with half of the scaled training rows (rounded down).
train_scalado_sample = train_scalado.sample(
    n=len(train_scalado) // 2,
    random_state=1,
)
train_scalado_sample.shape

#### Fijamos el valor de eps y min_samples

In [None]:
# Clustering with the chosen hyper-parameters (eps=0.4, min_samples=20),
# fitted on the full scaled training set.
db = DBSCAN(
    eps=0.4,
    min_samples=20,
    metric='euclidean',
    metric_params=None,
    algorithm='auto',
)
db.fit(train_scalado)

In [None]:
# Per-row cluster labels from the fitted DBSCAN model (one entry per row of train_scalado).
cluster_DBSCAN = db.labels_

In [None]:
# Display the label array for a quick visual check.
cluster_DBSCAN

In [None]:
# Silhouette score of the DBSCAN labelling on the scaled training data (euclidean distance).
# NOTE(review): every label value is passed in as-is, including -1 if present —
# confirm that scoring those rows as a cluster of their own is intended.
scoreSilhoutte = metrics.silhouette_score(
    train_scalado,
    db.labels_,
    metric='euclidean',
)
scoreSilhoutte

In [None]:
# Smallest and largest label value produced by DBSCAN.
print(cluster_DBSCAN.min(), cluster_DBSCAN.max())

In [None]:
# Number of distinct clusters, not counting the label -1.
distinct_labels = set(cluster_DBSCAN)
distinct_labels.discard(-1)
n_clusters_ = len(distinct_labels)
n_clusters_

#### Asignamos los clusters

In [None]:
# Attach each row's DBSCAN label to the training frame as column 'cluster_DBSCAN'.
base_train = base_train.assign(cluster_DBSCAN=cluster_DBSCAN)
base_train.head()

In [None]:
# Scale the test window with the scaler already fitted on the training data.
# Fitting a fresh MinMaxScaler on the test rows would map them with different
# min/max values than the ones the classifier saw during training, so the
# same raw value would land on a different scaled value.
test_scalado = pd.DataFrame(
    minmax.transform(base_test[variables_modelo]),
    # Flat list of names; a nested list would produce MultiIndex (1-tuple) labels.
    columns=variables_modelo,
    # Keep base_test's row labels so predictions can be matched back to it.
    index=base_test.index,
)
test_scalado.head()

In [None]:
# Supervised step inputs: scaled training features as X, DBSCAN label as target.
X_train_val, y_train_val = train_scalado, base_train['cluster_DBSCAN']
X_train_val.head()

In [None]:
# Hold out 20% of the rows for validation; the seed is fixed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Grid-search n_estimators for a RandomForest trained to reproduce the DBSCAN labels,
# keeping the forest with the highest validation accuracy.

# Start below any possible accuracy so that a model is always selected,
# even if every validation score turns out to be 0.0.
mejor_score = -np.inf
mejor_n_estimators = None
mejor_modelo_rf = None
rf_error_train = []  # 1 - training accuracy, one entry per grid value
rf_error_val = []    # 1 - validation accuracy, one entry per grid value

# Exact integer grid [2, 4, 6, ..., 80] (no float rounding involved).
n_estimators_grid = np.arange(2, 81, 2)

for n_estimators in n_estimators_grid:
    modelo_rf = RandomForestClassifier(n_estimators=n_estimators, random_state=0)
    modelo_rf.fit(X_train, y_train)

    score_train = modelo_rf.score(X_train, y_train)
    score_val = modelo_rf.score(X_val, y_val)
    rf_error_train.append(1 - score_train)
    rf_error_val.append(1 - score_val)

    # Strict '>' keeps the first (smallest) forest when several tie on validation accuracy.
    if score_val > mejor_score:
        mejor_score = score_val
        mejor_n_estimators = n_estimators
        mejor_modelo_rf = modelo_rf

# The selected forest is already fitted on (X_train, y_train); with the fixed
# random_state a second fit would only rebuild the same trees, so it is not repeated.
modelo_rf = mejor_modelo_rf

print("Mejor valor de n_estimators :", mejor_n_estimators)
print("Exactitud de RandomForest en conjunto de entrenamiento :", modelo_rf.score(X_train, y_train))
print("Exactitud de RandomForest en conjunto de validación    :", modelo_rf.score(X_val, y_val))

In [None]:
# Persist the selected forest to disk.
# The context manager closes (and flushes) the file even if pickling raises;
# a bare open() passed straight to pickle.dump is never explicitly closed.
with open('../src/02_models/random_forest_model_39.model', 'wb') as archivo_modelo:
    pickle.dump(modelo_rf, archivo_modelo)

In [None]:
# Predict a cluster for every test row from its scaled features and store it
# in column 'cluster_DBSCAN_pred'.
base_test = base_test.assign(cluster_DBSCAN_pred=modelo_rf.predict(test_scalado))

In [None]:
# Export the test rows with their predicted cluster as a pipe-delimited file, without the index column.
base_test.to_csv(
    '../data/modeling/04_predicted.csv',
    sep='|',
    index=False,
)