In [15]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sys
sys.path.append('..')
from Scripts.utils import (train_model_with_grid_search, load_and_predict, run_boruta, run_lasso,scale_data)
from sklearn.model_selection import train_test_split

# Lift every pandas display limit (number of columns, number of rows and
# per-cell text width) so frames are rendered in full instead of truncated.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [16]:
# Read the two cleaned workbooks: the SP data (with revenue) and the RJ data.
clean_data_sp, clean_data_rj = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        '../Data/clean_data_sp_w_revenue.xlsx',
        '../Data/clean_data_rj.xlsx',
    )
)

In [17]:
# classification with random forest
from sklearn.ensemble import RandomForestClassifier

# Target: the `potencial` column of the RJ data.
y = clean_data_rj['potencial']

# Features: every RJ column except the target and the `nome` column.
# (`columns=` already names the axis, so no separate `axis` argument is needed.)
X = clean_data_rj.drop(columns=['potencial', 'nome'])

# Same feature columns, in the same order, taken from the SP data.
X_sp = clean_data_sp[X.columns]

In [18]:
# Define the model.
# Seeded so that re-running the notebook gives the same forest; every other
# stochastic call in this cell (the split, Boruta, the grid search) already
# uses random_state=42.
model = RandomForestClassifier(random_state=42)

# Hold-out split used to fit the Boruta feature selector.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Run Boruta feature selection with the random forest as the estimator.
# NOTE(review): if run_boruta mutates the estimator it receives, the same
# mutated `model` object is reused by the grid searches below — confirm.
boruta_selector, selected_features = run_boruta(X_train=X_train, y_train=y_train, X_test=X_test, estimator=model, max_iter=100, random_state=42)

# Restrict both the RJ features and the SP data to the selected features.
X_filtered = X[selected_features]
X_sp_filtered = clean_data_sp[selected_features]

# Hyper-parameter grid; keys carry the `model__` prefix expected by
# train_model_with_grid_search.
param_grid = {
    'model__n_estimators': [400],
    'model__max_depth': [None, 1],
    'model__min_samples_split': [2, 3]
}

# Train on the Boruta-selected features. The model is saved under its own file
# name: '../Models/rf_model_potential_rj.joblib' is written by the all-features
# training cell and later loaded to predict on X_sp (all features), so saving a
# filtered-feature model to that same path would either be silently overwritten
# or, if that cell is skipped, be loaded with a mismatching feature set.
train_model_with_grid_search(X_filtered, y, model, param_grid, test_size=0.2, random_state=42, balance_training='under', save_path='../Models/rf_model_potential_rj_boruta.joblib')

                   feature  rank
0                população     1
30         propDomiciliosC     1
29         propDomiciliosB     1
28         propDomiciliosA     1
27           rendaMediaPop     1
26              rendaTotal     1
24  propDomiciliosCriancas     1
23              domicilios     1
21             domiciliosB     1
20             domiciliosA     1
19              popDe25a49     1
18             faturamento     1
17              rendaMedia     1
31         propDomiciliosD     1
32         propDomiciliosE     1
7               popDe50a59     1
13            domiciliosC1     1
12            domiciliosB2     1
11            domiciliosB1     1
10            domiciliosA2     1
9             domiciliosA1     1
8              popMaisDe60     1
6               popDe35a49     1
5               popDe25a34     1
4               popDe20a24     2
3               popDe15a19     2
14            domiciliosC2     3
1                  popAte9     3
22                popAte14     5
2         

In [19]:
# Hyper-parameter grid for the model trained on the full feature set
# (adds max_depth=2 to the candidates).
param_grid = {
    'model__n_estimators': [400],
    'model__max_depth': [None, 1, 2],
    'model__min_samples_split': [2, 3],
}

# Call the helper that trains the model with a grid search and saves it.
train_model_with_grid_search(
    X,
    y,
    model,
    param_grid,
    test_size=0.2,
    random_state=42,
    balance_training='under',
    scaling_method='standard',
    save_path='../Models/rf_model_potential_rj.joblib',
)

Best parameters: {'model__max_depth': None, 'model__min_samples_split': 3, 'model__n_estimators': 400}
Model trained and saved successfully!
Score on training set: 1.0
Score on test set: 0.9032258064516129


In [20]:
# Add a `potencial` column to clean_data_sp, holding the predictions of the
# model saved at '../Models/rf_model_potential_rj.joblib' for the SP features.
# clean_data_sp is already in memory from the loading cell (X_sp was sliced
# from it and it has not been modified since), so the Excel workbook is not
# read a second time.
clean_data_sp['potencial'] = load_and_predict(X_sp, '../Models/rf_model_potential_rj.joblib')

In [21]:
# Write the SP data, including the `potencial` column, to a new workbook
# without the DataFrame index.
clean_data_sp.to_excel(
    excel_writer='../Data/clean_data_sp_w_revenue_w_potential_1.xlsx',
    index=False,
)