# Notebook for data-profiling of the di-f experiments

In [31]:
%load_ext autoreload
%autoreload 2
# The %load_ext autoreload and %autoreload 2 magic commands are used to automatically 
# reload modules when they are changed. This can be useful when you are developing code 
# in an interactive environment, as it allows you to see the changes you make to your modules 
# without having to restart the kernel.
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf

# for global initialization: NOT RECOMMENDED
#initialize(version_base=None, config_path="../src/conf")
#compose(config_name='config')

with initialize(version_base=None, config_path="../src/conf"):
    cfg = compose(config_name='config')
    print(cfg)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


{'general_ml': {'seed': 123, 'encoding': 'iso-8859-1', 'cloud': 'AWS'}, 'paths': {'project_dir': '...', 'raw_data': '${hydra:runtime.cwd}/data/raw', 'interim_data': '${hydra:runtime.cwd}/data/interim', 'processed_data': '${hydra:runtime.cwd}/data/processed', 'reports': '${hydra:runtime.cwd}/reports'}, 'cloud_paths': {'bucket_path': 'dif-b-democlient-sklearn', 'experiment_path': '${cloud_paths.bucket_path}/mxretailsalary1', 'mlflow_path': '${cloud_paths.experiment_path}/mlflow', 'reports_path': '${cloud_paths.experiment_path}/reports', 'rawdata_path': '${cloud_paths.experiment_path}/raw-data', 'dvc_path': '${cloud_paths.experiment_path}/dvc-store'}, 'file_names': {'raw_file': 'raw-data.csv', 'data_file': 'datafile.csv', 'train_features': 'train_features.csv', 'train_labels': 'train_labels.csv', 'validation_features': 'valid_features.csv', 'validation_labels': 'valid_labels.csv', 'test_features': 'test_features.csv', 'test_labels': 'test_labels.csv', 'data_profiling_report': 'data_profil

In [32]:
import pandas as pd
import numpy as np
import ydata_profiling as yp
import os

In [33]:
data = pd.read_csv(os.path.join('../data/raw', cfg.file_names.raw_file), 
                   #encoding=cfg.general_ml.encoding,
                   )
data.head()

Unnamed: 0,estado,municipio,businesses,employees,Payroll,expenditures,income,payroll_employee_day,profits_biz_day,sales_employee_day,employees_unit
0,Ags,Aguascalientes,11402,120923,4997.813674,129044.6433,159695.334,114.807074,6249.604612,3668.434872,10.6054201
1,Ags,Asientos,231,1647,29.092255,524.807734,621.055144,49.066071,807.5415528,1047.451838,7.12987013
2,Ags,Calvillo,591,4605,99.342787,1743.278407,2246.247441,59.92447,1897.09648,1354.956835,7.791878173
3,Ags,Cosio,104,468,4.577495,101.941048,136.364948,27.169366,797.1796419,809.383597,4.5
4,Ags,El Llano,104,860,14.802274,342.630524,427.291666,47.810963,1865.888567,1380.141041,8.269230769


In [34]:
#cutting dataset to this experiment
data=data[['estado',
        'municipio',
        'businesses',
        'employees',
        'income',
        'Payroll',
            ]]
data

Unnamed: 0,estado,municipio,businesses,employees,income,Payroll
0,Ags,Aguascalientes,11402,120923,159695.334000,4997.813674
1,Ags,Asientos,231,1647,621.055144,29.092255
2,Ags,Calvillo,591,4605,2246.247441,99.342787
3,Ags,Cosio,104,468,136.364948,4.577495
4,Ags,El Llano,104,860,427.291666,14.802274
...,...,...,...,...,...,...
2482,Zacatecas,Villa Garcia,147,785,160.114046,2.811121
2483,Zacatecas,Villa Gonzalez Ortega,149,875,289.174975,4.875017
2484,Zacatecas,Villa Hidalgo,96,604,219.851736,7.800028
2485,Zacatecas,Villanueva,339,2043,1282.438977,34.189010


In [36]:
#lets find cero
data['businesses']=data['businesses'].astype('float64')
data['employees']=data['employees'].astype('float64')
data.loc[data['businesses'] == 0, 'businesses'] = 1.0
data.loc[data['employees'] == 0, 'employees'] = 1.0

In [37]:
    
#Choose the Ml model to be applied, among: regression, Classifications, time_series, Clustering, NLP
from pycaret.regression import *

In [38]:
#droping rows with missing values in labels (because is regression)
data=data.dropna()
data

Unnamed: 0,estado,municipio,businesses,employees,income,Payroll
0,Ags,Aguascalientes,11402.0,120923.0,159695.334000,4997.813674
1,Ags,Asientos,231.0,1647.0,621.055144,29.092255
2,Ags,Calvillo,591.0,4605.0,2246.247441,99.342787
3,Ags,Cosio,104.0,468.0,136.364948,4.577495
4,Ags,El Llano,104.0,860.0,427.291666,14.802274
...,...,...,...,...,...,...
2482,Zacatecas,Villa Garcia,147.0,785.0,160.114046,2.811121
2483,Zacatecas,Villa Gonzalez Ortega,149.0,875.0,289.174975,4.875017
2484,Zacatecas,Villa Hidalgo,96.0,604.0,219.851736,7.800028
2485,Zacatecas,Villanueva,339.0,2043.0,1282.438977,34.189010


In [39]:
data['income_employee_day']=data['income']*1000000/data['employees']/360
data['employees_business'] = (data['employees']/data['businesses']+0.5).astype(int)
data['salary_employee_day']=data['Payroll']*1000000/data['employees']/360
data=data.drop(['municipio', 'employees', 'income','businesses','Payroll'],axis=1)
#data=data.drop(['employees_business','income_employee_day','salary_employee_day'],axis=1)
data.head()

Unnamed: 0,estado,income_employee_day,employees_business,salary_employee_day
0,Ags,3668.434872,11,114.807074
1,Ags,1047.451838,7,49.066071
2,Ags,1354.956835,8,59.92447
3,Ags,809.383597,5,27.169366
4,Ags,1380.141041,8,47.810963


In [40]:

model_to_find =   setup(data = data, #see above 
#                        log_experiment = True,
#                        experiment_name = f'{cfg.general_ml.client}-{cfg.general_ml.project}-{cfg.general_ml.experiment}',
#                        target = cfg.data_fields.label, # get the target label from cfg
                        target='salary_employee_day',
#                        session_id=cfg.general_ml.seed, # get the seed from config
#                        train_size = 1.0-float(cfg.data_pipeline.data_transform_params.percent_valid), #get %valid from cfg
                        transformation=True, 
#                        fix_imbalance = True, #8:2
                        normalize=True,
                        )

Unnamed: 0,Description,Value
0,Session id,202
1,Target,salary_employee_day
2,Target type,Regression
3,Original data shape,"(2196, 4)"
4,Transformed data shape,"(2196, 4)"
5,Transformed train set shape,"(1537, 4)"
6,Transformed test set shape,"(659, 4)"
7,Numeric features,2
8,Categorical features,1
9,Preprocess,True


In [41]:
best_model=compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,12.5492,819.547,25.75,0.6264,0.5461,0.815,0.654
rf,Random Forest Regressor,13.1044,782.0693,25.8085,0.6262,0.5692,0.8632,0.667
knn,K Neighbors Regressor,12.9955,839.7392,26.2245,0.6214,0.567,0.8356,0.169
gbr,Gradient Boosting Regressor,12.4846,785.9907,25.7443,0.6102,0.561,0.8794,0.386
et,Extra Trees Regressor,13.2537,845.3258,26.577,0.5919,0.5915,0.83,0.702
catboost,CatBoost Regressor,12.1837,870.4072,26.7938,0.5908,0.5401,0.7829,1.486
llar,Lasso Least Angle Regression,16.0331,930.7071,27.957,0.5644,0.766,2.0304,0.242
lasso,Lasso Regression,16.0332,930.7055,27.9571,0.5644,0.766,2.0304,0.174
xgboost,Extreme Gradient Boosting,12.9471,848.6823,26.9024,0.5638,0.572,0.8005,0.309
br,Bayesian Ridge,16.2218,930.9373,28.0227,0.561,0.7775,2.1167,0.389


Processing:   0%|          | 0/85 [00:00<?, ?it/s]