# MAKING A FINAL CSV TO BE SENT TO MACHINE LEARNING MODEL

In [20]:
import pandas as pd
import numpy as np
from dvc import api
from io import StringIO

In [21]:
df_path = api.read('../dataset/data_prueba_limpia.csv', remote='dataset-track')

In [22]:
df_scoring = pd.read_csv(StringIO(df_path))

In [23]:
df_scoring.head()

Unnamed: 0,ingreso,antiguedad_laboral_meses,trabajos_ultimos_5,edad,crecimiento_ingreso,lugar_actual,target,target_cientos,target_regression,target_etiquetas
0,6620.78,18,7,32,45.11,YUCATÁN,0.54,540.0,0,desfavorable
1,21333.59,24,4,49,29.45,MÉXICO,0.64,640.0,0,aceptable
2,0.0,0,0,39,0.0,DISTRITO FEDERAL,0.26,260.0,0,desfavorable
3,21225.31,36,6,30,441.83,NUEVO LEÓN,0.62,620.0,0,aceptable
4,0.0,0,1,23,0.0,HIDALGO,0.4,400.0,0,desfavorable


# Modeling

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_validate

In [25]:
X = df_scoring.drop(['target', 'lugar_actual', 'target_etiquetas', 'target_cientos', 'target_regression'], axis = 1)
y = df_scoring['target']

In [26]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor())
])

In [27]:
results = cross_validate(pipeline ,X,y,return_train_score=True,cv=2)
results

{'fit_time': array([0.01660204, 0.01592994]),
 'score_time': array([0.00171423, 0.00180721]),
 'test_score': array([0.39174707, 0.10550588]),
 'train_score': array([1., 1.])}

In [28]:
train_score = np.mean(results['train_score'])
test_score = np.mean(results['test_score'])
# assert train_score > 0.7
# assert test_score > 0.65
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: 0.9999999992944921
Test Score: 0.24862647790013137


## Hyperparameter tunning

In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
param_tunning = {'core_model__n_estimators': range(20,501,20)} 

In [31]:
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor())
])

In [32]:
grid_search= GridSearchCV(estimator,
                       param_grid = param_tunning,
                       scoring='r2',
                       cv=2) 

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.35,random_state= 42)

In [34]:
grid_search.fit(X_train, y_train)



 nan nan nan nan nan nan nan]


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('imputer', SimpleImputer()),
                                       ('core_model',
                                        GradientBoostingRegressor())]),
             param_grid={'core_model__n_estimators': range(20, 501, 20)},
             scoring='r2')

In [35]:
final_result = cross_validate(grid_search.best_estimator_,X_train,y_train,return_train_score=True,cv=2)




In [36]:
train_score = np.mean(final_result['train_score'])
test_score = np.mean(final_result['test_score'])
# assert train_score > 0.7
# assert test_score > 0.65
print(f'Train Score: {train_score}')
print(f'Test Score: {test_score}')

Train Score: nan
Test Score: nan


In [37]:
grid_search.best_estimator_.get_params()

{'memory': None,
 'steps': [('imputer', SimpleImputer()),
  ('core_model', GradientBoostingRegressor(n_estimators=20))],
 'verbose': False,
 'imputer': SimpleImputer(),
 'core_model': GradientBoostingRegressor(n_estimators=20),
 'imputer__add_indicator': False,
 'imputer__copy': True,
 'imputer__fill_value': None,
 'imputer__missing_values': nan,
 'imputer__strategy': 'mean',
 'imputer__verbose': 0,
 'core_model__alpha': 0.9,
 'core_model__ccp_alpha': 0.0,
 'core_model__criterion': 'friedman_mse',
 'core_model__init': None,
 'core_model__learning_rate': 0.1,
 'core_model__loss': 'squared_error',
 'core_model__max_depth': 3,
 'core_model__max_features': None,
 'core_model__max_leaf_nodes': None,
 'core_model__min_impurity_decrease': 0.0,
 'core_model__min_samples_leaf': 1,
 'core_model__min_samples_split': 2,
 'core_model__min_weight_fraction_leaf': 0.0,
 'core_model__n_estimators': 20,
 'core_model__n_iter_no_change': None,
 'core_model__random_state': None,
 'core_model__subsample': 1

In [38]:
estimator = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('core_model', GradientBoostingRegressor(n_estimators=220,
                                             alpha=0.9,
                                             ccp_alpha=0.0,
                                             criterion='friedman_mse',
                                             init=None,
                                             learning_rate=0.1,
                                             loss='squared_error',
                                             max_depth=3,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_iter_no_change=None,
                                             random_state=None,
                                             subsample=1.0,
                                             tol=0.0001,
                                             validation_fraction=0.1,
                                             verbose=0,
                                             warm_start=False))
])

In [39]:
estimator.fit(X_train,y_train)

Pipeline(steps=[('imputer', SimpleImputer()),
                ('core_model', GradientBoostingRegressor(n_estimators=220))])

In [40]:
estimator.score(X_test, y_test)

0.254694588356454

## Saving model

In [41]:
from joblib import dump

In [43]:
dump(estimator, 'model/model.pkl')

['../model/model.pkl']

In [55]:
def save_metrics_report(train_score: float, test_score: float, validation_score: float,
                        model: Pipeline):
    """Create a report file.

    Args:
        train_score (float): Train score of model.
        test_score (float): Test score of model.
        validation_score (float): Validation score of model.
        model (Pipeline): Modele pipeline.
    """
    with open('report.txt', 'w', encoding='utf-8') as report_file:

        report_file.write('# Model Pipeline Description'+'\n')

        for key, value in model.named_steps.items():
            report_file.write(f'### {key}:{value.__repr__()}'+'\n')

        report_file.write(f'### Train Score: {train_score}'+'\n')
        report_file.write(f'### Test Score: {test_score}'+'\n')
        report_file.write(f'### Validation Score: {validation_score}'+'\n')

In [57]:
validation_score = grid_search.best_estimator_.score(X_test, y_test)
save_metrics_report(train_score, test_score, validation_score, grid_search.best_estimator_)

In [58]:
y_test_pred = grid_search.best_estimator_.predict(X_test)