In [1]:
from lightgbm import LGBMRegressor
import pandas as pd
from typing import Dict, Tuple
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import logging
from sklearn.model_selection import GridSearchCV



In [2]:
# Show what the data catalog exposes before loading anything.
# NOTE(review): `catalog` is never defined in this notebook; presumably it is
# injected by the Kedro Jupyter/IPython session - TODO confirm.
catalog.list()

In [7]:
# Load the 'model_input_table' catalog entry; previewed in a later cell.
df = catalog.load('model_input_table')

In [4]:
# Load the 'processed_enfermedades' catalog entry and preview its first rows.
# NOTE(review): this frame is not referenced by any other cell shown here, and
# the 'Unnamed: 0' column in the stored output suggests the index was written
# into the underlying file - TODO confirm both.
df_enfermedades = catalog.load('processed_enfermedades')
df_enfermedades.head(n=10)

Unnamed: 0,Enfermedad,Sintoma,Frecuencia
0,Enfermedad de Alexander,Macrocephaly,Muy frecuente (99-80%)
1,Enfermedad de Alexander,Intellectual disability,Muy frecuente (99-80%)
2,Enfermedad de Alexander,Seizure,Muy frecuente (99-80%)
3,Enfermedad de Alexander,Spasticity,Muy frecuente (99-80%)
4,Enfermedad de Alexander,Agenesis of corpus callosum,Muy frecuente (99-80%)
5,Enfermedad de Alexander,Hyperreflexia,Muy frecuente (99-80%)
6,Enfermedad de Alexander,Megalencephaly,Muy frecuente (99-80%)
7,Enfermedad de Alexander,Failure to thrive,Muy frecuente (99-80%)
8,Enfermedad de Alexander,Frontal bossing,Muy frecuente (99-80%)
9,Enfermedad de Alexander,Nausea and vomiting,Muy frecuente (99-80%)


In [8]:
# Preview the first ten rows of the model input table.
df.head(n=10)

Unnamed: 0,id_x,shuttle_location,shuttle_type,engine_type,engine_vendor,engines,passenger_capacity,cancellation_policy,crew,d_check_complete,...,review_scores_crew,review_scores_location,review_scores_price,number_of_reviews,reviews_per_month,id_y,company_rating,company_location,total_fleet_count,iata_approved
0,63561,Niue,Type V5,Quantum,ThetaBase Services,1.0,2,strict,1.0,False,...,10.0,9.0,10.0,133,1.65,35029,1.0,Niue,4.0,False
1,63561,Niue,Type V5,Quantum,ThetaBase Services,1.0,2,strict,1.0,False,...,10.0,9.0,10.0,133,1.65,35029,1.0,Niue,4.0,False
2,63561,Niue,Type V5,Quantum,ThetaBase Services,1.0,2,strict,1.0,False,...,10.0,9.0,10.0,133,1.65,35029,1.0,Niue,4.0,False
3,63561,Niue,Type V5,Quantum,ThetaBase Services,1.0,2,strict,1.0,False,...,10.0,9.0,10.0,133,1.65,35029,1.0,Niue,4.0,False
4,53260,Niue,Type V5,Quantum,"Banks, Wood and Phillips",1.0,2,strict,1.0,False,...,10.0,9.0,10.0,37,0.48,35029,1.0,Niue,4.0,False
5,53260,Niue,Type V5,Quantum,"Banks, Wood and Phillips",1.0,2,strict,1.0,False,...,10.0,9.0,10.0,37,0.48,35029,1.0,Niue,4.0,False
6,53260,Niue,Type V5,Quantum,"Banks, Wood and Phillips",1.0,2,strict,1.0,False,...,10.0,9.0,10.0,37,0.48,35029,1.0,Niue,4.0,False
7,53260,Niue,Type V5,Quantum,"Banks, Wood and Phillips",1.0,2,strict,1.0,False,...,10.0,9.0,10.0,37,0.48,35029,1.0,Niue,4.0,False
8,51019,Niue,Type V5,Quantum,ThetaBase Services,1.0,2,flexible,1.0,False,...,10.0,9.0,9.0,10,0.15,35029,1.0,Niue,4.0,False
9,51019,Niue,Type V5,Quantum,ThetaBase Services,1.0,2,flexible,1.0,False,...,10.0,9.0,9.0,10,0.15,35029,1.0,Niue,4.0,False


In [9]:
# Training features, read from the 'X_train' catalog entry.
X_train = catalog.load('X_train')

In [10]:
# Training targets, read from the 'y_train' catalog entry.
y_train = catalog.load('y_train')

In [12]:
# Preview the first ten training targets.
y_train.head(n=10)

In [13]:
def train_linear_regression_model(X_train: pd.DataFrame, y_train: pd.Series) -> LinearRegression:
    """Fit an ordinary least-squares regressor with default settings.

    Args:
        X_train: Training features.
        y_train: Training targets.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    # scikit-learn's ``fit`` returns the estimator itself, so the fitted
    # instance can be returned directly.
    return LinearRegression().fit(X_train, y_train)

In [15]:
# Fit the linear baseline on the training split.
lr = train_linear_regression_model(X_train=X_train, y_train=y_train)

In [16]:
# Display the fitted linear model's repr.
lr

LinearRegression()

In [17]:
def evaluate_model(regressor, X_test=None, y_test=None):
    """Print the R^2 score of a fitted regressor on held-out data.

    Args:
        regressor: Any fitted estimator exposing ``predict``. This notebook
            passes LinearRegression, RandomForestRegressor and LGBMRegressor
            instances, so the parameter is not tied to a single class.
        X_test: Test features. When omitted, they are loaded from the
            'X_test' catalog entry.
        y_test: Test targets. When omitted, they are loaded from the
            'y_test' catalog entry.

    Returns:
        None. The score is reported on standard output only.
    """
    # Fall back to the catalog only for inputs the caller did not supply, so
    # the function can also be used (and tested) without a catalog.
    if X_test is None:
        X_test = catalog.load('X_test')
    if y_test is None:
        y_test = catalog.load('y_test')

    y_pred = regressor.predict(X_test)
    # r2_score is the coefficient of determination, not a correlation.
    score = r2_score(y_test, y_pred)
    print('Model has a coefficient R^2 of %.3f on test data.' % score)
    

In [18]:
# Report the linear baseline's R^2 on the test split.
evaluate_model(lr)

Model has a coefficient R^2 of 0.462 on test data.


In [19]:
def train_random_forest_regression_model(X_train: pd.DataFrame, y_train: pd.Series) -> RandomForestRegressor:
    """Fit a random-forest regressor whose tree depth comes from the catalog.

    The depth limit is read from the 'params:model_options.max_depth' catalog
    entry; every other hyper-parameter is left at the library default.

    Args:
        X_train: Training features.
        y_train: Training targets.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    # NOTE(review): no random_state is set, so repeated fits may differ.
    depth_limit = catalog.load('params:model_options.max_depth')
    forest = RandomForestRegressor(max_depth=depth_limit)
    # ``fit`` returns the estimator itself.
    return forest.fit(X_train, y_train)

In [20]:
# Fit the random-forest model on the training split.
fr = train_random_forest_regression_model(X_train=X_train, y_train=y_train)

In [21]:
# Display the fitted forest's repr, which shows the max_depth actually used.
fr

RandomForestRegressor(max_depth=25)

In [22]:
# Report the random forest's R^2 on the test split.
evaluate_model(fr)

Model has a coefficient R^2 of 0.797 on test data.


In [35]:
# Hyper-parameters forwarded to LGBMRegressor by train_lgbm_regression_model.
params = dict(max_depth=5, n_estimators=1000, learning_rate=0.05)

In [36]:
def train_lgbm_regression_model(X_train: pd.DataFrame, y_train: pd.Series, params:Dict) -> LGBMRegressor:
    """Fit a LightGBM regressor configured from a hyper-parameter mapping.

    Args:
        X_train: Training features.
        y_train: Training targets.
        params: Keyword arguments unpacked into the ``LGBMRegressor``
            constructor.

    Returns:
        The fitted ``LGBMRegressor``.
    """
    booster = LGBMRegressor(**params)
    # ``fit`` returns the estimator itself.
    return booster.fit(X_train, y_train)

In [40]:
# Fit the LightGBM model with the hyper-parameters defined in `params`.
lgbm = train_lgbm_regression_model(X_train=X_train, y_train=y_train, params=params)

In [41]:
# Display the fitted LightGBM estimator's repr.
lgbm

LGBMRegressor(learning_rate=0.05, max_depth=5, n_estimators=1000)

In [42]:
# Report the LightGBM model's R^2 on the test split.
evaluate_model(lgbm)

Model has a coefficient R^2 of 0.744 on test data.


In [28]:
# Candidate max_depth values for the random-forest grid search.
params_grid = dict(max_depth=[15, 20, 25, 30])

In [29]:
# Grid search over the depths in `params_grid`, using a default-configured
# RandomForestRegressor and GridSearchCV's default scoring and CV settings.
reg_test = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=params_grid,
)

In [30]:
# Run the grid search on the training split (one forest is fitted per
# candidate depth and per cross-validation fold, so this cell can be slow).
reg_test.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'max_depth': [15, 20, 25, 30]})

In [38]:
# Depth setting that achieved the best mean cross-validated score.
reg_test.best_params_

In [39]:
# Mean cross-validated score of the best depth setting (computed on training
# folds, not on the test split).
# NOTE(review): the output stored beneath this cell is an evaluate_model
# message rather than a float, so it looks stale - re-run to refresh it.
reg_test.best_score_

Model has a coefficient R^2 of 0.717 on test data.
