## Prueba de Modelos ML

### 1.) Random Forest

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [50]:
df = pd.read_csv("df_limpio.csv")

In [51]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

In [52]:
X = df[['delivery_person_age', 'delivery_person_ratings', 'multiple_deliveries', 'total_distance', 'weather_category',
       'road_traffic_category', 'vechicle_type_category', 'vehicle_condition',
       'festival_bool', 'city_category',
       'valley_or_peak']]
Y = df['time_taken_min']

In [55]:
from sklearn.model_selection import train_test_split

X = df.drop(columns= 'time_taken_min')
Y = df['time_taken_min']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size = 0.20,
                                                    random_state=42)

In [54]:
scaler = MinMaxScaler()

# Escalar las columnas restantes
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns= X_train.columns)

In [56]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [57]:
rf = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=12)
rf.fit(X_train, y_train)

y_train_predict = rf.predict(X_train_scaled)
y_test_predict = rf.predict(X_test_scaled)

training_rmse_rf = np.sqrt(mse(y_train_predict, y_train))
test_rmse_rf = np.sqrt(mse(y_test_predict, y_test))

print("MSE in train:",training_rmse_rf)
print("MSE in test:",test_rmse_rf)
print("MAE in test:", mean_absolute_error(y_test, y_test_predict))
print("MAPE in test:", mean_absolute_percentage_error(y_test, y_test_predict))

MSE in train: 10.363270001389807
MSE in test: 10.506780259706142
MAE in test: 8.09874067023386
MAPE in test: 0.36608107283713076


In [None]:
rf.shape

In [58]:
from sklearn.model_selection import GridSearchCV

grid_random_forest = {"n_estimators": range(1,250)
                     #"max_depth": [3,4,5,6,10,15],
                     #"max_features": ["sqrt", 3, 4, 5, 6]                          
                     }

In [59]:
model_rf = GridSearchCV(rf, grid_random_forest)

In [60]:
model_rf.fit(X_train_scaled, y_train)

In [None]:
model_rf.best_params_

{'max_depth': 15, 'max_features': 6, 'n_estimators': 250}

In [None]:
y_rf_grid_pred  = model_rf.best_estimator_.predict(X_test_scaled)

In [None]:
print("MAE in train:", mean_absolute_error(y_train, rf.predict(X_train_scaled)))
print("MAPE in train:",mean_absolute_percentage_error(y_train, rf.predict(X_train_scaled)))
print("MAE in test:", mean_absolute_error(y_test, y_rf_grid_pred))
print("MAPE in test:",mean_absolute_percentage_error(y_test, y_rf_grid_pred))

MAE in train: 7.981097196984167
MAPE in train: 0.35988753548454405
MAE in test: 3.0139527640294053
MAPE in test: 0.12854598918019933


### 2.) XGBoost

In [48]:
import xgboost

xgb_reg = xgboost.XGBRegressor(random_state=42)

xgb_reg.fit(X_train_scaled, y_train)
y_pred = xgb_reg.predict(X_test_scaled)

print("MAE in train:", mean_absolute_error(y_train, xgb_reg.predict(X_train_scaled)))
print("MAPE in train:",mean_absolute_percentage_error(y_train, xgb_reg.predict(X_train_scaled)))

print("MAE in test:", mean_absolute_error(y_test, y_pred))
print("MAPE in test:",mean_absolute_percentage_error(y_test, y_pred))

MAE in train: 2.5849859383084874
MAPE in train: 0.1107567058993458
MAE in test: 3.0562606049014107
MAPE in test: 0.12959493083469428


In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, r2_score, mean_absolute_error

In [42]:
grid_xg_boost = {"n_estimators": [24],
                    "learning_rate": [0.3],
                    "max_depth": [7]
}                        
                     

In [43]:
model = GridSearchCV(xgb_reg, grid_xg_boost, scoring= make_scorer(mean_absolute_error, greater_is_better=False))

In [44]:
model.fit(X_train, y_train)

In [45]:
model.best_params_

{'learning_rate': 0.3, 'max_depth': 7, 'n_estimators': 24}

In [46]:
y_xg_grid_pred  = model.predict(X_test_scaled)

In [47]:
print("MAE in test:", mean_absolute_error(y_test, y_xg_grid_pred))
print("MAPE in test:",mean_absolute_percentage_error(y_test, y_xg_grid_pred))

MAE in test: 7.2574322542072505
MAPE in test: 0.3152656218698612
