In [61]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib

In [62]:
# Load the modelling table from the CSV that sits next to the notebook.
# The bare name on the last line renders pandas' truncated preview of the frame.
Dataset = pd.read_csv(filepath_or_buffer='Dataset_modelo.csv')
Dataset

Unnamed: 0,Year,Country Code 1,Country 1,Country Code,Country 2,Diff,Value,Five-year change immigrants,Emigrants,International migrants,...,Share below $3.65 a day_2,Share below $6.85 a day_2,Share below $10 a day_2,Share below $20 a day_2,Share below $30 a day_2,Share below $40 a day_2,40% of median - share of population below poverty line_2,50% of median - share of population below poverty line_2,60% of median - share of population below poverty line_2,Are Borders
0,1990,ALB,Albania,AFG,Afghanistan,0.0,0.0,0.0,180204.0,66013.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0
1,1990,DZA,Algeria,AFG,Afghanistan,0.0,0.0,0.0,921665.0,273954.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0
2,1990,AND,Andorra,AFG,Afghanistan,0.0,0.0,0.0,3740.0,38891.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0
3,1990,AGO,Angola,AFG,Afghanistan,0.0,0.0,0.0,824886.0,33517.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0
4,1990,ARG,Argentina,AFG,Afghanistan,0.0,20.0,0.0,430155.0,1649919.0,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259021,2020,VNM,Vietnam,ALB,Albania,0.0,0.0,3974.0,3392025.0,76767.0,...,0.797915,10.93692,30.713816,76.654225,92.422544,97.425297,3.97702,9.215007,16.576506,0
259022,2020,YEM,Yemen,ALB,Albania,0.0,0.0,7231.0,1301166.0,387113.0,...,0.797915,10.93692,30.713816,76.654225,92.422544,97.425297,3.97702,9.215007,16.576506,0
259023,2020,ZMB,Zambia,ALB,Albania,0.0,0.0,55848.0,200700.0,187955.0,...,0.797915,10.93692,30.713816,76.654225,92.422544,97.425297,3.97702,9.215007,16.576506,0
259024,2020,ZWE,Zimbabwe,ALB,Albania,0.0,0.0,15659.0,1243314.0,416141.0,...,0.797915,10.93692,30.713816,76.654225,92.422544,97.425297,3.97702,9.215007,16.576506,0


In [63]:
# Give Dataset a fresh 0..n-1 row index. The original did this by passing a
# single frame to pd.concat(..., ignore_index=True), which is only a costlier
# way of spelling the same thing.
Dataset = Dataset.reset_index(drop=True)

# Row-shuffled copy of the table. The shuffle is seeded so that
# "Restart Kernel -> Run All" always produces the same row order.
# NOTE(review): no later cell visible here reads shuffled_data; the model
# is trained on Dataset itself. Confirm whether this variable is still needed.
shuffled_data = Dataset.sample(frac=1, random_state=26).reset_index(drop=True)

In [64]:
# Standardize the columns at positions 5..57 of Dataset, overwriting them in place.
# The scaler is fit on every row of Dataset, i.e. before the train/test split
# that a later cell performs.
# NOTE(review): this slice starts one position before the feature slice (6:58)
# used to build X, so it presumably also rescales the 'Diff' target - confirm.
# NOTE(review): because Dataset is overwritten, running this cell a second time
# re-fits the scaler on already-standardized values; run it once per kernel.
columns_to_standardize = Dataset.columns[5:58]
scaler = StandardScaler()
scaler.fit(Dataset[columns_to_standardize])
Dataset[columns_to_standardize] = scaler.transform(Dataset[columns_to_standardize])

In [65]:
# Persist the fitted scaler to disk; joblib.dump returns the list of written
# file names, which the notebook shows as this cell's output.
joblib.dump(value=scaler, filename="scaler_entrenado.joblib")

['scaler_entrenado.joblib']

In [66]:
# Features: the columns at positions 6..57. Target: the 'Diff' column.
feature_columns = Dataset.columns[6:58]
X = Dataset.loc[:, feature_columns]
y = Dataset.loc[:, 'Diff']

# Hold out 30 % of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=26,
)

In [67]:
# Base estimator handed to the grid search. The search grid in this notebook
# sets subsample=0.8, so every boosting stage draws a random subset of rows;
# without a fixed random_state the fitted model and all reported metrics
# change from run to run. GridSearchCV clones the estimator, so the seed
# carries over to every fit.
model = GradientBoostingRegressor(random_state=26)

In [68]:
# Hyper-parameter grid for GridSearchCV. Every entry lists exactly one value,
# so the "search" evaluates a single configuration.
param_grid = dict(
    n_estimators=[250],
    learning_rate=[0.5],
    max_depth=[7],
    min_samples_split=[5],
    min_samples_leaf=[4],
    max_features=[1.0],
    subsample=[0.8],
    loss=['squared_error'],
)

In [69]:
# Fit the grid search on the training split using 2-fold cross-validation.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2)
grid_search.fit(X_train, y_train)

# Keep the estimator that GridSearchCV refit with the winning parameters.
best_model = grid_search.best_estimator_

print("Mejores hiperparámetros encontrados:")
print(grid_search.best_params_)

# Predictions for the held-out rows (a later cell computes them again
# before scoring).
y_pred = best_model.predict(X_test)

Mejores hiperparámetros encontrados:
{'learning_rate': 0.5, 'loss': 'squared_error', 'max_depth': 7, 'max_features': 1.0, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 250, 'subsample': 0.8}


In [70]:
# Persist the tuned model to disk; the returned list of written file names is
# what the notebook displays for this cell.
joblib.dump(value=best_model, filename="modelo_entrenado.joblib")

['modelo_entrenado.joblib']

In [71]:
# Score the tuned model on the held-out test split.
# NOTE(review): if 'Diff' was among the columns standardized earlier, both
# numbers are expressed in standardized units of the target - confirm.
y_pred = best_model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 0.6769553887807316
R^2 Score: 0.42426251711586194


In [72]:
# 2-fold cross-validation of the tuned estimator over the full X / y;
# cross_val_score clones best_model and refits it on each training fold.
# The metric is R^2: that is what a regressor's default scorer returns, and it
# is now requested explicitly. The summary line used to call this value
# "Accuracy", which is a classification metric and mislabels the number.
# NOTE(review): an integer cv on a regressor uses KFold without shuffling.
# The displayed frame looks ordered by year, so the two folds are presumably
# chronological halves, which may explain the gap to the hold-out R^2 - confirm.
cv_scores = cross_val_score(best_model, X, y, cv=2, scoring="r2")
print("Cross Validation Scores:", cv_scores)
print("Cross Validation Mean R^2:", cv_scores.mean())

Cross Validation Scores: [0.25746911 0.20423513]
Cross Validation Accuracy: 0.23085212240915676
