In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [9]:

filepath_clean = "data/clean_files/all_plata_melted.xlsx"

clean_plata_df = pd.read_excel(filepath_clean)

clean_plata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2883 entries, 0 to 2882
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   fecha_compra   2883 non-null   datetime64[ns]
 1   codigo         2883 non-null   object        
 2   talla          582 non-null    object        
 3   peso           2874 non-null   float64       
 4   costo_gramo    2874 non-null   float64       
 5   costo          2883 non-null   float64       
 6   pvp            2883 non-null   float64       
 7   detalle        2883 non-null   object        
 8   proveedor      2883 non-null   object        
 9   nota           610 non-null    object        
 10  source         2883 non-null   object        
 11  fecha_ingreso  2883 non-null   datetime64[ns]
 12  count_items    2883 non-null   int64         
dtypes: datetime64[ns](2), float64(4), int64(1), object(6)
memory usage: 292.9+ KB


In [10]:
model_df = clean_plata_df[["pvp", "costo_gramo", "peso" ]].dropna()

In [11]:
# Cargar el dataframe (ya lo tienes cargado como model_df)
# Variables predictoras y objetivo
X = model_df[['costo_gramo', 'peso']]
y = model_df['pvp']

# División de los datos en entrenamiento y test (80% - 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:

# Modelos a usar
linear_model = LinearRegression()
random_forest_model = RandomForestRegressor(random_state=42)
svr_model = SVR()

# Agregamos dos algoritmos adicionales: KNeighbors y DecisionTree
knn_model = KNeighborsRegressor()
decision_tree_model = DecisionTreeRegressor(random_state=42)

# Crear pipeline de preprocesamiento y modelo
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Normalización de los datos
    ('model', linear_model)        # Placeholder para los modelos
])

In [13]:

# Parámetros para la búsqueda por grid
param_grid = [
    {
        'model': [linear_model]
    },
    {
        'model': [random_forest_model],
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20]
    },
    {
        'model': [svr_model],
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto'],
        'model__kernel': ['rbf', 'linear']
    },
    {
        'model': [knn_model],
        'model__n_neighbors': [3, 5, 7],
        'model__weights': ['uniform', 'distance']
    },
    {
        'model': [decision_tree_model],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5, 10]
    }
]

In [18]:


# Realizar la búsqueda de hiperparámetros usando GridSearchCV con 5 folds
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=4)
grid_search.fit(X_train, y_train)

# Mostrar los mejores parámetros para cada algoritmo
print("Mejores parámetros:", grid_search.best_params_)


Fitting 5 folds for each of 37 candidates, totalling 185 fits


[CV 1/5] END ........model=LinearRegression();, score=-43.270 total time=   0.0s
[CV 2/5] END ........model=LinearRegression();, score=-40.507 total time=   0.0s
[CV 4/5] END .......model=LinearRegression();, score=-206.424 total time=   0.0s
[CV 5/5] END ........model=LinearRegression();, score=-57.683 total time=   0.0s
[CV 3/5] END .......model=LinearRegression();, score=-106.550 total time=   0.0s
[CV 1/5] END model=RandomForestRegressor(random_state=42), model__max_depth=None, model__n_estimators=50;, score=-26.861 total time=   0.2s
[CV 2/5] END model=RandomForestRegressor(random_state=42), model__max_depth=None, model__n_estimators=50;, score=-27.185 total time=   0.2s
[CV 3/5] END model=RandomForestRegressor(random_state=42), model__max_depth=None, model__n_estimators=50;, score=-76.146 total time=   0.2s
[CV 4/5] END model=RandomForestRegressor(random_state=42), model__max_depth=None, model__n_estimators=50;, score=-15.568 total time=   0.2s
[CV 5/5] END model=RandomForestRegr

In [19]:
# Hacer predicciones usando el mejor modelo
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calcular métricas
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R2 Score: {r2}")

MAE: 2.9445428598544057
MSE: 28.93416475825915
R2 Score: 0.9176406975167339


In [27]:
model_df["pvp_pred"]= best_model.predict(X)
model_df["error"] = model_df.pvp-model_df.pvp_pred
model_df["error_cuadratico"]= model_df.error*model_df.error

In [28]:
model_df

Unnamed: 0,pvp,costo_gramo,peso,pvp_pred,error,error_cuadratico
0,18.7,2.4,2.3,16.786783,1.913217,3.660401
1,18.7,2.4,2.3,16.786783,1.913217,3.660401
2,18.7,2.4,2.3,16.786783,1.913217,3.660401
3,17.8,2.4,1.4,15.528930,2.271070,5.157758
4,17.8,2.4,1.4,15.528930,2.271070,5.157758
...,...,...,...,...,...,...
2869,47.6,2.4,7.9,45.975495,1.624505,2.639016
2870,28.6,2.6,4.4,33.159223,-4.559223,20.786515
2871,37.8,2.6,5.7,37.336705,0.463295,0.214642
2872,44.5,2.6,6.7,40.117493,4.382507,19.206363


In [29]:
model_df.error_cuadratico.mean()

np.float64(13.795216512113619)

In [31]:
mean_squared_error(model_df.pvp, model_df.pvp_pred)


np.float64(13.795216512113619)