In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
import itertools
import os


# Load the accumulated-demand series and index it by its 'year_month' column.
data = pd.read_csv('bds/dataset_demand_acumulate.csv', parse_dates=['year_month'])
data['year_month'] = pd.to_datetime(data['year_month'])
data = data.set_index('year_month')

print(data.head())

# Plot demand over time on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(data.index, data['Demand'], label='Demanda', color='b')
ax.set_title('Demanda acumulada de Cementos Argos (2017-2022)')
ax.set_xlabel('Fecha')
ax.set_ylabel('Demanda')

# X axis: one major tick per month, rendered as YYYY-MM.
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

ax.legend()
ax.grid(True)
plt.xticks(rotation=45)
fig.tight_layout()  # keep rotated tick labels inside the figure
plt.show()

In [None]:

# Chronological 70/30 split: the first 70% of the rows are used for training
# and the remaining rows are held out for validation (no shuffling).
train_size = int(len(data) * 0.7)
train_data, validation_data = data.iloc[:train_size], data.iloc[train_size:]

# Re-join both partitions so the whole series can be drawn as one continuous line.
combined_data = pd.concat([train_data, validation_data], axis=0)

plt.figure(figsize=(12, 6))

# Whole series as a single continuous line.
plt.plot(combined_data.index, combined_data['Demand'], label='Demanda', color='b')

# Overlay the validation segment in a contrasting style to mark where the split falls.
plt.plot(validation_data.index, validation_data['Demand'], label='Validación', color='r', linestyle='--')

# X axis: one major tick per month, rendered as YYYY-MM.
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

# Title and axis labels. The y axis shows demand values, so it is labelled
# 'Demanda' like the other figures (it previously read 'Test').
plt.title('División de Datos 70-30')
plt.xlabel('Fecha')
plt.ylabel('Demanda')
plt.legend()

plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()  # keep rotated tick labels inside the figure

plt.show()

In [None]:
# Fit an exponential smoothing model on the training demand with additive
# seasonality and a 12-period season; no trend argument is passed.
model = ExponentialSmoothing(train_data['Demand'], seasonal='add', seasonal_periods=12).fit()

In [None]:
# Predict over the validation window (first to last validation timestamp).
validation_predictions = model.predict(start=validation_data.index[0], end=validation_data.index[-1])

# Hold-out error: RMSE between observed and predicted validation demand.
mse = mean_squared_error(validation_data['Demand'], validation_predictions)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')

# Plot training data, validation data and validation predictions.
plt.figure(figsize=(12, 6))
# Draw the training partition under the 'Entrenamiento' label. This used to
# plot `combined_data` (the whole series), which mislabelled the line and made
# the cell depend on `combined_data` still being date-indexed.
plt.plot(train_data.index, train_data['Demand'], label='Entrenamiento')
plt.plot(validation_data.index, validation_data['Demand'], label='Validación')
plt.plot(validation_data.index, validation_predictions, label='Predicciones de Validación', linestyle='--')
plt.title('Evaluación del Modelo')
plt.xlabel('Fecha')
plt.ylabel('Demanda')

# X axis: one major tick per month, rendered as YYYY-MM.
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()  # keep rotated tick labels inside the figure
plt.show()


In [None]:
# Forecast May, June and July 2022 from a model refitted on the full series.
# NOTE: this rebinds `model`, replacing the model fitted on the training split.
model = ExponentialSmoothing(data['Demand'], seasonal='add', seasonal_periods=12).fit()
forecast = model.predict(start='2022-05-01', end='2022-07-01')
print(forecast)

In [None]:
# Turn the forecast series into a two-column frame: date and forecast value.
forecast_df = forecast.reset_index()
forecast_df.columns = ['year_month', 'Demand']

# Make sure the 'year_month' column is datetime-typed.
forecast_df['year_month'] = pd.to_datetime(forecast_df['year_month'])

# Rebuild `combined_data` from the two partitions instead of resetting its own
# index in place. The result is the same on the first run, and re-running this
# cell no longer piles up extra 'index' columns.
combined_data = pd.concat([train_data, validation_data], axis=0).reset_index()

# Historical observations followed by the forecast rows, renumbered from 0.
df_last = pd.concat([combined_data, forecast_df], axis=0).reset_index(drop=True)

In [None]:
# Observed data, validation data, validation predictions and the forecast on one figure.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(combined_data['year_month'], combined_data['Demand'], label='Data')
ax.plot(validation_data.index, validation_data['Demand'], label='Validación')
ax.plot(validation_data.index, validation_predictions, label='Predicciones de Validación', linestyle='--')
ax.plot(forecast.index, forecast, label='Pronósticos', linestyle='--')

ax.set_title('Pronósticos de Demanda (Mayo - Julio 2022)')
ax.set_xlabel('Fecha')
ax.set_ylabel('Demanda')

# X axis: one major tick per month, rendered as YYYY-MM.
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

ax.legend()
ax.grid(True)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Optimizar modelo

def tes_optimizer(train_data, validation_data, abg, trend_mode='add', seasonal_mode='add', seasonal_period=12, step=None):
    """Grid-search Holt-Winters smoothing parameters by validation RMSE.

    For every (alpha, beta, gamma) candidate a model is fitted on
    ``train_data`` with those smoothing values, ``step`` periods are forecast,
    and the forecast is scored against ``validation_data``.

    Parameters
    ----------
    train_data : sequence or Series
        Observations used to fit each candidate model.
    validation_data : sequence or Series
        Held-out observations the forecast is compared with.
    abg : iterable of 3-tuples
        Candidate ``(smoothing_level, smoothing_trend, smoothing_seasonal)``
        combinations.
    trend_mode, seasonal_mode, seasonal_period
        Forwarded to ``ExponentialSmoothing`` as ``trend``, ``seasonal`` and
        ``seasonal_periods``.
    step : int or None
        Forecast horizon. ``None`` (the default) uses ``len(validation_data)``
        so the forecast always has as many values as the hold-out set; an
        explicit integer is honoured as before.

    Returns
    -------
    tuple
        ``(best_alpha, best_beta, best_gamma, best_rmse)``. When ``abg`` is
        empty this is ``(None, None, None, inf)``.
    """
    # The metric needs forecast and validation data of equal length, so derive
    # the horizon from the hold-out set unless the caller pins it explicitly.
    horizon = len(validation_data) if step is None else step

    best_alpha, best_beta, best_gamma, best_rmse = None, None, None, float("inf")

    for alpha, beta, gamma in abg:
        tes_model = ExponentialSmoothing(
            train_data, trend=trend_mode, seasonal=seasonal_mode, seasonal_periods=seasonal_period
        ).fit(smoothing_level=alpha, smoothing_trend=beta, smoothing_seasonal=gamma)
        y_pred = tes_model.forecast(horizon)
        rmse = mean_squared_error(validation_data, y_pred) ** 0.5

        if rmse < best_rmse:
            best_alpha, best_beta, best_gamma, best_rmse = alpha, beta, gamma, rmse

        # Progress log: the current candidate, then the best candidate so far.
        # The score is an RMSE, so it is labelled as such (it used to read "best_mae").
        print([round(alpha, 2), round(beta, 2), round(gamma, 2), round(rmse, 2)])
        print("best_alpha:", round(best_alpha, 2), "best_beta:", round(best_beta, 2), "best_gamma:", round(best_gamma, 2),
              "best_rmse:", round(best_rmse, 4))

    return best_alpha, best_beta, best_gamma, best_rmse


In [None]:
def plot_co2(train, test, y_pred, title, train_start="2015"):
    """Print the forecast RMSE and plot train, test and prediction series together.

    Parameters
    ----------
    train : pandas.Series
        Training observations; label-sliced from ``train_start`` onwards
        before plotting.
    test : pandas.Series
        Held-out observations.
    y_pred : pandas.Series
        Forecast to compare with ``test``.
    title : str
        Figure title prefix; the rounded RMSE is appended to it.
    train_start : label or None
        First index label of the training history to draw. Defaults to
        ``"2015"``, the value that used to be hard-coded; pass ``None`` to draw
        the whole training series.
    """
    mse = mean_squared_error(test, y_pred)
    rmse = mse ** 0.5
    print(f'RMSE: {rmse}')

    history = train if train_start is None else train.loc[train_start:]

    # Draw on a dedicated figure so the plot never lands on a leftover axes
    # and the figure size is fixed up front rather than by the second call.
    fig, ax = plt.subplots(figsize=(12, 6))
    history.plot(ax=ax, legend=True, label="TRAIN", title=f"{title}, RMSE: {round(rmse,2)}")
    test.plot(ax=ax, legend=True, label="TEST")
    y_pred.plot(ax=ax, legend=True, label="PREDICTION")
    ax.grid(True)
    plt.show()

In [None]:
# One shared candidate grid for alpha, beta and gamma; `abg` is every
# (alpha, beta, gamma) combination drawn from it.
smoothing_grid = np.arange(0.10, 1, 0.20)
alphas = betas = gammas = smoothing_grid
abg = list(itertools.product(smoothing_grid, repeat=3))

In [None]:
# Baseline triple exponential smoothing: additive trend and seasonality with a
# 12-period season, and all three smoothing parameters pinned to 0.5.
tes_spec = ExponentialSmoothing(train_data['Demand'], trend="add", seasonal="add", seasonal_periods=12)
model = tes_spec.fit(smoothing_level=0.5, smoothing_trend=0.5, smoothing_seasonal=0.5)

In [None]:
# Forecast as many steps as there are validation rows, so the prediction
# lines up with the hold-out set instead of relying on a hard-coded 20.
y_pred = model.forecast(len(validation_data))

In [None]:
# Print the RMSE and plot train / validation / prediction for the current y_pred.
plot_co2(train_data['Demand'], validation_data['Demand'], y_pred, "Forecast")

In [None]:
# Grid-search the smoothing parameters over `abg`.
# NOTE: despite its name, `best_mae` receives the RMSE that tes_optimizer returns.
best_alpha, best_beta, best_gamma, best_mae = tes_optimizer(train_data['Demand'],validation_data['Demand'], abg)

In [None]:
# Refit on the training split with the smoothing parameters chosen by the grid search.
final_tes_model = ExponentialSmoothing(
    train_data['Demand'],
    trend="add",
    seasonal="add",
    seasonal_periods=12,
).fit(
    smoothing_level=best_alpha,
    smoothing_trend=best_beta,
    smoothing_seasonal=best_gamma,
)

In [None]:
# Forecast as many steps as there are validation rows, so the prediction
# lines up with the hold-out set instead of relying on a hard-coded 20.
y_pred = final_tes_model.forecast(len(validation_data))

In [None]:
# Print the RMSE and plot train / validation / prediction for the current y_pred.
plot_co2(train_data['Demand'], validation_data['Demand'], y_pred, "Triple Exponential Smoothing")

In [None]:
# Refit on the full series, keeping the smoothing parameters chosen by the grid search.
# This rebinds `final_tes_model`, replacing the model fitted on the training split.
final_tes_model = ExponentialSmoothing(
    data['Demand'],
    trend="add",
    seasonal="add",
    seasonal_periods=12,
).fit(
    smoothing_level=best_alpha,
    smoothing_trend=best_beta,
    smoothing_seasonal=best_gamma,
)

In [None]:
# Predict 2022-05-01 through 2022-07-01 with the current final model and display it.
# NOTE: this rebinds `forecast`, replacing the earlier forecast.
forecast = final_tes_model.predict(start='2022-05-01', end='2022-07-01')
forecast

In [None]:
# Observed data, validation data, tuned validation predictions and the forecast on one figure.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(combined_data['year_month'], combined_data['Demand'], label='Data')
ax.plot(validation_data.index, validation_data['Demand'], label='Validación')
ax.plot(validation_data.index, y_pred, label='Predicciones de Validación', linestyle='--')
ax.plot(forecast.index, forecast, label='Pronósticos', linestyle='--')

ax.set_title('Pronósticos de Demanda (Mayo - Julio 2022)')
ax.set_xlabel('Fecha')
ax.set_ylabel('Demanda')

# X axis: one major tick per month, rendered as YYYY-MM.
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

ax.legend()
ax.grid(True)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

Punto 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib


# 1. Load and explore the data
# NOTE: this rebinds `data`, which earlier in the notebook held the demand time series.
data = pd.read_csv('bds/dataset_alpha_betha.csv')



In [None]:
# Show the dtype of every column.
data.dtypes

In [None]:
# Count missing values per column.
data.isna().sum()

In [None]:
# Convert 'Demand' to numeric; errors='coerce' turns unparseable values into NaN.
data['Demand'] = pd.to_numeric(data['Demand'], errors='coerce')

In [None]:
# Count missing values per column again, now that 'Demand' has been coerced.
data.isna().sum()

In [None]:
# Show the rows whose 'Demand' is missing.
data[data['Demand'].isna()]

In [None]:
# Drop rows containing missing values, modifying `data` in place.
data.dropna(inplace=True)

In [None]:
# Separate features and labels
X = data.drop('Class', axis=1)  # Make sure 'Class' is the name of the label column
y = data['Class']

In [None]:
# Object-typed columns are treated as categorical, everything else as numerical.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessor: standardise numerical columns and one-hot encode categorical
# ones. Categories unseen during fit are ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ])

# Pipeline: preprocessing -> TruncatedSVD -> random forest.
# TruncatedSVD's default solver is randomized, so it gets the same fixed seed
# as the classifier; without it repeated runs can produce different results.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svd', TruncatedSVD(n_components=10, random_state=42)),  # Adjust the number of components as needed
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [None]:
# 3. Split the data into training and test sets (70-30), with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 4. Train the classification model (fits every pipeline step on the training set)
pipeline.fit(X_train, y_train)

In [None]:
# 5. Model evaluation on the held-out test set.
# NOTE: this rebinds `y_pred`, which earlier held the time-series forecast.
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Emit all three metrics, one item per line.
print(
    f'Accuracy: {accuracy}',
    'Confusion Matrix:',
    conf_matrix,
    'Classification Report:',
    class_report,
    sep='\n',
)



In [None]:
# Plot the confusion matrix as a heat map, labelling both axes with the
# classes known to the fitted classifier.
class_labels = pipeline.named_steps['classifier'].classes_
tick_positions = np.arange(len(class_labels))

plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.xticks(tick_positions, class_labels, rotation=45)
plt.yticks(tick_positions, class_labels)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()


In [None]:
# Create the output folders first; writing into a missing directory raises
# FileNotFoundError on a fresh checkout.
os.makedirs('api_project/model', exist_ok=True)
os.makedirs('results', exist_ok=True)

# Save the trained pipeline (preprocessing, TruncatedSVD and model together).
joblib.dump(pipeline, 'api_project/model/classification_pipeline.pkl')

# Save the evaluation metrics to a text file.
with open('results/model_metrics.txt', 'w') as f:
    f.write(f'Accuracy: {accuracy}\n')
    f.write('Confusion Matrix:\n')
    f.write(np.array2string(conf_matrix))
    f.write('\nClassification Report:\n')
    f.write(class_report)

In [None]:
# Additional information: model type, split sizes and number of estimators.
info_lines = [
    'Modelo: RandomForestClassifier\n',
    f'Número de datos de entrenamiento: {len(X_train)}\n',
    f'Número de datos de testeo: {len(X_test)}\n',
    f'Número de estimadores: {pipeline.named_steps["classifier"].n_estimators}\n',
]
with open('results/model_info.txt', 'w') as f:
    f.writelines(info_lines)

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
import joblib

# Load the trained pipeline saved earlier.
# NOTE: this rebinds `model`, which earlier held the exponential smoothing fit.
model = joblib.load('api_project/model/classification_pipeline.pkl')

# Load the dataset to predict on
df_to_predict = pd.read_csv('bds/to_predict.csv')




In [None]:
# Rebuild the forecast frame from the current `forecast` series, with
# the date in 'year_month' and the forecast value in 'Demand'.
forecast_df = forecast.reset_index()
forecast_df.columns = ['year_month', 'Demand']

In [None]:
# Display the rows to be classified.
df_to_predict

In [None]:
# Fill 'Demand' with the forecast values (pandas aligns them on the row index).
df_to_predict['Demand'] = forecast_df['Demand']

# Take the 'Class' column out of the frame, keeping it in `y`.
y = df_to_predict.pop('Class')

# Create the api_project folder if it does not exist.
output_dir = 'api_project'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Export the frame as a JSON array of records inside api_project.
json_file_path = os.path.join(output_dir, 'input_data.json')
df_to_predict.to_json(json_file_path, orient='records', lines=False, indent=4)

print(f"Archivo JSON guardado en: {json_file_path}")

In [None]:
# Predict the class of each row (the result is stored in 'Class' below).
# NOTE(review): this uses the in-memory `pipeline`; the `model` loaded from
# disk earlier is not used here. Confirm which one is intended.
predictions = pipeline.predict(df_to_predict)

# Add the predictions to the DataFrame
df_to_predict['Class'] = predictions

# Save the updated DataFrame
df_to_predict.to_csv('results/to_predict_with_demand_and_class.csv', index=False)