# Librerías

In [None]:
import sys
import os
sys.path.append(".")
import re
import pandas as pd
import numpy as np
import pandasql as ps
from datetime import timedelta
from matplotlib.ticker import FuncFormatter
import matplotlib.pyplot as plt
import seaborn as sns

# Global display configuration for the notebook.
# NOTE(review): the palette is captured before the matplotlib style below is
# applied, so color_pal may not reflect 'fivethirtyeight' — confirm intended.
color_pal = sns.color_palette()
# Use the 'fivethirtyeight' matplotlib style for the figures drawn afterwards.
plt.style.use('fivethirtyeight')
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
def thousands_formatter_func(x, pos):
    """Format a tick value as whole thousands followed by 'K'.

    Args:
        x: Tick value.
        pos: Tick position (required by the FuncFormatter signature, unused).

    Returns:
        The value divided by 1000, truncated toward zero, with a 'K' suffix.
    """
    thousands = int(x / 1e3)
    return '{}K'.format(thousands)
thousand_formatter = FuncFormatter(thousands_formatter_func)
def decimal_percentage_formatter_func(x, pos):
    """Format a fraction (0.29) as a whole-number percentage ('29%').

    Args:
        x: Tick value expressed as a fraction of 1.
        pos: Tick position (required by the FuncFormatter signature, unused).

    Returns:
        The percentage as a string with a '%' suffix.
    """
    # round() instead of int(): 0.29 * 100 is 28.999999999999996 in binary
    # floating point, and truncating it would print '28%'.
    return f'{round(x * 100)}%'
decimal_percentage_formatter = FuncFormatter(decimal_percentage_formatter_func)
def percentage_formatter_func(x, pos):
    """Format a value already expressed in percent as 'N%'.

    Args:
        x: Tick value in percent units.
        pos: Tick position (required by the FuncFormatter signature, unused).

    Returns:
        The value truncated to an integer, with a '%' suffix.
    """
    whole_percent = int(x)
    return '{}%'.format(whole_percent)
percentage_formatter = FuncFormatter(percentage_formatter_func)

# Lectura archivos

In [None]:
# Load the pre-processed CSV exports.
# NOTE(review): the line names handled later contain mojibake ("LÃ\x8dNEA"),
# which suggests the files may be UTF-8 rather than latin-1 — confirm.
df_bombs = pd.read_csv('data/processed_csv/df_bombs.csv', delimiter=',', encoding='latin-1', index_col=0)
df_of = pd.read_csv('data/processed_csv/df_of.csv', delimiter=',', encoding='latin-1')
df_operators = pd.read_csv('data/processed_csv/df_operators.csv', delimiter=',', encoding='latin-1')
df_operators_participation = pd.read_csv(
    'data/processed_csv/df_operators_participation.csv', delimiter=',', encoding='latin-1'
)

# Parse the start/end timestamps of both frames with the same explicit format.
for _frame in (df_bombs, df_of):
    for _column in ('start_date', 'end_date'):
        _frame[_column] = pd.to_datetime(_frame[_column], format='%Y-%m-%d %H:%M:%S.%f')



In [None]:
# Canonical production-line names. The keys are the raw values found in the
# CSVs (including the mis-decoded "LÃ\x8dNEA" spellings); defined once so both
# frames are always normalised with exactly the same mapping.
line_name_map = {
    "LÃ\x8dNEA 2": "LINEA_2",
    "LINEA_4": "LINEA_4",
    "LÃ\x8dNEA 1": "LINEA_1",
    "LINEA_6": "LINEA_6",
    "LINEA 3": "LINEA_3",
    "LÃ\x8dNEA KIVU": "LINEA_KIVU",
    "PREFILTRO L-1": "PREFILTRO_L-1",
    "PREFILTRO L-6": "PREFILTRO_L-6",
    "LINEA 7": "LINEA_7",
    "LINEA 8 IML": "LINEA_8_IML",
}

df_of['line'] = df_of['line'].replace(line_name_map)
df_operators_participation['line'] = df_operators_participation['line'].replace(line_name_map)

# Show the resulting set of line names.
df_of.line.unique()

In [None]:
def remove_special_chars(text):
    """Strip every character that is not a word character, whitespace, or one
    of the explicitly allowed accented letters.

    Args:
        text: String to clean.

    Returns:
        The string with all other characters removed.
    """
    # Negated character class: anything outside the allowed set is dropped.
    return re.sub(r"[^\w\sáéíóúÁÉÍÓÚñÑÃ]", '', text)

In [None]:
# NOTE(review): the cleaned Series is only displayed here; it is not assigned
# back to df_of['line'] — confirm that this is intended.
df_of['line'].apply(remove_special_chars)

In [None]:
# Calendar features derived from the OF start timestamp.
_start = df_of["start_date"].dt
df_of["weekday"] = _start.weekday.astype("category")
# Shift label: starts before 14:00 are 'AM', everything else is 'PM'.
df_of["turn"] = _start.hour.lt(14).map({True: 'AM', False: 'PM'})
df_of["month"] = _start.month.astype("category")
df_of["year"] = _start.year.astype("category")


In [None]:
df_of.head()

# Distribución Performance

In [None]:
df_of.head()

In [None]:
# Histogram (with KDE) of OF performance.
data = df_of.sort_values(by='performance', ascending=False)

plt.figure(figsize=(10, 5))
# The label gives the legend call below an artist to display.
sns.histplot(data=data, x='performance', bins=20, kde=True, label='Performance')
plt.xlabel('Performance')
plt.ylabel('Frecuencia')
plt.title('Distribución de la performance de OFs')
plt.xticks(rotation=70, ha='right', fontsize=8)
plt.gca().legend().set_visible(True)
# Pass the FuncFormatter instance (not the bare function), consistent with the
# other plots in this notebook.
plt.gca().xaxis.set_major_formatter(decimal_percentage_formatter)
plt.show()

# Distribución de Participación

In [None]:
df_operators_participation.head()

In [None]:
df_of.describe()

In [None]:
df_operators_participation.describe()

In [None]:
# Low-end quantiles (2%, 5%, 10%) of the participation percentage.
for _q in (0.02, 0.05, 0.10):
    print(df_operators_participation.participation_percentage.quantile(_q))

In [None]:
# Histogram (with KDE) of operator participation percentage.
data = df_operators_participation.sort_values(by='participation_percentage', ascending=False)

plt.figure(figsize=(10, 5))
sns.histplot(data=data, x='participation_percentage', bins=40, kde=True, label='Participación')
# Labels describe the plotted variable (participation, not performance).
plt.xlabel('Participación')
plt.ylabel('Frecuencia')
plt.title('Distribución de la participación de operadores en OFs')
plt.xticks(rotation=70, ha='right', fontsize=8)
plt.gca().xaxis.set_major_formatter(percentage_formatter)
plt.gca().legend().set_visible(True)
plt.show()

In [None]:
# Same histogram restricted to participations of at most 40%.
data = df_operators_participation[
    df_operators_participation.participation_percentage <= 40
].sort_values(by='participation_percentage', ascending=False)

plt.figure(figsize=(10, 5))
sns.histplot(data=data, x='participation_percentage', bins=20, kde=True, label='Participación')
# Labels describe the plotted variable (participation, not performance).
plt.xlabel('Participación')
plt.ylabel('Frecuencia')
plt.title('Distribución de la participación de operadores en OFs (hasta 40%)')
plt.xticks(rotation=70, ha='right', fontsize=8)
plt.gca().xaxis.set_major_formatter(percentage_formatter)
plt.gca().legend().set_visible(True)
plt.show()

In [None]:
df_operators_participation.describe()

# Preparación Modelo

## Supuestos

Supuestos
* Se asume un corte de performance aceptable para una OF sobre el  80%
* Se asume una participación mínima de un 5% del tiempo de un operador en una OF para considerar que trabajó en ella
* Para entrenar, se asume que la cantidad de bombas realizadas (good_qty) era lo planificado a hacer para la OF
* Se quita del análisis líneas que no son de interés para el cliente (LINEA 8 IML)

In [None]:
# Modelling parameters.
# NOTE(review): the written assumptions mention an 80% performance cut-off,
# but the value used here is 0.88 — confirm which one is intended.
perfomance_cutoff = 0.88
participation_cutoff = 0.05
exclude_lines = ['LINEA_8_IML']

# Work on copies so the exploratory frames above stay untouched.
df_of_model, df_operators_participation_model, df_operators_model = (
    df_of.copy(),
    df_operators_participation.copy(),
    df_operators.copy(),
)

In [None]:
# Drop columns not used by the models, then remove the excluded lines from
# both the OF frame and the participation frame.
df_of_model = df_of_model.drop(
    columns=['operators_distinct_ids', 'plan_qty', 'theorical_qty_round',
             'performance_round', 'total_good_qty', 'total_bad_qty']
)
df_of_model = df_of_model[~df_of_model.line.isin(exclude_lines)]
df_operators_participation_model = df_operators_participation_model[
    ~df_operators_participation_model.line.isin(exclude_lines)
]
print(df_of_model.describe())
df_of_model.head()

Removemos las OFs que no tenían cantidad planificada *good_qty == 0*

In [None]:
# Keep only the OFs whose good_qty is different from zero.
df_of_model = df_of_model.loc[df_of_model['good_qty'].ne(0)]

## Etiquetado de OFs

In [None]:
# Binary target: 1 when the OF performance reaches the cut-off, 0 otherwise.
df_of_model['achieve_performance'] = (
    df_of_model['performance'].ge(perfomance_cutoff).astype(int)
)
df_of_model.head()

## Etiquetado participacion

In [None]:
# Flag participations at or above the cut-off. The cut-off is a fraction while
# participation_percentage is compared on a 0-100 scale, hence the * 100.
df_operators_participation_model['participation_indicator'] = (
    df_operators_participation_model['participation_percentage']
    .ge(participation_cutoff * 100)
    .astype(int)
)
df_operators_participation_model.head()

In [None]:
# Pivot the participation frame into one row per order and one 0/1 column per
# operator ('operator_<id>').
# NOTE(review): pivot() raises if an (order, operator_id) pair is still
# repeated after dropping fully identical rows — see the duplicate case noted
# further down in this notebook.
df_operators_participation_model = df_operators_participation_model.drop_duplicates()
df_pivot = (
    df_operators_participation_model
    .pivot(index='order', columns='operator_id', values='participation_indicator')
    .fillna(0)
    .astype(int)
)
df_pivot.columns = ['operator_' + str(operator) for operator in df_pivot.columns]
print(df_pivot.shape)
df_pivot.head()

## Agregamos experiencia de operadores como dato

In [None]:
df_operators_participation_model = df_operators_participation_model.sort_values(
    by=['operator_id', 'production_date'], ascending=True
)

# accumulated_experience = minutes the same operator logged on strictly
# earlier production dates. Computed with one grouped cumulative sum instead
# of re-filtering the whole frame for every row (which is quadratic).
_exp_keys = ['operator_id', 'production_date']
_daily_minutes = df_operators_participation_model.groupby(_exp_keys)['participation_minutes'].sum()
# Running total per operator minus the current day's total leaves only the
# minutes from previous dates.
_prior_minutes = (
    (_daily_minutes.groupby(level='operator_id').cumsum() - _daily_minutes)
    .rename('accumulated_experience')
    .reset_index()
)
# Left merge keeps the row order of the participation frame; rows whose keys
# are missing get 0, matching the empty-filter sum of the row-wise version.
# Stored as float so float minute totals do not clash with an int column.
df_operators_participation_model['accumulated_experience'] = (
    df_operators_participation_model[_exp_keys]
    .merge(_prior_minutes, on=_exp_keys, how='left')['accumulated_experience']
    .fillna(0.0)
    .astype(float)
    .to_numpy()
)

In [None]:
# Per-order experience features of the operators flagged as participating.
# NOTE(review): the sums below run over every effective participation row of
# those operators (all orders and dates), not only the rows up to this order —
# confirm this is the intended definition.
df_pivot['days_accumulated_experience'] = 0
df_pivot['OFs_accumulated_experience'] = 0

_minutes_per_day = 60 * 24
_effective_rows = df_operators_participation_model['participation_indicator'] == 1

for index, row in df_pivot.iterrows():
    # Column names look like 'operator_<id>'; recover the numeric ids.
    participated_operators = [int(name.split('_')[1]) for name in row.index[row == 1]]
    df_op_participation = df_operators_participation_model[
        df_operators_participation_model['operator_id'].isin(participated_operators) & _effective_rows
    ]
    team_experience = df_op_participation.accumulated_experience
    # Minutes converted to whole days.
    df_pivot.at[index, 'days_accumulated_experience'] = int(team_experience.sum() / _minutes_per_day)
    # Number of non-null participation rows.
    df_pivot.at[index, 'OFs_accumulated_experience'] = int(team_experience.count())

df_pivot.head()

In [None]:
# Attach the per-order operator columns to the OF frame (left join on 'order').
df_merged = df_of_model.merge(df_pivot, how='left', on='order')
df_merged.head()

In [None]:
# Columns whose name contains 'operator_' are the per-operator indicators.
cols_operators = df_merged.columns[
    df_merged.columns.str.contains('operator_', regex=False)
].tolist()
# Row-wise sum of the indicators = number of effective operators per OF.
df_merged['effective_operators_qty'] = df_merged[cols_operators].sum(axis=1)
df_merged.head()

In [None]:
# REVISAR CASO DE OPERADOR-ORDER duplicado
#df_operators_participation_model[(df_operators_participation_model.order == 5309158) & (df_operators_participation_model.operator_id == 1)]

# Pruebas modelos

## XGBoost Classifier

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
df = df_merged.copy()

TARGET = 'achieve_performance'
# Columns kept out of the feature matrix; the target itself is appended last.
exclude_columns = [
    'order', 'bomb_type', 'registers_qty', 'operators_distinct_qty', 'start_date', 'end_date',
    'bad_qty', 'theorical_diff', 'total_operators_minutes', 'theorical_qty',
    'time_diff_seconds_calculated', 'time_diff_minutes_calculated',
    'time_diff_hours_calculated', 'performance', 'performance_category',
]
exclude_columns.append(TARGET)
FEATURES = list(df.columns[~df.columns.isin(exclude_columns)])

In [None]:
# Split the frame into the feature matrix and the target vector.
X, y = df[FEATURES], df[TARGET]

In [None]:
# One-hot encode the feature matrix with pandas' default get_dummies settings.
X_encoded = pd.get_dummies(X)
X_encoded

In [None]:
# Hold out 20% of the rows as the test set (fixed seed).
_split = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _split

In [None]:
# Build and fit the XGBoost classifier.
classifier_model = xgb.XGBClassifier(random_state=42)
classifier_model.fit(X_train, y_train, verbose=True)

# Class labels and positive-class probabilities on the test set.
y_pred = classifier_model.predict(X_test)
y_pred_proba = classifier_model.predict_proba(X_test)[:, 1]

# Test features together with the model outputs.
result = X_test.assign(prediction=y_pred, prediction_prob=y_pred_proba)

# Accuracy on the test set.
accuracy = accuracy_score(y_test, y_pred)
print("Precisión del modelo: %.2f%%" % (accuracy * 100.0))

In [None]:
# pickle is not imported anywhere else in the notebook; import it here,
# following the notebook's habit of cell-local imports.
import pickle

# Persist the trained classifier to disk when `save` is enabled.
save = True
if save:
    with open('classifier_model.pickle', 'wb') as file:
        pickle.dump(classifier_model, file)

In [None]:
# Add to the test results every df_merged column they do not already contain,
# restricted to the rows present in the test set.
_test_rows = df_merged.index.isin(result.index)
_extra_columns = ~df_merged.columns.isin(result.columns)
df_result_complete = pd.concat([result, df_merged.loc[_test_rows, _extra_columns]], axis=1)
df_result_complete.head()

In [None]:
df_of[df_of.order == 5294563]   

In [None]:
# OFs that reached the target but were predicted as not reaching it.
df_result_complete.loc[
    df_result_complete['achieve_performance'].eq(1) & df_result_complete['prediction'].eq(0)
]

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the classifier on the full encoded data set.
scores = cross_val_score(classifier_model, X_encoded, y, cv=5)
mean_score, std_score = np.mean(scores), np.std(scores)

# Per-fold scores followed by their mean and standard deviation.
print("Puntuaciones de rendimiento en cada división:", scores)
print("Media de las puntuaciones de rendimiento:", mean_score)
print("Desviación estándar de las puntuaciones de rendimiento:", std_score)

In [None]:
import matplotlib.pyplot as plt

# Feature importances of the fitted classifier and the matching column names.
importances = classifier_model.feature_importances_
feature_names = X_encoded.columns

# Positions sorted by decreasing importance; keep the top 20.
indices = np.argsort(importances)[::-1]
_top = indices[:20]
sorted_importances = importances[_top]
sorted_feature_names = feature_names[_top]

# Bar chart of the selected importances.
sns.set_style('white')
plt.figure(figsize=(10, 6))
_positions = range(len(sorted_importances))
plt.bar(_positions, sorted_importances)
plt.xticks(_positions, sorted_feature_names, rotation='vertical')
plt.xlabel('Características')
plt.ylabel('Importancia')
plt.title('Importancia de las características')
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np

# Columns included in the correlation analysis.
columns = ['line_LINEA_3', 'line_LINEA_1', 'line_LINEA_2', 'good_qty', 'theorical_time', 'operator_9266', 'operator_1007', 'operator_504', 'performance']

# Correlation matrix of the selected columns.
df_corr = df_result_complete[columns]
correlation_matrix = df_corr.corr()
print(correlation_matrix)

Permutation Importance

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the classifier on the test set (10 repeats).
# NOTE(review): this rebinds `result`, which previously held the test-set
# predictions frame — confirm no later cell expects the old value.
result = permutation_importance(
    classifier_model,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
)

# Mean importance per feature.
importance_scores = result.importances_mean

In [None]:
# One row per encoded feature with its mean permutation importance.
df_importance_perm = pd.DataFrame(
    dict(feature=X_encoded.columns, importance=result.importances_mean)
)

In [None]:
# Plot every feature. A filter on non-zero importance used to be assigned
# here but was overwritten on the very next line, so it never had any effect;
# the effective behaviour (all features) is kept.
data = df_importance_perm

In [None]:
feature_names, importance_scores = data.feature, data.importance

# Horizontal bar chart of the permutation importances.
plt.figure(figsize=(10, 25))
plt.barh(feature_names, importance_scores)
plt.title('Importancia de las características (Permutation Importance)')
plt.xlabel('Importancia')
plt.ylabel('Característica')
plt.tight_layout()
plt.show()

Matriz de confusión

In [None]:
# True labels and predicted labels taken from the joined results frame.
y_true, y_pred = (
    df_result_complete['achieve_performance'],
    df_result_complete['prediction'],
)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of true versus predicted labels, drawn as an annotated heat map.
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

Curva ROC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Positive-class probabilities from the classifier trained above.
# (`model` is only bound later in the notebook, to a regressor.)
y_prob = classifier_model.predict_proba(X_test)[:, 1]

# False-positive rate, true-positive rate and thresholds.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

# Area under the ROC curve.
auc = roc_auc_score(y_test, y_prob)

# ROC curve with the chance diagonal as reference.
plt.plot(fpr, tpr, label='ROC curve (AUC = {:.2f})'.format(auc))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend()
plt.show()


Curva Precisión-Recall

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Precision and recall at every threshold (thresholds themselves are unused).
_pr_points = precision_recall_curve(y_test, y_prob)
precision, recall = _pr_points[0], _pr_points[1]

# Precision-Recall curve.
plt.plot(recall, precision, label='Precision-Recall curve')
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()

## XGBoost Regressor | Minutos-hombre 

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
df = df_merged.copy()

TARGET = 'total_operators_minutes'
# NOTE(review): unlike the classifier, this list does not exclude 'bad_qty',
# 'registers_qty' or 'operators_distinct_qty' — confirm those are meant to be
# features of this model.
exclude_columns = [
    'order', 'bomb_type', 'start_date', 'end_date', 'theorical_qty', 'theorical_diff',
    'time_diff_seconds_calculated', 'time_diff_minutes_calculated', 'performance',
    'time_diff_hours_calculated', 'achieve_performance', 'performance_category',
]
exclude_columns.append(TARGET)
FEATURES = list(df.columns[~df.columns.isin(exclude_columns)])

In [None]:
# Split the frame into the feature matrix and the target vector.
X, y = df[FEATURES], df[TARGET]

In [None]:
# One-hot encode the feature matrix with pandas' default get_dummies settings.
X_encoded = pd.get_dummies(X)
X_encoded

In [None]:
# Hold out 20% of the rows as the test set (fixed seed).
_split = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _split

In [None]:
# Build and fit the XGBoost regressor.
model = xgb.XGBRegressor(random_state=42)
model.fit(X_train, y_train, verbose=1)

# Predictions on the test set, stored next to the test features.
y_pred = model.predict(X_test)
result = X_test.assign(prediction=y_pred)



In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Regression metrics on the test set; RMSE is derived from MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("R-squared [R2]: %.2f" % r2)
print("Mean Absolute Error [MAE]: %.2f" % mae)
print("Mean Squared Error [MSE]: %.2f" % mse)
print("Root Mean Squared Error [RMSE]: %.2f" % rmse)

Creamos la variable 'predicted_performance' para comparar la performance real versus la calculada con la predicción

In [None]:
# Performance implied by the predicted minutes: good_qty * theorical_time / prediction.
# NOTE(review): nothing guards against a zero prediction in the denominator.
_theoretical_minutes = result['good_qty'].mul(result['theorical_time'])
result['predicted_performance'] = _theoretical_minutes.div(result['prediction'])
result.head()

Unión de los resultados con el dataframe de OFs

In [None]:
# Add to the test results every df_merged column they do not already contain,
# restricted to the rows present in the test set.
_test_rows = df_merged.index.isin(result.index)
_extra_columns = ~df_merged.columns.isin(result.columns)
df_result_complete = pd.concat([result, df_merged.loc[_test_rows, _extra_columns]], axis=1)
df_result_complete.head()

Precisión del modelo evaluando performance

In [None]:
# Same metrics, comparing real performance with the performance derived from
# the predicted minutes.
_actual = df_result_complete.performance
_derived = df_result_complete.predicted_performance

mse = mean_squared_error(_actual, _derived)
rmse = np.sqrt(mse)
mae = mean_absolute_error(_actual, _derived)
r2 = r2_score(_actual, _derived)

print("R-squared [R2]: %.2f" % r2)
print("Mean Absolute Error [MAE]: %.2f" % mae)
print("Mean Squared Error [MSE]: %.2f" % mse)
print("Root Mean Squared Error [RMSE]: %.2f" % rmse)

Validación cruzada

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the regressor on the full encoded data set.
scores = cross_val_score(model, X_encoded, y, cv=5)
mean_score, std_score = np.mean(scores), np.std(scores)

# Per-fold scores followed by their mean and standard deviation.
print("Puntuaciones de rendimiento en cada división:", scores)
print("Media de las puntuaciones de rendimiento:", mean_score)
print("Desviación estándar de las puntuaciones de rendimiento:", std_score)

In [None]:
# Side-by-side table of the real target and the prediction for the test rows.
comparison = y_test.to_frame('TARGET').assign(Predict=y_pred)
print(comparison)


In [None]:
import matplotlib.pyplot as plt

# Feature importances of the fitted regressor and the matching column names.
importances = model.feature_importances_
feature_names = X_encoded.columns

# Positions sorted by decreasing importance; keep the top 30.
indices = np.argsort(importances)[::-1]
_top = indices[:30]
sorted_importances = importances[_top]
sorted_feature_names = feature_names[_top]

# Bar chart of the selected importances.
plt.figure(figsize=(10, 6))
_positions = range(len(sorted_importances))
plt.bar(_positions, sorted_importances)
plt.xticks(_positions, sorted_feature_names, rotation='vertical')
plt.xlabel('Características')
plt.ylabel('Importancia')
plt.title('Importancia de las características')
plt.tight_layout()
plt.show()

In [None]:
feature_name = 'theorical_qty'
# Relationship between the feature and the predictions. A scatter plot is used
# because the rows are not sorted by the feature (a line plot would connect
# them in arbitrary order), and both axes come from the same frame so each
# point pairs a row with its own prediction.
plt.figure(figsize=(10, 6))
plt.scatter(df_result_complete[feature_name], df_result_complete['prediction'])
plt.xlabel(feature_name)
plt.ylabel('Predicciones')
plt.title('Relación entre {} y las predicciones'.format(feature_name))
plt.show()

In [None]:
# Overlaid histograms (with KDE) of real and predicted man-minutes.
data = df_result_complete.sort_values(by='total_operators_minutes', ascending=False)

plt.figure(figsize=(10, 5))
for _column, _label in (
    ('total_operators_minutes', 'Real Minutos-hombre'),
    ('prediction', 'Predicción Minutos-hombre'),
):
    sns.histplot(data=data, x=_column, kde=True, label=_label)
plt.title('Comparación entre Real y Predicción de Minutos-hombre')
plt.xlabel('Minutos-hombre')
plt.ylabel('Frecuencia')
plt.xticks(rotation=70, ha='right', fontsize=8)
plt.gca().legend().set_visible(True)
plt.show()

## XGBoost Regressor | Performance

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
df = df_merged.copy()

TARGET = 'performance'
# NOTE(review): unlike the classifier, this list does not exclude 'bad_qty',
# 'registers_qty' or 'operators_distinct_qty' — confirm those are meant to be
# features of this model.
exclude_columns = [
    'order', 'bomb_type', 'start_date', 'end_date', 'theorical_qty', 'theorical_diff',
    'time_diff_seconds_calculated', 'time_diff_minutes_calculated', 'total_operators_minutes',
    'time_diff_hours_calculated', 'achieve_performance', 'performance_category',
]
exclude_columns.append(TARGET)
FEATURES = list(df.columns[~df.columns.isin(exclude_columns)])

In [None]:
# Split the frame into the feature matrix and the target vector.
X, y = df[FEATURES], df[TARGET]

In [None]:
# One-hot encode the feature matrix with pandas' default get_dummies settings.
X_encoded = pd.get_dummies(X)
X_encoded

In [None]:
# Hold out 20% of the rows as the test set (fixed seed).
_split = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _split

In [None]:
# Build and fit the XGBoost regressor.
model = xgb.XGBRegressor(random_state=42)
model.fit(X_train, y_train, verbose=1)

# Predictions on the test set, stored next to the test features.
y_pred = model.predict(X_test)
result = X_test.assign(prediction=y_pred)



In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Regression metrics on the test set; RMSE is derived from MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("R-squared [R2]: %.2f" % r2)
print("Mean Absolute Error [MAE]: %.2f" % mae)
print("Mean Squared Error [MSE]: %.2f" % mse)
print("Root Mean Squared Error [RMSE]: %.2f" % rmse)

Unión de los resultados con el dataframe de OFs

In [None]:
# Add to the test results every df_merged column they do not already contain,
# restricted to the rows present in the test set.
_test_rows = df_merged.index.isin(result.index)
_extra_columns = ~df_merged.columns.isin(result.columns)
df_result_complete = pd.concat([result, df_merged.loc[_test_rows, _extra_columns]], axis=1)
df_result_complete.head()

Validación cruzada

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the regressor on the full encoded data set.
scores = cross_val_score(model, X_encoded, y, cv=5)
mean_score, std_score = np.mean(scores), np.std(scores)

# Per-fold scores followed by their mean and standard deviation.
print("Puntuaciones de rendimiento en cada división:", scores)
print("Media de las puntuaciones de rendimiento:", mean_score)
print("Desviación estándar de las puntuaciones de rendimiento:", std_score)

In [None]:
# Side-by-side table of the real target and the prediction for the test rows.
comparison = y_test.to_frame('TARGET').assign(Predict=y_pred)
print(comparison)


In [None]:
import matplotlib.pyplot as plt

# Feature importances of the fitted regressor and the matching column names.
importances = model.feature_importances_
feature_names = X_encoded.columns

# Positions sorted by decreasing importance; keep the top 30.
indices = np.argsort(importances)[::-1]
_top = indices[:30]
sorted_importances = importances[_top]
sorted_feature_names = feature_names[_top]

# Bar chart of the selected importances.
plt.figure(figsize=(10, 6))
_positions = range(len(sorted_importances))
plt.bar(_positions, sorted_importances)
plt.xticks(_positions, sorted_feature_names, rotation='vertical')
plt.xlabel('Características')
plt.ylabel('Importancia')
plt.title('Importancia de las características')
plt.tight_layout()
plt.show()

In [None]:
feature_name = 'theorical_qty'
# Relationship between the feature and the predictions. A scatter plot is used
# because the rows are not sorted by the feature (a line plot would connect
# them in arbitrary order), and both axes come from the same frame so each
# point pairs a row with its own prediction.
plt.figure(figsize=(10, 6))
plt.scatter(df_result_complete[feature_name], df_result_complete['prediction'])
plt.xlabel(feature_name)
plt.ylabel('Predicciones')
plt.title('Relación entre {} y las predicciones'.format(feature_name))
plt.show()

In [None]:
# Overlaid histograms (with KDE) of real and predicted performance.
data = df_result_complete.sort_values(by='performance', ascending=False)

plt.figure(figsize=(10, 5))
sns.histplot(data=data, x='performance', kde=True, label='Real Performance')
sns.histplot(data=data, x='prediction', kde=True, label='Predicción Performance')
# The x axis shows performance here, not man-minutes.
plt.xlabel('Performance')
plt.ylabel('Frecuencia')
plt.title('Comparación entre Real y Predicción de Performance')
plt.xticks(rotation=70, ha='right', fontsize=8)
plt.gca().legend().set_visible(True)
plt.show()

# Algoritmo

In [None]:
import itertools
import numpy as np
import pandas as pd

def select_best_combo(orders, available_operators, available_lines, max_operators_per_order, model):
    """Greedily pick, for each order, the operator team and line with the
    lowest model prediction, never reusing an operator across orders.

    Orders are processed in the given sequence. Every team of 1 to
    ``max_operators_per_order`` operators is paired with every line; once a
    team is chosen, all candidates sharing any of its operators are discarded.

    Args:
        orders: Iterable of mappings with the keys 'id', 'good_qty' and
            'theorical_time'.
        available_operators: Iterable of operator identifiers.
        available_lines: Iterable of line identifiers.
        max_operators_per_order: Largest team size to consider.
        model: Object whose ``predict`` accepts a one-row DataFrame with the
            columns 'good_qty', 'theorical_time', 'operators' and 'line' and
            returns an indexable result.
            NOTE(review): this input layout differs from the one-hot encoded
            matrices used to train the models in this notebook — confirm the
            model passed in expects it.

    Returns:
        A tuple ``(assignments, best_prediction)``. ``assignments`` maps each
        order id to ``(operators, line)``, or to ``None`` when no candidate
        was left for that order. ``best_prediction`` is the lowest prediction
        among the assigned orders (``np.inf`` if nothing was assigned).
    """
    best_prediction = np.inf  # lower is better
    best_order_operator_line = {}

    # Every team of 1..max_operators_per_order operators, paired with every line.
    operator_combos = [
        team
        for team_size in range(1, max_operators_per_order + 1)
        for team in itertools.combinations(available_operators, team_size)
    ]
    operator_line_combos = list(itertools.product(operator_combos, available_lines))

    for order in orders:
        order_best_prediction = np.inf
        order_best_operator_line = None

        # Score every remaining team/line candidate for this order.
        for operators, line in operator_line_combos:
            order_df = pd.DataFrame({
                'good_qty': [order['good_qty']],
                'theorical_time': [order['theorical_time']],
                'operators': [operators],
                'line': [line],
            })
            prediction = model.predict(order_df)[0]

            if prediction < order_best_prediction:
                order_best_prediction = prediction
                order_best_operator_line = (operators, line)

        best_order_operator_line[order['id']] = order_best_operator_line

        # No candidate left (or none scored below infinity): nothing to
        # reserve, move on instead of failing on None.
        if order_best_operator_line is None:
            continue

        best_prediction = min(best_prediction, order_best_prediction)

        # Reserve the chosen operators: drop every candidate that shares at
        # least one of them (not only the candidates containing all of them).
        busy_operators = set(order_best_operator_line[0])
        operator_line_combos = [
            combo for combo in operator_line_combos
            if busy_operators.isdisjoint(combo[0])
        ]

    return best_order_operator_line, best_prediction


In [None]:
df_operators_participation_model.head()

In [None]:
# Order the participation rows by operator, then by production date (ascending).
df_operators_participation_model = df_operators_participation_model.sort_values(
    ['operator_id', 'production_date']
)

In [None]:
# Create (or reset) the experience column as float zeros.
df_operators_participation_model['accumulated_experience'] = 0.0

In [None]:
# accumulated_experience = minutes the same operator logged on strictly
# earlier production dates. Computed with one grouped cumulative sum instead
# of re-filtering the whole frame for every row (which is quadratic).
_exp_keys = ['operator_id', 'production_date']
_daily_minutes = df_operators_participation_model.groupby(_exp_keys)['participation_minutes'].sum()
# Running total per operator minus the current day's total leaves only the
# minutes from previous dates.
_prior_minutes = (
    (_daily_minutes.groupby(level='operator_id').cumsum() - _daily_minutes)
    .rename('accumulated_experience')
    .reset_index()
)
# Left merge keeps the row order of the participation frame; rows whose keys
# are missing get 0, matching the empty-filter sum of the row-wise version.
df_operators_participation_model['accumulated_experience'] = (
    df_operators_participation_model[_exp_keys]
    .merge(_prior_minutes, on=_exp_keys, how='left')['accumulated_experience']
    .fillna(0.0)
    .astype(float)
    .to_numpy()
)


In [None]:
df_operators_participation_model.head()

In [None]:
# Pivot the participation frame into one row per order and one 0/1 column per
# operator ('operator_<id>').
# NOTE(review): pivot() raises if an (order, operator_id) pair is still
# repeated after dropping fully identical rows.
df_operators_participation_model = df_operators_participation_model.drop_duplicates()
df_pivot = (
    df_operators_participation_model
    .pivot(index='order', columns='operator_id', values='participation_indicator')
    .fillna(0)
    .astype(int)
)
df_pivot.columns = ['operator_' + str(operator) for operator in df_pivot.columns]
print(df_pivot.shape)
df_pivot.head()

In [None]:
# Per-order experience (in whole days) of the operators flagged as participating.
# NOTE(review): the sum runs over every effective participation row of those
# operators (all orders and dates), not only the rows up to this order —
# confirm this is the intended definition.
df_pivot['accumulated_experience'] = 0

_minutes_per_day = 60 * 24
_effective_rows = df_operators_participation_model['participation_indicator'] == 1

for index, row in df_pivot.iterrows():
    # Column names look like 'operator_<id>'; recover the numeric ids.
    participated_operators = [int(name.split('_')[1]) for name in row.index[row == 1]]
    _team_rows = df_operators_participation_model[
        df_operators_participation_model['operator_id'].isin(participated_operators) & _effective_rows
    ]
    exp_sum = _team_rows['accumulated_experience'].sum() / _minutes_per_day  # in days
    df_pivot.at[index, 'accumulated_experience'] = int(exp_sum)

df_pivot.head()