In [17]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import SGDRegressor, LinearRegression, ElasticNet
from sklearn.svm import SVR
from explainerdashboard import RegressionExplainer, ClassifierExplainer, ExplainerDashboard
from math import sqrt

# Seed passed to train_test_split and to the estimators that accept
# random_state, so the split and those fits are repeatable.
RANDOM_STATE = 123

In [2]:
# August exports: the Celonis extract and the SMT incident report.
celonis_august_df = pd.read_excel('data/20230929132544_CELONIS_EXPORT.xlsx')
smt_august_sheet = pd.read_excel('data/08_I11_SMT_Incidents_of_own_solution_group(s).xlsx')

# Positional row 11 of the SMT sheet supplies the column labels; everything
# from row 12 onward is kept as data and renumbered from zero.
smt_august_header = smt_august_sheet.iloc[11].values
smt_august_df = smt_august_sheet.iloc[12:].reset_index(drop=True)
smt_august_df.columns = smt_august_header




In [3]:
# September exports: the Celonis extract and the SMT incident report.
celonis_september_df = pd.read_excel('data/20231013102035_CELONIS_EXPORT.xlsx')
smt_september_sheet = pd.read_excel('data/09_I11_SMT_Incidents_of_own_solution_group(s).xlsx')

# Positional row 11 of the SMT sheet supplies the column labels; everything
# from row 12 onward is kept as data and renumbered from zero.
smt_september_header = smt_september_sheet.iloc[11].values
smt_september_df = smt_september_sheet.iloc[12:].reset_index(drop=True)
smt_september_df.columns = smt_september_header

In [4]:
# Stack the monthly frames (August first, then September). ignore_index=True
# renumbers the rows from zero, so no separate reset_index step is needed.
celonis_df = pd.concat([celonis_august_df, celonis_september_df], ignore_index=True)

smt_df = pd.concat([smt_august_df, smt_september_df], ignore_index=True)

In [5]:
# Row counts of each monthly frame and of the concatenated frames.
# The third entry carries an extra '\n' argument, which leaves a blank line
# between the Celonis figures and the SMT figures.
length_report = (
    ("celonis_august_df", celonis_august_df, ()),
    ("celonis_september_df", celonis_september_df, ()),
    ("celonis_df", celonis_df, ('\n',)),
    ("smt_august_df", smt_august_df, ()),
    ("smt_september_df", smt_september_df, ()),
    ("smt_df", smt_df, ()),
)
for frame_name, frame, trailing in length_report:
    print("Longitud de " + frame_name + ":", len(frame), *trailing)

Longitud de celonis_august_df: 440
Longitud de celonis_september_df: 665
Longitud de celonis_df: 1105 

Longitud de smt_august_df: 254
Longitud de smt_september_df: 407
Longitud de smt_df: 661


In [6]:
celonis_df = (
    celonis_df
    .drop(columns=['Template Name', 'Requisition ID', 'CI'])
    # Rows with a missing value in any remaining column are discarded while
    # the date and summary columns are still present, so gaps there count too.
    .dropna()
    .drop(columns=['Submit Date', 'Last Resolution Date', 'Ticket Summary'])
)

In [7]:
smt_df = (
    smt_df
    .drop(columns=[
        'Original Incident Number',
        'Requisition ID',
        'CI+',
        'Product Name',
        'Resolution Product Name',
        'Primary Center Code',
    ])
    # Rows with a missing value in any remaining column are discarded while
    # the date and free-text columns are still present, so gaps there count too.
    .dropna()
    .drop(columns=[
        'Created Date (UTC+0)',
        'Summary',
        'Last Modified Date (UTC+0)',
        'First Resolved Date (UTC+0)',
        'Last Resolved Date (UTC+0)',
        'Notes',
        'Resolution',
    ])
)

In [8]:
# Remove five more SMT columns before the merge.
# NOTE(review): presumably these duplicate columns already present in the
# Celonis export - confirm.
smt_df = smt_df.drop(
    columns=['Site Group', 'Site', 'Incident Type', 'Reported Source', 'Status']
)

In [9]:
# Inner join: keep only incidents found in both sources, matching the Celonis
# 'Incident Number' against the SMT 'Incident ID', then discard the columns
# listed below from the merged frame.
combined_df = (
    celonis_df
    .merge(smt_df, how='inner', left_on='Incident Number', right_on='Incident ID')
    .drop(columns=[
        'Service Name',
        'Incident ID',
        'In Progess Time (hhh:mi)\n',  # the column label itself ends with a newline
        'Actual Duration/Open Time (hhh:mi)',
        'Customer Department',
        'Desk Location',
        'Down Time of CI-Unavailability (hhh:mi)',
        'Service+',
        'Target Date',  # TODO: check Target Date
    ])
)

# Merge in the other direction
#combined_df = pd.merge(smt_df, celonis_df, how='inner', left_on='Incident ID', right_on='Incident Number')
#combined_df.drop(['Service Name', 'Incident Number'], axis=1, inplace=True)

In [10]:
# (rows, columns) of the merged frame after the column pruning above.
combined_df.shape

(466, 35)

In [11]:
# The identifier and the regression target are left untouched; every other
# column is expanded into indicator columns by get_dummies.
passthrough_columns = ('Incident Number', 'Last Resolution Time', 'Incident ID')
columns_list = [name for name in combined_df.columns if name not in passthrough_columns]
combined_df_one_hot = pd.get_dummies(combined_df, columns=columns_list)
combined_df_one_hot

Unnamed: 0,Incident Number,Last Resolution Time,Site Group_ES,Site_Md,Site_Md1,Site_Md3,Incident Type_Infrastructure Restoration,Incident Type_User Service Request,Incident Type_User Service Restoration,Submitter Group_Human Resources - Personnel Administration,...,Resolution Product Categorization Tier 3_Standard Desktop,Resolution Product Categorization Tier 3_Standard Desktop Tiny,Resolution Product Categorization Tier 3_Standard Notebook,Resolution Product Categorization Tier 3_Standard Notebook Touch,Resolution Product Categorization Tier 3_Standard Smartphone,Resolution Product Categorization Tier 3_Standard iPad (WWAN),Resolution Product Categorization Tier 3_Support Service,Resolution Product Categorization Tier 3_Web Application Service,Created by CI-Hotline_No,Created by CI-Hotline_Yes
0,INC000025221092,4.697778,1,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
1,INC000025221966,4.116667,1,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
2,INC000025223159,23.067500,1,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
3,INC000025223804,22.799722,1,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
4,INC000025224626,24.208611,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,INC000025611989,1.248056,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
462,INC000025612320,0.000833,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
463,INC000025612916,0.001111,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
464,INC000025613326,0.001944,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [12]:
# Column bookkeeping lists.
# NOTE(review): none of these lists is referenced by another cell visible in
# this notebook, and several names carry merge-style '_x'/'_y' suffixes -
# confirm they still match the current frames.

# The four categorization families each contribute "Tier 1".."Tier 3".
tier_prefixes = (
    'Operational Categorization',
    'Product Categorization',
    'Resolution Categorization',
    'Resolution Product Categorization',
)
tier_columns = [
    '{} Tier {}'.format(prefix, level)
    for prefix in tier_prefixes
    for level in (1, 2, 3)
]

important_columns = [
    'Incident Number',
    'Site Group_x',
    'Site_x',
    'Incident Type_x',
    'Submitter Group',
    'Last Assigned Organization',
    'Last Assigned Group',
    'Entry Channel',
    'User Company',
    'Resolver Organization',
    'Reported Source',
    'Status_x',
    '\tPriority',  # this entry begins with a tab character
    'Incident Type',
    'Status_y',
    'Status Reason',
    'Company',
    'Region',
    'Site Group_y',
    'Site_y',
    'Reported Source_y',
    'Impact',
    'Open Priority',
    'Current Priority',
    'Assigned Group',
    'Assigned Group Department',
] + tier_columns + ['Created by CI-Hotline']

doubt_columns = ['User Company', 'Status_x', 'Status_y', 'Company', 'Region']

# Desk Location is doubtful: it has around 40 distinct values.
# Service+ may be important even though it has many values (about 40).
many_values = ['Customer Department', 'Desk Location', 'Service+']

In [13]:
# The resolution time is the regression target; the incident identifier is
# neither a feature nor the target, so it is removed from the feature matrix.
target_column = 'Last Resolution Time'
feature_frame = combined_df_one_hot.drop(columns=['Incident Number', target_column])
X = feature_frame.values
y = combined_df_one_hot[target_column].values

# Hold out 20% of the rows for evaluation, using the shared seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [18]:
# Gradient boosting with default hyperparameters, seeded with RANDOM_STATE.
# fit() returns the estimator itself, so construction and fitting are chained.
gbr = GradientBoostingRegressor(random_state=RANDOM_STATE).fit(X_train, y_train)
predictions = gbr.predict(X_test)

# Test-set errors: MSE, its square root (RMSE), and MAE.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
rmse = sqrt(mse)
print(f"Error cuadrático medio (MSE) en el conjunto de prueba: {mse:.4f}")
print(f"Raíz del error cuadrático medio (RMSE) en el conjunto de prueba: {rmse:.4f}")
print(f"Error medio absoluto (MAE) en el conjunto de prueba: {mae:.4f}")

Error cuadrático medio (MSE) en el conjunto de prueba: 3087.7676
Raíz del error cuadrático medio (RMSE) en el conjunto de prueba: 55.5677
Error medio absoluto (MAE) en el conjunto de prueba: 29.6011


In [23]:
# Linear model trained with stochastic gradient descent, default
# hyperparameters, seeded with RANDOM_STATE.
sgd = SGDRegressor(random_state=RANDOM_STATE)

sgd.fit(X_train, y_train)

sgd_predictions = sgd.predict(X_test)

# Test-set errors. The MSE is computed once (the original cell evaluated the
# same expression twice in a row).
mse = mean_squared_error(y_test, sgd_predictions)
mae = mean_absolute_error(y_test, sgd_predictions)
print("Error cuadrático medio (MSE) en el conjunto de prueba: {:.4f}".format(mse))
print("Raíz del error cuadrático medio (RMSE) en el conjunto de prueba: {:.4f}".format(sqrt(mse)))
print("Error medio absoluto (MAE) en el conjunto de prueba: {:.4f}".format(mae))

Error cuadrático medio (MSE) en el conjunto de prueba: 4704.3240
Raíz del error cuadrático medio (RMSE) en el conjunto de prueba: 68.5881
Error medio absoluto (MAE) en el conjunto de prueba: 44.4150


In [25]:
# Support vector regression with default hyperparameters.
# fit() returns the estimator itself, so construction and fitting are chained.
svr = SVR().fit(X_train, y_train)
svr_predictions = svr.predict(X_test)

# Test-set errors: MSE, its square root (RMSE), and MAE.
# NOTE(review): the saved output of this cell shows the same MAE (29.6011) as
# the gradient-boosting cell while the MSE differs; re-run on a fresh kernel
# to confirm the recorded output is not stale.
mse = mean_squared_error(y_test, svr_predictions)
mae = mean_absolute_error(y_test, svr_predictions)
rmse = sqrt(mse)
print(f"Error cuadrático medio (MSE) en el conjunto de prueba: {mse:.4f}")
print(f"Raíz del error cuadrático medio (RMSE) en el conjunto de prueba: {rmse:.4f}")
print(f"Error medio absoluto (MAE) en el conjunto de prueba: {mae:.4f}")

Error cuadrático medio (MSE) en el conjunto de prueba: 3476.1320
Raíz del error cuadrático medio (RMSE) en el conjunto de prueba: 58.9587
Error medio absoluto (MAE) en el conjunto de prueba: 29.6011


In [29]:
# Elastic-net regularised linear regression with default hyperparameters.
# fit() returns the estimator itself, so construction and fitting are chained.
elastReg = ElasticNet().fit(X_train, y_train)
elastReg_predictions = elastReg.predict(X_test)

# Test-set errors: MSE, its square root (RMSE), and MAE.
mse = mean_squared_error(y_test, elastReg_predictions)
mae = mean_absolute_error(y_test, elastReg_predictions)
rmse = sqrt(mse)
print(f"Error cuadrático medio (MSE) en el conjunto de prueba: {mse:.4f}")
print(f"Raíz del error cuadrático medio (RMSE) en el conjunto de prueba: {rmse:.4f}")
print(f"Error medio absoluto (MAE) en el conjunto de prueba: {mae:.4f}")

Error cuadrático medio (MSE) en el conjunto de prueba: 2557.5658
Raíz del error cuadrático medio (RMSE) en el conjunto de prueba: 50.5724
Error medio absoluto (MAE) en el conjunto de prueba: 31.6706
