In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import SGDRegressor
from explainerdashboard import RegressionExplainer, ClassifierExplainer, ExplainerDashboard
from math import sqrt

RANDOM_STATE = 123

In [2]:
celonis_august_df = pd.read_excel('data/20230929132544_CELONIS_EXPORT.xlsx')
smt_august_df = pd.read_excel('data/08_I11_SMT_Incidents_of_own_solution_group(s).xlsx')

smt_august_df.columns = smt_august_df.iloc[11].values
smt_august_df = smt_august_df.iloc[12:]
smt_august_df = smt_august_df.reset_index(drop=True)




In [5]:
celonis_september_df = pd.read_excel('data/20231013102035_CELONIS_EXPORT.xlsx')
smt_september_df = pd.read_excel('data/09_I11_SMT_Incidents_of_own_solution_group(s).xlsx')

smt_september_df.columns = smt_september_df.iloc[11].values
smt_september_df = smt_september_df.iloc[12:]
smt_september_df = smt_september_df.reset_index(drop=True)

In [4]:
celonis_august_df.isnull().sum()

Incident Number                 0
Submit Date                     0
Service Name                    0
Site Group                      0
Site                            0
Incident Type                   0
Ticket Summary                  0
Submitter Group                 0
Last Assigned Organization      0
Last Assigned Group             0
Entry Channel                   0
Last Resolution Date            8
Template Name                 280
User Company                    0
Resolver Organization           0
Requisition ID                387
Reported Source                14
Last Resolution Time            8
CI                            231
Status                          0
Priority                        0
dtype: int64

In [4]:
celonis_august_df = celonis_august_df.drop(['Submit Date', 'Ticket Summary', 'Template Name', 'Requisition ID', 'CI', 'Last Resolution Date'], axis=1)
celonis_august_df.dropna(inplace=True)

In [5]:
celonis_august_df

Unnamed: 0,Incident Number,Service Name,Site Group,Site,Incident Type,Submitter Group,Last Assigned Organization,Last Assigned Group,Entry Channel,User Company,Resolver Organization,Reported Source,Last Resolution Time,Status,Priority
0,INC000025219797,MOZILLA FIREFOX |EMEA,ES,Md,User Service Request,ITSD - IT Service Desk,Service Solutions,ITSD - ES,CI-Hotline / ITSD,EU1,CI-Hotline / ITSD 1st Level,Phone,0.280556,Closed,Low
1,INC000025219902,SCCM PACKAGE INSTALLATION FAILURE,ES,Md,User Service Restoration,ITSD - IT Service Desk,Service Solutions,ITSD - SCCM Installation Support,CI-Hotline / ITSD,C,CI-Hotline / ITSD 2nd Level,Phone,480.225833,Closed,Low
2,INC000025221003,BUSINESS-OBJECTS_APPLICATION,ES,Md1,User Service Restoration,ITSD - IT Service Desk,Data and Digital,BI BO Platform support,CI-Hotline / ITSD,AE,Rest Support Organizations,Phone,1.920278,Closed,Low
3,INC000025221039,ITSD DOCUMENTED,ES,Md1,User Service Request,ITSD - IT Service Desk,Service Solutions,ITSD - ES,CI-Hotline / ITSD,BMG,CI-Hotline / ITSD 1st Level,Chat,749.219167,Closed,Medium
4,INC000025221092,MY COMPUTER - LOCAL SUPPORT |WORLD,ES,Md1,User Service Request,ITSD - IT Service Desk,Regional Services EMEA,Spain1 - EXT,CI-Hotline / ITSD,AE,IT Personal Support,Email,4.697778,Closed,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,INC000025416134,WCMS BGN,ES,Md1,User Service Request,ITSD - IT Service Desk,RBEI,CENTRAL inside Portal Advisor RBEI - EXT,CI-Hotline / ITSD,AE,Rest Support Organizations,Phone,136.461667,Closed,Low
436,INC000025416227,IDM - USERADMINISTRATION | ITSP ORDERFORM ACTI...,ES,Md1,User Service Request,IT Infrastructure Automation,Service Solutions,ITSD - IDM Expert Team,IT Service Portal,AE,CI-Hotline / ITSD 2nd Level,Self Service,2.861111,Closed,Low
437,INC000025416602,IDENTITIES (USERS) NEW / CHANGE / DELETE,ES,Md,User Service Request,IT Infrastructure Automation,Digital Workplace,Identity Lifecycle Support,AutomationBus,BD,Rest Support Organizations,Systems Management,188.251667,Resolved,Low
438,INC000025417081,SCANSYSTEM,ES,Md1,User Service Restoration,ITSD - IT Service Desk,Regional Services EMEA,Spain1 - EXT,CI-Hotline / ITSD,AE,IT Personal Support,Email,18.953333,Closed,Low


In [6]:
output = smt_august_df.isnull().sum()
output.to_csv('data/missing_values_output.csv')
output

Incident ID                                   0
Original Incident Number                    254
Requisition ID                              236
Created Date (UTC+0)                          0
Incident Type                                 0
Status                                        0
Status Reason                                 0
Company                                       0
Customer Department                           1
Region                                        0
Site Group                                    1
Site                                          1
Desk Location                                 1
Reported Source                               0
Summary                                       0
Impact                                        0
Open Priority                                 0
Current Priority                              0
Assigned Group                                0
Assigned Group Department                     0
In Progess Time (hhh:mi)\n              

In [7]:
smt_august_df = smt_august_df.drop(['Created Date (UTC+0)', 'Summary', 'Last Modified Date (UTC+0)', 'First Resolved Date (UTC+0)', 'Last Resolved Date (UTC+0)', 'Notes', 'Resolution', 
                      'Original Incident Number', 'Requisition ID', 'CI+', 'Product Name', 'Resolution Product Name', 'Primary Center Code'], axis=1)
smt_august_df.dropna(inplace=True)

In [8]:
smt_august_df

Unnamed: 0,Incident ID,Incident Type,Status,Status Reason,Company,Customer Department,Region,Site Group,Site,Desk Location,...,Product Categorization Tier 2,Product Categorization Tier 3,Resolution Categorization Tier 1,Resolution Categorization Tier 2,Resolution Categorization Tier 3,Resolution Product Categorization Tier 1,Resolution Product Categorization Tier 2,Resolution Product Categorization Tier 3,Target Date,Created by CI-Hotline
0,INC000024939028,User Service Restoration,Closed,No Further Action Required,GS,GS/HRS2-ES,EMEA,ES,MD,Md,...,End Device Service,Local Software Service,Application,Performance,- None -,End Device,Personal Computer,Standard Notebook Touch,02.08.2023 08:00,Yes
1,INC000025114665,User Service Request,Closed,No Further Action Required,PS,RBEF/LOG,EMEA,ES,AJ,Aj N-101,...,Infrastructure Service,Support Service,Request,- None -,- None -,Service,Infrastructure Service,Support Service,,Yes
2,INC000025172062,User Service Request,Closed,No Further Action Required,AA,AA/CTG1-EU,EMEA,ES,MD,Md,...,End Device Service,Personal Computer,Infrastructure,Hardware,- None -,End Device,Personal Computer,Standard Notebook Touch,01.08.2023 09:00,Yes
3,INC000025184174,User Service Request,Closed,No Further Action Required,GR,FCM2-Md,EMEA,ES,MD,Md 160P1,...,Application Service,Application Service,Application,Performance,- None -,End Device,Personal Computer,Standard Desktop Tiny,,No
4,INC000025197112,User Service Request,Closed,No Further Action Required,C,C/CGR-IB2,EMEA,ES,MD,Md 155P1,...,Application Service,Application Service,Request,- None -,- None -,Service,Application Service,Application Service,15.08.2023 22:00,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,INC000025416771,User Service Request,Resolved,No Further Action Required,PS,RBEF/HRL,EMEA,ES,AJ,Aj N-101,...,Application Service,Application Service,Request,- None -,- None -,Service,Application Service,Application Service,,No
250,INC000025416773,User Service Restoration,Resolved,No Further Action Required,PS,RBEF/HRL,EMEA,ES,AJ,Aj N-101,...,Application Service,Application Service,Application,- None -,- None -,Service,Application Service,Application Service,,No
251,INC000025416876,User Service Request,Resolved,No Further Action Required,PS,PS/MFT-PLA,EMEA,ES,AJ,Aj N-102,...,Business Service,Business Service,Request,- None -,- None -,Service,Business Service,Business Service,,Yes
252,INC000025416879,User Service Request,Resolved,No Further Action Required,PS,PS/MFT-PLA,EMEA,ES,AJ,Aj N-102,...,Application Service,Application Service,Request,- None -,- None -,Service,Application Service,Application Service,,Yes


In [9]:
combined_df = pd.merge(celonis_august_df, smt_august_df, how='inner', left_on='Incident Number', right_on='Incident ID')
combined_df.drop(['Service Name', 'Incident ID','In Progess Time (hhh:mi)\n', 'Actual Duration/Open Time (hhh:mi)', 'Customer Department', 'Desk Location', 
                  'Down Time of CI-Unavailability (hhh:mi)', 'Service+', 'Target Date'], axis=1, inplace=True) #Comprobar Target Date

#Merge en el otro sentido
#combined_df = pd.merge(smt_august_df, celonis_august_df, how='inner', left_on='Incident ID', right_on='Incident Number')
#combined_df.drop(['Service Name', 'Incident Number'], axis=1, inplace=True)

In [10]:
pd.set_option('display.max_columns', None)
combined_df

Unnamed: 0,Incident Number,Site Group_x,Site_x,Incident Type_x,Submitter Group,Last Assigned Organization,Last Assigned Group,Entry Channel,User Company,Resolver Organization,Reported Source_x,Last Resolution Time,Status_x,Priority,Incident Type_y,Status_y,Status Reason,Company,Region,Site Group_y,Site_y,Reported Source_y,Impact,Open Priority,Current Priority,Assigned Group,Assigned Group Department,Operational Categorization Tier 1,Operational Categorization Tier 2,Operational Categorization Tier 3,Product Categorization Tier 1,Product Categorization Tier 2,Product Categorization Tier 3,Resolution Categorization Tier 1,Resolution Categorization Tier 2,Resolution Categorization Tier 3,Resolution Product Categorization Tier 1,Resolution Product Categorization Tier 2,Resolution Product Categorization Tier 3,Created by CI-Hotline
0,INC000025221092,ES,Md1,User Service Request,ITSD - IT Service Desk,Regional Services EMEA,Spain1 - EXT,CI-Hotline / ITSD,AE,IT Personal Support,Email,4.697778,Closed,Low,User Service Request,Closed,No Further Action Required,AE,EMEA,ES,MD1,Email,4-Minor/Localized,Low,Low,Spain1 - EXT,BD/PFE-IA,Request,- None -,- None -,Service,End Device Service,Personal Computer,Application,Performance,- None -,End Device,Personal Computer,Standard Desktop Tiny,Yes
1,INC000025221966,ES,Md,User Service Request,ITSD - IT Service Desk,Regional Services EMEA,Spain1 - EXT,CI-Hotline / ITSD,EU2,IT Personal Support,Phone,4.116667,Closed,Low,User Service Request,Closed,No Further Action Required,EU2,EMEA,ES,MD,Phone,4-Minor/Localized,Low,Low,Spain1 - EXT,BD/PFE-IA,Request,- None -,- None -,Service,End Device Service,Personal Computer,Application,Performance,- None -,End Device,Personal Computer,Standard Notebook Touch,Yes
2,INC000025223159,ES,Md1,User Service Request,ITSD - IT Service Desk,Regional Services EMEA,Spain1 - EXT,CI-Hotline / ITSD,AE,IT Personal Support,Email,23.067500,Closed,Low,User Service Request,Closed,No Further Action Required,AE,EMEA,ES,MD1,Email,4-Minor/Localized,Low,Low,Spain1 - EXT,BD/PFE-IA,Request,- None -,- None -,Service,Business Service,Business Service,Infrastructure,Hardware,- None -,End Device,Personal Computer,Standard Desktop Tiny,Yes
3,INC000025223804,ES,Md1,User Service Request,ITSD - IT Service Desk,Regional Services EMEA,Spain1 - EXT,CI-Hotline / ITSD,AE,IT Personal Support,Email,22.799722,Closed,Low,User Service Request,Closed,No Further Action Required,AE,EMEA,ES,MD1,Email,4-Minor/Localized,Low,Low,Spain1 - EXT,BD/PFE-IA,Request,- None -,- None -,Service,IT Service,IT Service,Application,Performance,- None -,End Device,Personal Computer,Standard Desktop Tiny,Yes
4,INC000025224626,ES,Md,User Service Request,ITSD - IT Service Desk,Regional Services EMEA,Spain1,IT Service Portal,AA,IT Personal Support,Self Service,24.208611,Closed,Low,User Service Request,Closed,No Further Action Required,AA,EMEA,ES,MD,Self Service,4-Minor/Localized,Low,Low,Spain1,BD/PFE-IA,Request,- None -,- None -,Service,End Device Service,Mobile Device Service,Request,Hardware,- None -,Service,End Device Service,Mobile Device Service,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,INC000025414574,ES,Md1,User Service Restoration,Spain1,Regional Services EMEA,Spain1,IT Personal Support,AE,IT Personal Support,Walk In,0.017778,Closed,Low,User Service Restoration,Resolved,No Further Action Required,AE,EMEA,ES,MD1,Walk In,4-Minor/Localized,Low,Low,Spain1,BD/PFE-IA,Failure,- None -,- None -,Service,IT Service,IT Service,Application,- None -,- None -,Service,IT Service,IT Service,No
164,INC000025415111,ES,Md1,User Service Restoration,Spain1,Regional Services EMEA,Spain1,IT Personal Support,AE,IT Personal Support,Walk In,0.003333,Closed,Low,User Service Restoration,Resolved,No Further Action Required,AE,EMEA,ES,MD1,Walk In,4-Minor/Localized,Low,Low,Spain1,BD/PFE-IA,Failure,- None -,- None -,Service,End Device Service,Personal Computer,Application,- None -,- None -,End Device,Personal Computer,Convertible Notebook,Yes
165,INC000025415115,ES,Md,User Service Request,Spain1,Regional Services EMEA,Spain1,IT Personal Support,GR,IT Personal Support,Walk In,0.001389,Closed,Low,User Service Request,Resolved,No Further Action Required,GR,EMEA,ES,MD,Walk In,4-Minor/Localized,Low,Low,Spain1,BD/PFE-IA,Request,- None -,- None -,Service,End Device Service,Mobile Device Service,Request,- None -,- None -,End Device,Mobile Device,Standard Smartphone,Yes
166,INC000025415118,ES,Md,User Service Request,Spain1,Regional Services EMEA,Spain1,IT Personal Support,GR,IT Personal Support,Walk In,0.002222,Closed,Low,User Service Request,Resolved,No Further Action Required,GR,EMEA,ES,MD,Walk In,4-Minor/Localized,Low,Low,Spain1,BD/PFE-IA,Request,- None -,- None -,Service,End Device Service,Mobile Device Service,Request,- None -,- None -,End Device,Mobile Device,Standard Smartphone,Yes


In [11]:
columns_list = combined_df.columns
columns_list = [elem for elem in columns_list if elem not in ['Incident Number', 'Last Resolution Time', 'Incident ID']] 
combined_df_one_hot = pd.get_dummies(combined_df, columns=columns_list)
combined_df_one_hot

Unnamed: 0,Incident Number,Last Resolution Time,Site Group_x_ES,Site_x_Md,Site_x_Md1,Incident Type_x_Infrastructure Restoration,Incident Type_x_User Service Request,Incident Type_x_User Service Restoration,Submitter Group_Human Resources - Personnel Administration,Submitter Group_IT Infrastructure Automation,Submitter Group_ITSD - IT Service Desk,Submitter Group_Spain1,Submitter Group_Spain1 - EXT,Last Assigned Organization_Regional Services EMEA,Last Assigned Group_Spain1,Last Assigned Group_Spain1 - EXT,Entry Channel_AutomationBus,Entry Channel_CI-Hotline / ITSD,Entry Channel_IT Personal Support,Entry Channel_IT Service Portal,Entry Channel_IT Support Page,Entry Channel_Other,User Company_AA,User Company_AE,User Company_BD,User Company_BMG,User Company_BT,User Company_C,User Company_EU1,User Company_EU2,User Company_GR,User Company_GS,User Company_HC,User Company_M,User Company_PT,User Company_SO,Resolver Organization_IT Personal Support,Reported Source_x_Chat,Reported Source_x_Direct Input,Reported Source_x_Email,Reported Source_x_Phone,Reported Source_x_Self Service,Reported Source_x_Systems Management,Reported Source_x_Walk In,Status_x_Closed,Priority_Low,Priority_Medium,Incident Type_y_Infrastructure Restoration,Incident Type_y_User Service Request,Incident Type_y_User Service Restoration,Status_y_Closed,Status_y_Resolved,Status Reason_Customer Follow-Up Required,Status Reason_No Further Action Required,Company_AA,Company_AE,Company_BD,Company_BMG,Company_BT,Company_C,Company_EU1,Company_EU2,Company_GR,Company_GS,Company_HC,Company_M,Company_PT,Company_SO,Region_EMEA,Site Group_y_ES,Site_y_MD,Site_y_MD1,Reported Source_y_Chat,Reported Source_y_Direct Input,Reported Source_y_Email,Reported Source_y_Phone,Reported Source_y_Self Service,Reported Source_y_Systems Management,Reported Source_y_Walk In,Impact_3-Moderate/Limited,Impact_4-Minor/Localized,Open Priority_-1,Open Priority_Low,Open Priority_Medium,Current Priority_Low,Current Priority_Medium,Assigned Group_Spain1,Assigned Group_Spain1 - EXT,Assigned Group Department_BD/PFE-IA,Operational Categorization Tier 1_Failure,Operational Categorization Tier 1_Request,Operational Categorization Tier 2_- None -,Operational Categorization Tier 2_Add,Operational Categorization Tier 2_Application,Operational Categorization Tier 2_Consult,Operational Categorization Tier 2_Reset,Operational Categorization Tier 3_- None -,Operational Categorization Tier 3_Password,Product Categorization Tier 1_Service,Product Categorization Tier 2_Application Service,Product Categorization Tier 2_Business Service,Product Categorization Tier 2_Commercial Service,Product Categorization Tier 2_End Device Service,Product Categorization Tier 2_IT Service,Product Categorization Tier 2_Infrastructure Service,Product Categorization Tier 3_Application Service,Product Categorization Tier 3_Business Service,Product Categorization Tier 3_Communication Service,Product Categorization Tier 3_File Service,Product Categorization Tier 3_IT Service,Product Categorization Tier 3_Local Software Service,Product Categorization Tier 3_Mobile Device Service,Product Categorization Tier 3_Peacy Package,Product Categorization Tier 3_Personal Computer,Product Categorization Tier 3_Print Service,Product Categorization Tier 3_SAP Business Service,Product Categorization Tier 3_Security Service,Product Categorization Tier 3_Support Service,Resolution Categorization Tier 1_Application,Resolution Categorization Tier 1_Handling,Resolution Categorization Tier 1_Infrastructure,Resolution Categorization Tier 1_Request,Resolution Categorization Tier 2_- None -,Resolution Categorization Tier 2_Documentation,Resolution Categorization Tier 2_Education,Resolution Categorization Tier 2_Hardware,Resolution Categorization Tier 2_Network Performance,Resolution Categorization Tier 2_Performance,Resolution Categorization Tier 2_Software,Resolution Categorization Tier 2_Unclear,Resolution Categorization Tier 2_User Error,Resolution Categorization Tier 2_User Rights,Resolution Categorization Tier 3_- None -,Resolution Product Categorization Tier 1_End Device,Resolution Product Categorization Tier 1_Service,Resolution Product Categorization Tier 2_Application Service,Resolution Product Categorization Tier 2_Business Service,Resolution Product Categorization Tier 2_Commercial Service,Resolution Product Categorization Tier 2_End Device Service,Resolution Product Categorization Tier 2_IT Service,Resolution Product Categorization Tier 2_Industrial Equipment,Resolution Product Categorization Tier 2_Infrastructure Service,Resolution Product Categorization Tier 2_Mobile Device,Resolution Product Categorization Tier 2_Personal Computer,Resolution Product Categorization Tier 2_Printer,Resolution Product Categorization Tier 3_Application Service,Resolution Product Categorization Tier 3_Business Service,Resolution Product Categorization Tier 3_Color Laser A3 MFP,Resolution Product Categorization Tier 3_Communication Service,Resolution Product Categorization Tier 3_Convertible Notebook,Resolution Product Categorization Tier 3_Enhanced Notebook,Resolution Product Categorization Tier 3_Enhanced Smartphone,Resolution Product Categorization Tier 3_Feature Phone,Resolution Product Categorization Tier 3_File Service,Resolution Product Categorization Tier 3_IT Service,Resolution Product Categorization Tier 3_Mobile Device Service,Resolution Product Categorization Tier 3_Monochrome Laser A4 MFP,Resolution Product Categorization Tier 3_Monochrome Laser A4 Performance Printer,Resolution Product Categorization Tier 3_Peacy Package,Resolution Product Categorization Tier 3_Performance Notebook,Resolution Product Categorization Tier 3_Personal Computer,Resolution Product Categorization Tier 3_SAP Business Service,Resolution Product Categorization Tier 3_Security Service,Resolution Product Categorization Tier 3_Smart Handheld and Wearables,Resolution Product Categorization Tier 3_Standard Desktop,Resolution Product Categorization Tier 3_Standard Desktop Tiny,Resolution Product Categorization Tier 3_Standard Notebook,Resolution Product Categorization Tier 3_Standard Notebook Touch,Resolution Product Categorization Tier 3_Standard Smartphone,Resolution Product Categorization Tier 3_Standard iPad (WWAN),Resolution Product Categorization Tier 3_Support Service,Created by CI-Hotline_No,Created by CI-Hotline_Yes
0,INC000025221092,4.697778,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,INC000025221966,4.116667,1,1,0,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,INC000025223159,23.067500,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,INC000025223804,22.799722,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,INC000025224626,24.208611,1,1,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1,0,1,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,INC000025414574,0.017778,1,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,1,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
164,INC000025415111,0.003333,1,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,1,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
165,INC000025415115,0.001389,1,1,0,0,1,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
166,INC000025415118,0.002222,1,1,0,0,1,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1


In [12]:
important_columns = ['Incident Number', 'Site Group_x', 'Site_x', 'Incident Type_x', 'Submitter Group', 'Last Assigned Organization', 'Last Assigned Group', 'Entry Channel', 'User Company', 'Resolver Organization', 
                     'Reported Source', 'Status_x', '	Priority', 'Incident Type', 'Status_y', 'Status Reason', 'Company', 'Region', 'Site Group_y', 'Site_y', 'Reported Source_y', 'Impact', 'Open Priority',
                     'Current Priority', 'Assigned Group', 'Assigned Group Department', 'Operational Categorization Tier 1', 'Operational Categorization Tier 2', 'Operational Categorization Tier 3', 
                     'Product Categorization Tier 1', 'Product Categorization Tier 2', 'Product Categorization Tier 3', 'Resolution Categorization Tier 1', 'Resolution Categorization Tier 2', 'Resolution Categorization Tier 3',
                     'Resolution Product Categorization Tier 1', 'Resolution Product Categorization Tier 2', 'Resolution Product Categorization Tier 3', 'Created by CI-Hotline']
doubt_columns = ['User Company', 'Status_x', 'Status_y', 'Company', 'Region']
many_values = ['Customer Department', 'Desk Location', 'Service+'] 
#Desk Location en duda, tiene como 40 valores
#Service+ puede ser importante aunque tenga muchos valores, unos 40

In [13]:
X = combined_df_one_hot.drop(['Incident Number', 'Last Resolution Time'], axis=1).values
y = combined_df_one_hot['Last Resolution Time'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [14]:
gbr = GradientBoostingRegressor(random_state=RANDOM_STATE, n_estimators=600, max_depth=4, learning_rate=0.01, loss='squared_error')

gbr.fit(X_train, y_train)

In [15]:
predictions = gbr.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print("Error cuadrático medio (MSE) en el conjunto de prueba: {:.4f}".format(mse))
print("Raíz del error cuadrático medio (RMSE) en el conjunto de prueba: {:.4f}".format(sqrt(mse)))


predictions

The mean squared error (MSE) on test set: 771.0912


array([-7.61477911e-02,  2.47384714e+01, -7.61477911e-02,  2.41492224e+01,
        3.49666035e+01,  1.47284686e-01,  2.54445485e+01,  2.91540575e-01,
        4.08538773e+00,  1.47284686e-01,  2.91540575e-01,  5.88056577e+00,
        2.55476960e+01,  5.97473815e+00, -7.61477911e-02,  5.88056577e+00,
        2.50768602e+01,  2.91540575e-01,  2.78323837e+01, -4.14536595e-01,
       -7.61477911e-02,  7.83374832e+00,  1.47284686e-01,  5.88201485e+00,
        1.30236882e+01,  9.58504182e+01,  2.54538041e+01, -4.14536595e-01,
        7.07361410e+01, -7.61477911e-02,  1.35682132e+02,  4.59577241e+01,
        2.91540575e-01,  5.88201485e+00])

In [16]:
X = combined_df_one_hot.drop(['Incident Number', 'Last Resolution Time'], axis=1).values
y = combined_df_one_hot['Last Resolution Time'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

sgd = SGDRegressor(random_state=RANDOM_STATE)

sgd.fit(X_train, y_train)

predictions = sgd.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

predictions

The mean squared error (MSE) on test set: 858.5923


array([ -2.17472946,  86.67999797,   5.10560715,  55.86650852,
        47.82977627,  -9.45779477,  64.36885198,   2.19652612,
        -4.06652238,   3.01741766,  -0.64064952,   5.9248146 ,
        29.8176154 ,   6.71855325,  30.76712811,   5.9248146 ,
        66.25068092,  -8.93444312,  73.74476292,  13.37670287,
         4.8720937 ,   0.49030549,  25.19352579,   0.59735233,
       -12.35400614,  49.05411502,  26.37407126,  22.4806692 ,
        85.86458745,   5.10560715, 105.28276377,  84.52385762,
         5.64633366,  17.12809624])

In [17]:
len(combined_df_one_hot.index) * 0.2 # 33.6 aproxima a 34
first_test_index = len(combined_df_one_hot.index) - 34
X_test_df = combined_df_one_hot.iloc[first_test_index:].drop(['Incident Number', 'Last Resolution Time'], axis=1)
y_test_df = combined_df_one_hot.iloc[first_test_index:][['Last Resolution Time']]

In [18]:
explainer = RegressionExplainer(gbr, X_test_df, y_test_df)

ExplainerDashboard(explainer).run()


Generating self.shap_explainer = shap.TreeExplainer(model)
Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating shap values...


Calculating predictions...
Calculating residuals...
Calculating absolute residuals...
Calculating shap interaction values...
Reminder: TreeShap computational complexity is O(TLD^2), where T is the number of trees, L is the maximum number of leaves in any tree and D the maximal depth of any tree. So reducing these will speed up the calculation.
Calculating dependencies...
Calculating importances...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard on http://192.168.1.40:8050


ConnectionError: HTTPConnectionPool(host='0.0.0.0', port=8050): Max retries exceeded with url: /_alive_a6b7391f-8d4e-40ae-ba23-e8aee38e86fa (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002062DE5CB50>: Failed to establish a new connection: [WinError 10049] La dirección solicitada no es válida en este contexto'))

In [19]:
explainer = RegressionExplainer(sgd, X_test_df, y_test_df)

ExplainerDashboard(explainer).run()

Generating self.shap_explainer = shap.KernelExplainer(model, X)...
Building ExplainerDashboard..
Detected notebook environment, consider setting mode='external', mode='inline' or mode='jupyterlab' to keep the notebook interactive while the dashboard is running...
For this type of model and model_output interactions don't work, so setting shap_interaction=False...
The explainer object has no decision_trees property. so setting decision_trees=False...
Generating layout...
Calculating shap values...


100%|██████████| 34/34 [00:22<00:00,  1.50it/s]

Calculating predictions...
Calculating residuals...
Calculating absolute residuals...





Calculating dependencies...
Calculating importances...
Reminder: you can store the explainer (including calculated dependencies) with explainer.dump('explainer.joblib') and reload with e.g. ClassifierExplainer.from_file('explainer.joblib')
Registering callbacks...
Starting ExplainerDashboard on http://192.168.1.40:8050


ConnectionError: HTTPConnectionPool(host='0.0.0.0', port=8050): Max retries exceeded with url: /_alive_a6b7391f-8d4e-40ae-ba23-e8aee38e86fa (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002062E2CB340>: Failed to establish a new connection: [WinError 10049] La dirección solicitada no es válida en este contexto'))

In [None]:
#pd.reset_option('display.max_columns')
