In [45]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
from pandas_profiling import ProfileReport
from pathlib import Path

In [46]:
# Lift pandas' display limits so long and wide frames render without truncation.
for option_name in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

In [47]:
# Timestamp columns that read_csv should parse into datetimes on load.
timestamp_columns = ['Open_Time', 'Reopen_Time', 'Resolved_Time', 'Close_Time']
df = pd.read_csv("1.a.Detail_Incident.csv", parse_dates=timestamp_columns)

In [48]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

CI_Name_aff                           object
CI_Type_aff                           object
CI_Subtype_aff                        object
Service_Component_WBS_aff             object
Incident_ID                           object
Status                                object
Impact                                 int64
Urgency                                int64
Priority                               int64
Category                              object
KM_number                             object
Alert_Status                          object
Count_Reassignments                  float64
Open_Time                     datetime64[ns]
Reopen_Time                   datetime64[ns]
Resolved_Time                 datetime64[ns]
Close_Time                    datetime64[ns]
Handle_Time_Hours                    float64
Closure_Code                          object
Count_Related_Interactions           float64
Related_Interaction                   object
Count_Related_Incidents              float64
Count_Rela

## Drop Records where Resolved_Time is Missing

In [49]:
# Count missing values in each of the four timestamp columns.
# Columns are selected by name rather than by position so the check keeps
# targeting the right columns even if the column order of the CSV changes.
df[['Open_Time', 'Reopen_Time', 'Resolved_Time', 'Close_Time']].isnull().sum()

Open_Time            0
Reopen_Time      44322
Resolved_Time     1780
Close_Time           0
dtype: int64

In [50]:
# Keep only the rows that have a Resolved_Time value.
has_resolved_time = df['Resolved_Time'].notna()
df = df[has_resolved_time]

In [51]:
# Re-count missing values in the four timestamp columns after the drop.
# Columns are selected by name rather than by position so the check keeps
# targeting the right columns even if the column order of the CSV changes.
df[['Open_Time', 'Reopen_Time', 'Resolved_Time', 'Close_Time']].isnull().sum()

Open_Time            0
Reopen_Time      42607
Resolved_Time        0
Close_Time           0
dtype: int64

## Limit timeframe of all records

Keep only records opened on or after 1 October 2013

and on or before 31 March 2014.


In [52]:
# Keep incidents opened inside the study window: 1 October 2013 through
# 31 March 2014, both days included.
# - ISO-formatted dates avoid any day/month ambiguity when parsed.
# - The upper bound is exclusive at 1 April 2014 so that every record opened
#   on 31 March is kept regardless of its time of day.
window_start = pd.Timestamp('2013-10-01')
window_end = pd.Timestamp('2014-04-01')
opened_in_window = (df['Open_Time'] >= window_start) & (df['Open_Time'] < window_end)
df = df[opened_in_window]

In [53]:
# Summarise the four timestamp columns to check which date range remains.
# Columns are selected by name rather than by position so the summary keeps
# targeting the right columns even if the column order of the CSV changes.
df[['Open_Time', 'Reopen_Time', 'Resolved_Time', 'Close_Time']].describe()

Unnamed: 0,Open_Time,Reopen_Time,Resolved_Time,Close_Time
count,43709,2038,43709,43709
unique,43455,2036,43496,43500
top,2014-01-22 15:46:06,2013-11-12 10:36:33,2013-11-22 16:34:33,2014-02-27 15:04:32
freq,3,2,3,3
first,2013-10-01 07:33:21,2013-10-01 11:43:47,2013-10-01 08:18:27,2013-10-01 08:18:30
last,2014-03-31 17:24:49,2014-03-31 16:21:15,2014-03-31 22:47:29,2014-03-31 22:47:32


## Determine if 'work in progress' remains

In [54]:
# Tally how many records carry each Status value.
df['Status'].value_counts()

Closed              43700
Work in progress        9
Name: Status, dtype: int64

It does, so remove those records.

In [55]:
# Retain only the records whose Status is exactly 'Closed'.
is_closed = df['Status'].eq('Closed')
df = df.loc[is_closed]

## Remove non-incident records

In [56]:
# Tally how many records fall under each Category value.
df['Category'].value_counts()

incident                   35208
request for information     8482
complaint                      9
request for change             1
Name: Category, dtype: int64

In [57]:
# Retain only the records whose Category is exactly 'incident'.
is_incident = df['Category'].eq('incident')
df = df.loc[is_incident]

In [58]:
# Print the value counts of the three columns that are about to be dropped,
# to confirm each one now holds a single value.
for column_name in ('Category', 'Status', 'Alert_Status'):
    print(df[column_name].value_counts())

incident    35208
Name: Category, dtype: int64
Closed    35208
Name: Status, dtype: int64
closed    35208
Name: Alert_Status, dtype: int64


## Drop the columns with constant values

In [59]:
# Remove the three columns from the frame.
constant_columns = ['Category', 'Status', 'Alert_Status']
df = df.drop(columns=constant_columns)

In [60]:
# Preview the first five rows of the frame as it stands after the steps above.
df.head()

Unnamed: 0,CI_Name_aff,CI_Type_aff,CI_Subtype_aff,Service_Component_WBS_aff,Incident_ID,Impact,Urgency,Priority,KM_number,Count_Reassignments,Open_Time,Reopen_Time,Resolved_Time,Close_Time,Handle_Time_Hours,Closure_Code,Count_Related_Interactions,Related_Interaction,Count_Related_Incidents,Count_Related_Changes,Related_Change,CI_Name_CBy,CI_Type_CBy,CI_Subtype_CBy,ServiceComp_WBS_CBy
1125,APP000005,application,Citrix,WBS000292,IM0001224,4,4,4,KM0001060,0.0,2013-10-01 07:33:21,NaT,2013-10-01 08:36:09,2013-10-01 08:37:28,1.068611,Software,1.0,SD0001630,,,,APP000005,application,Citrix,WBS000292
1127,DSK000457,computer,Desktop,WBS000187,IM0001226,5,5,5,KM0001446,2.0,2013-10-01 08:18:38,NaT,2013-10-07 11:51:53,2013-10-07 11:52:00,45.556111,Hardware,1.0,SD0001635,,,,DSK000457,computer,Desktop,WBS000187
1130,SBA000263,application,Server Based Application,WBS000072,IM0001229,5,5,5,KM0000644,12.0,2013-10-01 08:36:09,NaT,2013-10-07 16:55:08,2013-10-07 16:55:38,50.324722,Other,1.0,SD0001644,,,,#N/B,#N/B,#N/B,#N/B
1131,SBA000154,application,Server Based Application,WBS000027,IM0001230,5,5,5,KM0001291,1.0,2013-10-01 08:39:37,NaT,2013-10-01 15:39:47,2013-10-01 15:39:51,7.003889,Other,1.0,SD0001640,,,,SBA000154,application,Server Based Application,WBS000027
1132,LAP000019,computer,Laptop,WBS000091,IM0001231,5,5,5,KM0000315,0.0,2013-10-01 08:40:00,NaT,2013-10-02 11:30:13,2013-10-02 11:30:19,0.020278,Other,1.0,SD0001638,,,,KYB000001,hardware,Keyboard,WBS000091


## END and OUTPUT

In [61]:
# Write the cleaned frame to CSV without the index column.
# The path is handed straight to pandas instead of a handle from open():
# a text handle opened without newline='' makes to_csv emit doubled carriage
# returns on Windows, and open() would use the locale's default encoding
# rather than pandas' UTF-8 default.
df.to_csv("2.a.Detail_Incident.csv", index=False)

In [62]:
# Renumber the rows 0..n-1 (discarding the old index), then build the
# profiling report for the cleaned data.
df = df.reset_index(drop=True)
profile = ProfileReport(
    df,
    title="Profile of BPIC 2014 Detail_Incident Data after Secondary Cleaning",
    html={'style': {'full_width': True}},
)

In [63]:
# Save the profiling report as an HTML file next to the notebook.
report_path = Path("2.b.Detail_Incident_Profile.html")
profile.to_file(report_path)