In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [7]:
file = "data/equipment_maintenance_data.csv"
df = pd.read_csv(file)
df.head()

Unnamed: 0,equipment_id,equipment_type,location,install_date,last_service_date,next_scheduled_service,age_days,runtime_hours,temperature,vibration_level,power_consumption_kw,humidity_level,error_codes_count,manual_override,downtime_last_30d,service_priority,failure_within_7_days
0,dda080ad-7ead-4676-92e2-b6d818addfcc,HVAC,Building A,2023-08-04,2024-01-26,2024-03-31,742,5496.71,73.62,3.97,27.62,39.36,0,0,0.12,Medium,0
1,ae06aedb-0777-4db1-9569-4100c15b0714,Boiler,Building A,2022-12-26,2023-01-25,2023-03-19,963,5279.04,85.11,2.13,17.37,41.0,1,0,1.13,Low,0
2,996075e0-3992-4692-972c-8c19f5a66494,Boiler,Building A,2022-11-14,2024-10-12,2025-03-30,1005,5816.45,59.76,2.36,16.29,51.98,2,1,1.44,Low,0
3,ad28b74b-5d3d-48dd-868a-414bef8e2822,Boiler,Building C,2023-11-11,2023-11-15,2024-01-24,643,4455.62,76.11,1.27,21.88,33.9,4,1,2.31,Low,0
4,20f85bff-9835-4c24-841b-6e16984d60a5,Elevator,Building B,2024-06-14,2024-10-26,2024-12-18,427,3978.45,73.38,2.2,19.97,84.56,1,1,0.75,Low,0


#### Data check performance
1) Check missing values
2) Check duplicates
3) Check data type
4) Check special charactors
5) Check the number of unique valuesof each column
6) Check statistics of data set
7) Check various categories present in the different category columns

In [8]:
df.isnull().sum()  # Check for missing values 

equipment_id              0
equipment_type            0
location                  0
install_date              0
last_service_date         0
next_scheduled_service    0
age_days                  0
runtime_hours             0
temperature               0
vibration_level           0
power_consumption_kw      0
humidity_level            0
error_codes_count         0
manual_override           0
downtime_last_30d         0
service_priority          0
failure_within_7_days     0
dtype: int64

In [9]:
df.duplicated().sum()  # Check for duplicate rows

np.int64(0)

In [10]:
df.info() # Check data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   equipment_id            1000 non-null   object 
 1   equipment_type          1000 non-null   object 
 2   location                1000 non-null   object 
 3   install_date            1000 non-null   object 
 4   last_service_date       1000 non-null   object 
 5   next_scheduled_service  1000 non-null   object 
 6   age_days                1000 non-null   int64  
 7   runtime_hours           1000 non-null   float64
 8   temperature             1000 non-null   float64
 9   vibration_level         1000 non-null   float64
 10  power_consumption_kw    1000 non-null   float64
 11  humidity_level          1000 non-null   float64
 12  error_codes_count       1000 non-null   int64  
 13  manual_override         1000 non-null   int64  
 14  downtime_last_30d       1000 non-null   f

In [13]:
df.nunique()  # Check the number of unique values in each column

equipment_id              1000
equipment_type               5
location                     5
install_date               731
last_service_date          678
next_scheduled_service     699
age_days                   731
runtime_hours              999
temperature                882
vibration_level            492
power_consumption_kw       780
humidity_level             934
error_codes_count            6
manual_override              2
downtime_last_30d          425
service_priority             3
failure_within_7_days        2
dtype: int64

In [15]:
df.describe()  # Get statistics of the dataset

Unnamed: 0,age_days,runtime_hours,temperature,vibration_level,power_consumption_kw,humidity_level,error_codes_count,manual_override,downtime_last_30d,failure_within_7_days
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1090.827,5000.65231,74.4823,2.97841,20.39587,60.6458,1.401,0.478,1.90499,0.048
std,411.299828,995.730163,9.780358,1.570604,4.938212,17.350458,1.130255,0.499766,1.978708,0.213873
min,368.0,1980.49,41.52,-1.65,5.52,30.32,0.0,0.0,0.0,0.0
25%,754.0,4347.915,68.02,1.9175,17.2575,45.57,1.0,0.0,0.5075,0.0
50%,1089.0,4975.31,74.945,3.025,20.175,61.13,1.0,0.0,1.3,0.0
75%,1441.0,5675.1575,80.9125,4.06,23.67,75.735,2.0,1.0,2.55,0.0
max,1823.0,7935.66,104.85,7.73,35.97,89.96,5.0,1.0,13.09,1.0


### Insight
- From the above description of numerical data, has a big gap in mean from .048 to 5000
- Standard deviation also has a big gap from .21 to 995.73 
- Minimum value also from -1 vibration level to 1980 runtime hours

In [16]:
df.shape  # Get the shape of the dataset

(1000, 17)

### EXPLORER

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   equipment_id            1000 non-null   object 
 1   equipment_type          1000 non-null   object 
 2   location                1000 non-null   object 
 3   install_date            1000 non-null   object 
 4   last_service_date       1000 non-null   object 
 5   next_scheduled_service  1000 non-null   object 
 6   age_days                1000 non-null   int64  
 7   runtime_hours           1000 non-null   float64
 8   temperature             1000 non-null   float64
 9   vibration_level         1000 non-null   float64
 10  power_consumption_kw    1000 non-null   float64
 11  humidity_level          1000 non-null   float64
 12  error_codes_count       1000 non-null   int64  
 13  manual_override         1000 non-null   int64  
 14  downtime_last_30d       1000 non-null   f

In [22]:
print("Categories in 'equipment_id' variable:     ",end=" " )
print(df['equipment_id'].unique())

print("Categories in 'equipment_type' variable:  ",end=" ")
print(df['equipment_type'].unique())

print("Categories in'location' variable:",end=" " )
print(df['location'].unique())

print("Categories in 'install_date' variable:     ",end=" " )
print(df['install_date'].unique())

print("Categories in 'last_service_date' variable:     ",end=" " )
print(df['last_service_date'].unique())

print("Categories in 'next_scheduled_service' variable:     ",end=" " )
print(df['next_scheduled_service'].unique())

print("Categories in 'service_priority' variable:     ",end=" " )
print(df['service_priority'].unique())

Categories in 'equipment_id' variable:      ['dda080ad-7ead-4676-92e2-b6d818addfcc'
 'ae06aedb-0777-4db1-9569-4100c15b0714'
 '996075e0-3992-4692-972c-8c19f5a66494'
 'ad28b74b-5d3d-48dd-868a-414bef8e2822'
 '20f85bff-9835-4c24-841b-6e16984d60a5'
 '2a6a6d3f-e8d9-4df7-859b-25d3c3521729'
 'fc7f2bea-c73f-49ae-981d-0d84750e5852'
 '9ef3c060-c2f1-4889-aa7c-3fb0f942b93f'
 '73f21f8a-d787-421c-ac39-f00c039dbf94'
 'a9fd7643-0fa0-472a-b72d-7954dd0b73a2'
 '1e0f7f59-a1c7-412c-a81c-902697ac476d'
 '60b1daab-c125-47da-b20b-269c16ce87fc'
 '4c6b65f7-32dc-4b1e-b12d-2e050617e442'
 'f88a6f3b-c7f5-4357-b29d-425c39859458'
 'f116f4c6-14b9-42e2-9eb1-94200c4283f8'
 'ac12de5e-dbc1-4f27-a0de-bff4117935df'
 'd9cec206-57a2-472f-8655-b5e9cf12a76b'
 'a49650ec-62c8-4a9c-8053-f04c49cb7c6e'
 '67cb6a40-8c9c-4f8b-8e28-8784f2be51f1'
 '0d87c814-8ad8-4891-acc1-19e9dae2c542'
 '106f9500-f328-4d03-8721-a7bd97bff654'
 'd0ab6669-44e6-4447-bdd6-e005912ef5d5'
 '4d78bdfa-8ada-4456-8b17-ffe395a37ede'
 '945a3b0f-18ae-44a6-a098-61350da3f2

In [23]:
# define numerical & categorical columns
numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']

# print columns
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 10 numerical features : ['age_days', 'runtime_hours', 'temperature', 'vibration_level', 'power_consumption_kw', 'humidity_level', 'error_codes_count', 'manual_override', 'downtime_last_30d', 'failure_within_7_days']

We have 7 categorical features : ['equipment_id', 'equipment_type', 'location', 'install_date', 'last_service_date', 'next_scheduled_service', 'service_priority']


In [24]:
df.to_csv("data/eq_maintenance_updated_data.csv", index=False)