## 1. Import Data and Required Packages
### Importing Pandas, NumPy, Matplotlib, Seaborn and the Warnings Library

In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Suppress every warning from here on so cell outputs stay uncluttered.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. SettingWithCopyWarning,
# FutureWarning) that can point at real problems — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [21]:
# Read All_vib.csv (stored under artifacts\data_ingestion\data) into a DataFrame.
# NOTE(review): this is a machine-specific absolute path; the notebook will not run elsewhere
# until it is pointed at a configurable data directory.
file_path = r"C:\Users\LENOVO\1. Projects\Bodycode-Anomaly-Detection-Project.\artifacts\data_ingestion\data\All_vib.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [22]:
# Preview the first five rows to confirm the CSV parsed into the expected columns.
df.head()

Unnamed: 0,Sensor_Name,TimeStamp,fatigue_mmps,impact_mg0,friction_mg0,crest,Temperature_C,Temperature_F,status
0,V9 VVB001 Fan Motor,01/11/2023 02:18:58,0.0,20.394324,10.197162,3.8,25.9,78.62,Device is OK
1,V9 VVB001 Fan Motor,01/11/2023 02:48:38,0.1,30.591486,10.197162,4.1,26.5,79.7,Device is OK
2,V9 VVB001 Fan Motor,01/11/2023 02:49:38,0.1,30.591486,10.197162,4.4,26.5,79.7,Device is OK
3,V9 VVB001 Fan Motor,01/11/2023 05:36:46,0.1,30.591486,10.197162,3.8,24.5,76.1,Device is OK
4,V9 VVB001 Fan Motor,01/11/2023 05:37:47,0.1,30.591486,10.197162,3.7,24.5,76.1,Device is OK


## 2. Data Checks to Perform
1. Check Missing values
2. Check Duplicates
3. Check data type
4. Check the number of unique values of each column
5. Check statistics of the data set
6. Check the categories present in each categorical column

In [23]:
# Count missing (NaN/None) values per column.
df.isna().sum()

Sensor_Name      0
TimeStamp        0
fatigue_mmps     0
impact_mg0       0
friction_mg0     0
crest            0
Temperature_C    0
Temperature_F    0
status           0
dtype: int64

In [24]:
# Count rows that repeat an earlier row in every column (first occurrence is not counted).
df.duplicated().sum()

74

In [25]:
# Remove rows that are exact duplicates across all columns, keeping the first occurrence.
# Rebinding `df` (instead of inplace=True) leaves the previously loaded object untouched
# and keeps the step chainable; the original row labels are kept, so the index has gaps.
df = df.drop_duplicates()

In [26]:
# Preview the first five rows again after removing duplicate rows.
df.head()

Unnamed: 0,Sensor_Name,TimeStamp,fatigue_mmps,impact_mg0,friction_mg0,crest,Temperature_C,Temperature_F,status
0,V9 VVB001 Fan Motor,01/11/2023 02:18:58,0.0,20.394324,10.197162,3.8,25.9,78.62,Device is OK
1,V9 VVB001 Fan Motor,01/11/2023 02:48:38,0.1,30.591486,10.197162,4.1,26.5,79.7,Device is OK
2,V9 VVB001 Fan Motor,01/11/2023 02:49:38,0.1,30.591486,10.197162,4.4,26.5,79.7,Device is OK
3,V9 VVB001 Fan Motor,01/11/2023 05:36:46,0.1,30.591486,10.197162,3.8,24.5,76.1,Device is OK
4,V9 VVB001 Fan Motor,01/11/2023 05:37:47,0.1,30.591486,10.197162,3.7,24.5,76.1,Device is OK


In [27]:
# Print column dtypes and non-null counts.
# In the recorded output TimeStamp is still `object` (plain strings) at this point.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160550 entries, 0 to 160623
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Sensor_Name    160550 non-null  object 
 1   TimeStamp      160550 non-null  object 
 2   fatigue_mmps   160550 non-null  float64
 3   impact_mg0     160550 non-null  float64
 4   friction_mg0   160550 non-null  float64
 5   crest          160550 non-null  float64
 6   Temperature_C  160550 non-null  float64
 7   Temperature_F  160550 non-null  float64
 8   status         160550 non-null  object 
dtypes: float64(6), object(3)
memory usage: 12.2+ MB


In [28]:
# Summary statistics for the numeric columns.
# In the recorded output the impact_mg0 and friction_mg0 maxima sit far above their 75th percentiles.
df.describe()

Unnamed: 0,fatigue_mmps,impact_mg0,friction_mg0,crest,Temperature_C,Temperature_F
count,160550.0,160550.0,160550.0,160550.0,160550.0,160550.0
mean,0.127472,103.761317,28.032573,3.812348,20.704534,69.268161
std,0.112865,54.788068,12.669365,0.373445,5.241238,9.434228
min,0.0,20.394324,10.197162,2.3,9.9,49.82
25%,0.1,91.774459,20.394324,3.6,16.7,62.06
50%,0.1,112.168783,30.591486,3.7,20.7,69.26
75%,0.1,112.168783,30.591486,4.0,24.1,75.38
max,1.6,3874.921598,968.730399,11.9,39.2,102.56


In [29]:
# Number of distinct values per column.
# In the recorded output Sensor_Name and status each hold a single value, and TimeStamp has
# fewer distinct values than there are rows, i.e. some timestamps repeat.
df.nunique()

Sensor_Name           1
TimeStamp        159927
fatigue_mmps         17
impact_mg0           61
friction_mg0         17
crest                68
Temperature_C       286
Temperature_F       286
status                1
dtype: int64

In [30]:
# Show the first five rows once more before narrowing the column set.
df.head()

Unnamed: 0,Sensor_Name,TimeStamp,fatigue_mmps,impact_mg0,friction_mg0,crest,Temperature_C,Temperature_F,status
0,V9 VVB001 Fan Motor,01/11/2023 02:18:58,0.0,20.394324,10.197162,3.8,25.9,78.62,Device is OK
1,V9 VVB001 Fan Motor,01/11/2023 02:48:38,0.1,30.591486,10.197162,4.1,26.5,79.7,Device is OK
2,V9 VVB001 Fan Motor,01/11/2023 02:49:38,0.1,30.591486,10.197162,4.4,26.5,79.7,Device is OK
3,V9 VVB001 Fan Motor,01/11/2023 05:36:46,0.1,30.591486,10.197162,3.8,24.5,76.1,Device is OK
4,V9 VVB001 Fan Motor,01/11/2023 05:37:47,0.1,30.591486,10.197162,3.7,24.5,76.1,Device is OK


In [31]:
# Columns carried forward; Temperature_F is the only loaded column left out
# (presumably redundant with Temperature_C — confirm).
selected_columns = [
    'Sensor_Name',
    'TimeStamp',
    'fatigue_mmps',
    'impact_mg0',
    'friction_mg0',
    'crest',
    'Temperature_C',
    'status',
]
df = df[selected_columns]

In [32]:
# Parse the TimeStamp strings (day/month/year hour:minute:second) into datetimes.
# .assign() returns a new frame whose TimeStamp column is replaced by the parsed values.
# The earlier `df.loc[:, 'TimeStamp'] = ...` form writes into the existing column in place on
# pandas >= 2.0, which leaves the column with `object` dtype, and it assigns into a frame that
# was produced by column selection (the classic SettingWithCopyWarning situation, which the
# global warnings filter would hide).
df = df.assign(TimeStamp=pd.to_datetime(df['TimeStamp'], format='%d/%m/%Y %H:%M:%S'))

# Keep a second handle on the frame while TimeStamp is still a regular column;
# the following cell indexes and sorts it separately before saving.
combined_df1 = df

# Use the timestamp as the index and order rows chronologically.
# NOTE: this cell is not re-runnable on its own — once TimeStamp is the index,
# df['TimeStamp'] no longer exists.
df = df.set_index('TimeStamp').sort_index()
df.head()

Unnamed: 0_level_0,Sensor_Name,fatigue_mmps,impact_mg0,friction_mg0,crest,Temperature_C,status
TimeStamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-10-27 08:29:03,V9 VVB001 Fan Motor,0.0,20.394324,10.197162,3.7,22.5,Device is OK
2023-10-27 08:30:13,V9 VVB001 Fan Motor,0.0,30.591486,10.197162,4.3,22.5,Device is OK
2023-10-27 08:30:23,V9 VVB001 Fan Motor,0.0,20.394324,10.197162,4.0,22.5,Device is OK
2023-10-27 08:30:33,V9 VVB001 Fan Motor,0.0,30.591486,10.197162,4.7,22.5,Device is OK
2023-10-27 08:31:53,V9 VVB001 Fan Motor,0.0,20.394324,10.197162,3.9,22.5,Device is OK


In [33]:
# combined_df1 was bound while TimeStamp was still a column, so index it by
# timestamp and sort chronologically here as well, then display the result.
combined_df1 = combined_df1.set_index('TimeStamp').sort_index()
combined_df1

Unnamed: 0_level_0,Sensor_Name,fatigue_mmps,impact_mg0,friction_mg0,crest,Temperature_C,status
TimeStamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-10-27 08:29:03,V9 VVB001 Fan Motor,0.0,20.394324,10.197162,3.7,22.5,Device is OK
2023-10-27 08:30:13,V9 VVB001 Fan Motor,0.0,30.591486,10.197162,4.3,22.5,Device is OK
2023-10-27 08:30:23,V9 VVB001 Fan Motor,0.0,20.394324,10.197162,4.0,22.5,Device is OK
2023-10-27 08:30:33,V9 VVB001 Fan Motor,0.0,30.591486,10.197162,4.7,22.5,Device is OK
2023-10-27 08:31:53,V9 VVB001 Fan Motor,0.0,20.394324,10.197162,3.9,22.5,Device is OK
...,...,...,...,...,...,...,...
2023-12-12 11:05:13,V9 VVB001 Fan Motor,0.1,101.971621,30.591486,3.3,30.0,Device is OK
2023-12-12 11:05:15,V9 VVB001 Fan Motor,0.1,101.971621,30.591486,3.4,30.0,Device is OK
2023-12-12 11:05:25,V9 VVB001 Fan Motor,0.1,101.971621,30.591486,3.3,30.0,Device is OK
2023-12-12 11:05:36,V9 VVB001 Fan Motor,0.1,101.971621,30.591486,3.5,30.0,Device is OK


In [34]:
# Write the time-indexed frame to clean_df.csv, keeping the TimeStamp index as the first column.
# NOTE(review): machine-specific absolute path; consider a configurable data directory.
combined_df1.to_csv(
    path_or_buf=r"C:\Users\LENOVO\1. Projects\Bodycode-Anomaly-Detection-Project.\artifacts\data_ingestion\data\clean_df.csv",
    index=True,
)

In [35]:
# Build combined_df from the five numeric measurement columns only;
# Sensor_Name and status are left out. The TimeStamp index is carried over.
combined_df = df[[
    'fatigue_mmps',
    'impact_mg0',
    'friction_mg0',
    'crest',
    'Temperature_C',
]]
combined_df.shape

(160550, 5)

In [36]:
# Display the numeric feature frame (pandas truncates the rendering to the first and last rows).
combined_df

Unnamed: 0_level_0,fatigue_mmps,impact_mg0,friction_mg0,crest,Temperature_C
TimeStamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-10-27 08:29:03,0.0,20.394324,10.197162,3.7,22.5
2023-10-27 08:30:13,0.0,30.591486,10.197162,4.3,22.5
2023-10-27 08:30:23,0.0,20.394324,10.197162,4.0,22.5
2023-10-27 08:30:33,0.0,30.591486,10.197162,4.7,22.5
2023-10-27 08:31:53,0.0,20.394324,10.197162,3.9,22.5
...,...,...,...,...,...
2023-12-12 11:05:13,0.1,101.971621,30.591486,3.3,30.0
2023-12-12 11:05:15,0.1,101.971621,30.591486,3.4,30.0
2023-12-12 11:05:25,0.1,101.971621,30.591486,3.3,30.0
2023-12-12 11:05:36,0.1,101.971621,30.591486,3.5,30.0
