## Predicting Manufacturing Efficiency Using Sensor and Network Data

### Author: Eric Meyer

### Capstone Project – MS in Data Analytics, Northwest Missouri State University

In [1]:
import io

import pandas as pd

### Load the data and review its summary and descriptive statistics

In [6]:
# Load the dataset
file_path = "manufacturing_6G_dataset.csv"
df = pd.read_csv(file_path)

# Display basic info.
# DataFrame.info() writes its report to a buffer (stdout by default) and
# returns None, so assigning its return value always yields None.
# Capture the report in a StringIO buffer so df_info holds the actual text,
# then print it so the cell still shows the same report.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
df_info = info_buffer.getvalue()
print(df_info, end="")

# Get shape and first few rows
df_shape = df.shape
df_head = df.head()

# Get column names and data types
df_columns = df.dtypes

# Get summary statistics; include='all' reports on every column,
# not only the numeric ones
df_summary = df.describe(include='all')

# The trailing expression is what the cell displays as its output
df_shape, df_head, df_columns, df_summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 13 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Timestamp                      100000 non-null  object 
 1   Machine_ID                     100000 non-null  int64  
 2   Operation_Mode                 100000 non-null  object 
 3   Temperature_C                  100000 non-null  float64
 4   Vibration_Hz                   100000 non-null  float64
 5   Power_Consumption_kW           100000 non-null  float64
 6   Network_Latency_ms             100000 non-null  float64
 7   Packet_Loss_%                  100000 non-null  float64
 8   Quality_Control_Defect_Rate_%  100000 non-null  float64
 9   Production_Speed_units_per_hr  100000 non-null  float64
 10  Predictive_Maintenance_Score   100000 non-null  float64
 11  Error_Rate_%                   100000 non-null  float64
 12  Efficiency_Status              

((100000, 13),
              Timestamp  Machine_ID Operation_Mode  Temperature_C  \
 0  2024-01-01 00:00:00          39           Idle      74.137590   
 1  2024-01-01 00:01:00          29         Active      84.264558   
 2  2024-01-01 00:02:00          15         Active      44.280102   
 3  2024-01-01 00:03:00          43         Active      40.568502   
 4  2024-01-01 00:04:00           8           Idle      75.063817   
 
    Vibration_Hz  Power_Consumption_kW  Network_Latency_ms  Packet_Loss_%  \
 0      3.500595              8.612162           10.650542       0.207764   
 1      3.355928              2.268559           29.111810       2.228464   
 2      2.079766              6.144105           18.357292       1.639416   
 3      0.298238              4.067825           29.153629       1.161021   
 4      0.345810              6.225737           34.029191       4.796520   
 
    Quality_Control_Defect_Rate_%  Production_Speed_units_per_hr  \
 0                       7.751261    

### Check how clean the data is

In [7]:
# Data-quality report for `df`: dimensions, missing values, duplicates,
# dtypes, descriptive statistics and per-column cardinality.

# Dimensions as a (rows, columns) tuple
frame_shape = df.shape
print("Rows and columns:", frame_shape)

# Boolean mask of missing cells, shared by the count and percentage reports
null_mask = df.isnull()

# Missing values: absolute count per column
print("\nMissing values per column:", null_mask.sum(), sep="\n")

# Missing values: share of rows per column, in percent
missing_percent = null_mask.mean() * 100
print("\n% Missing values per column:", missing_percent, sep="\n")

# Fully duplicated rows
duplicate_count = df.duplicated().sum()
print("\nNumber of duplicate rows:", duplicate_count)

# Column dtypes
print("\nData types:", df.dtypes, sep="\n")

# Descriptive statistics (describe() defaults to the numeric columns)
print("\nDescriptive statistics:", df.describe(), sep="\n")

# Distinct-value count per column (helps spot inconsistent categorical labels)
print("\nUnique values per column:")
for col, unique_count in df.nunique().items():
    print(f"{col}: {unique_count} unique values")

Rows and columns: (100000, 13)

Missing values per column:
Timestamp                        0
Machine_ID                       0
Operation_Mode                   0
Temperature_C                    0
Vibration_Hz                     0
Power_Consumption_kW             0
Network_Latency_ms               0
Packet_Loss_%                    0
Quality_Control_Defect_Rate_%    0
Production_Speed_units_per_hr    0
Predictive_Maintenance_Score     0
Error_Rate_%                     0
Efficiency_Status                0
dtype: int64

% Missing values per column:
Timestamp                        0.0
Machine_ID                       0.0
Operation_Mode                   0.0
Temperature_C                    0.0
Vibration_Hz                     0.0
Power_Consumption_kW             0.0
Network_Latency_ms               0.0
Packet_Loss_%                    0.0
Quality_Control_Defect_Rate_%    0.0
Production_Speed_units_per_hr    0.0
Predictive_Maintenance_Score     0.0
Error_Rate_%                     0.0