In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the raw CSV into a DataFrame; the path is relative to the working directory.
file_path = 'data/data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the top of the frame (5 rows) as this cell's output.
df.head(n=5)

Unnamed: 0,timestamp,equipment_energy_consumption,lighting_energy,zone1_temperature,zone1_humidity,zone2_temperature,zone2_humidity,zone3_temperature,zone3_humidity,zone4_temperature,...,zone9_temperature,zone9_humidity,outdoor_temperature,atmospheric_pressure,outdoor_humidity,wind_speed,visibility_index,dew_point,random_variable1,random_variable2
0,2016-01-11 17:00:00,60.0,-77.78778596503064,33.74660933896648,47.59666666666671,19.2,44.79,19.79,,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60.0,30.0,19.89,46.69333333333329,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50.0,30.0,19.89,46.3,19.2,44.626667,19.79,44.933333,35.921144,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50.0,40.0,33.74660933896648,46.0666666666667,19.2,44.59,19.79,45.0,,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,37.673716,45.410389,45.410389
4,2016-01-11 17:40:00,60.0,40.0,19.89,46.33333333333329,19.2,44.53,19.79,45.0,18.89,...,4.476511,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


# Exploratory data analysis (EDA)

In [4]:
# Report the (rows, columns) dimensions of the loaded frame.
n_rows_cols = df.shape
print("Dataset Shape:", n_rows_cols)

# Per-column dtype and non-null count summary (info() prints it itself).
df.info()

# describe() output, transposed so that each described column becomes a row.
summary_stats = df.describe()
summary_stats.T

Dataset Shape: (16857, 29)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16857 entries, 0 to 16856
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   timestamp                     16857 non-null  object 
 1   equipment_energy_consumption  16013 non-null  object 
 2   lighting_energy               16048 non-null  object 
 3   zone1_temperature             15990 non-null  object 
 4   zone1_humidity                16056 non-null  object 
 5   zone2_temperature             16004 non-null  object 
 6   zone2_humidity                15990 non-null  float64
 7   zone3_temperature             16055 non-null  float64
 8   zone3_humidity                15979 non-null  float64
 9   zone4_temperature             16041 non-null  float64
 10  zone4_humidity                16076 non-null  float64
 11  zone5_temperature             16019 non-null  float64
 12  zone5_humidity                160

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
zone2_humidity,15990.0,39.494553,10.129513,-77.265503,37.7575,40.293333,43.0,77.265503
zone3_temperature,16055.0,21.665733,2.594309,6.543921,20.533333,21.7675,22.76,36.823982
zone3_humidity,15979.0,38.201314,10.144388,-71.406273,36.5925,38.4,41.433333,71.406273
zone4_temperature,16041.0,20.239922,2.78305,4.613485,19.266667,20.29,21.356667,35.921144
zone4_humidity,16076.0,37.945608,10.769813,-81.446225,35.2,38.09,41.560833,81.446225
zone5_temperature,16019.0,19.052613,2.346158,5.921094,18.061111,19.05,20.1,32.157594
zone5_humidity,16056.0,50.289131,18.722516,-141.640143,45.29,48.854429,53.918333,141.640143
zone6_temperature,16009.0,6.469934,8.867993,-42.987365,2.93,6.263333,9.69,55.932271
zone6_humidity,16010.0,59.162913,52.657787,-353.393026,37.066667,62.766667,86.59,353.393026
zone7_temperature,16063.0,19.672472,2.879815,3.578021,18.5,19.6,21.0,35.712952


In [8]:
# Coerce every object-dtype column other than 'timestamp' to a numeric dtype.
# errors='coerce' turns any value that cannot be parsed as a number into NaN.
object_cols = [
    name for name in df.columns
    if df[name].dtype == 'object' and name != 'timestamp'
]
for name in object_cols:
    df[name] = pd.to_numeric(df[name], errors='coerce')

In [9]:
# Flag rows that exactly repeat an earlier row, then count the flagged rows.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

np.int64(68)

In [10]:
# Keep the first occurrence of each fully duplicated row and discard the repeats.
df = df.drop_duplicates(keep='first')
deduped_shape = df.shape
print("After removing duplicates:", deduped_shape)


After removing duplicates: (16789, 29)


In [11]:
# Per-column tally of missing (NaN) entries.
missing_per_column = df.isna().sum()
missing_per_column

timestamp                         0
equipment_energy_consumption    909
lighting_energy                 861
zone1_temperature               944
zone1_humidity                  886
zone2_temperature               918
zone2_humidity                  863
zone3_temperature               798
zone3_humidity                  877
zone4_temperature               811
zone4_humidity                  775
zone5_temperature               832
zone5_humidity                  797
zone6_temperature               842
zone6_humidity                  841
zone7_temperature               787
zone7_humidity                  800
zone8_temperature               843
zone8_humidity                  776
zone9_temperature               771
zone9_humidity                  886
outdoor_temperature             802
atmospheric_pressure            841
outdoor_humidity                794
wind_speed                      822
visibility_index                809
dew_point                       823
random_variable1            

In [12]:
# Impute missing values in the numeric columns with each column's own mean.
# mean(numeric_only=True) yields one mean per numeric column; fillna matches
# them to columns by label, so non-numeric columns such as 'timestamp' are
# left untouched.
# The filled frame is rebound to `df` instead of using inplace=True, which
# pandas discourages; the cell's input object is therefore not mutated.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

In [13]:
# Sanity check: total number of missing cells across the whole frame (expected: 0).
remaining_missing = df.isna().sum().sum()
print(remaining_missing)

0


In [14]:
# Preview the first five rows of the frame after cleaning.
df.head(n=5)

Unnamed: 0,timestamp,equipment_energy_consumption,lighting_energy,zone1_temperature,zone1_humidity,zone2_temperature,zone2_humidity,zone3_temperature,zone3_humidity,zone4_temperature,...,zone9_temperature,zone9_humidity,outdoor_temperature,atmospheric_pressure,outdoor_humidity,wind_speed,visibility_index,dew_point,random_variable1,random_variable2
0,2016-01-11 17:00:00,60.0,-77.787786,33.746609,47.596667,19.2,44.79,19.79,38.197731,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60.0,30.0,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50.0,30.0,19.89,46.3,19.2,44.626667,19.79,44.933333,35.921144,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50.0,40.0,33.746609,46.066667,19.2,44.59,19.79,45.0,20.240384,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,37.673716,45.410389,45.410389
4,2016-01-11 17:40:00,60.0,40.0,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,4.476511,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097
