In [1]:
# Libraries

from pathlib import Path

import pandas as pd

## Loading Dataset

In [2]:
# Location of the raw, semicolon-delimited measurements file.
dataPath = "../data/raw/household_power_consumption.txt"

# Both the literal 'nan' and the '?' placeholder are read as missing values.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
dataset = pd.read_csv(
    dataPath,
    sep=';',
    na_values=['nan', '?'],
    low_memory=False,
)

# Preview the first five rows (head() defaults to n=5).
dataset.head()

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [3]:
# Print a structural summary of the frame: index range, per-column dtypes and
# approximate memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075259 entries, 0 to 2075258
Data columns (total 9 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Date                   object 
 1   Time                   object 
 2   Global_active_power    float64
 3   Global_reactive_power  float64
 4   Voltage                float64
 5   Global_intensity       float64
 6   Sub_metering_1         float64
 7   Sub_metering_2         float64
 8   Sub_metering_3         float64
dtypes: float64(7), object(2)
memory usage: 142.5+ MB


### Convert to Datetime format

In [4]:
# Build a time-indexed frame without mutating `dataset` in place:
#   1. Join the 'Date' and 'Time' text columns and parse them as one timestamp.
#      errors='coerce' turns any string that does not match the format into
#      NaT instead of raising.
#   2. Drop rows whose timestamp could not be parsed. Otherwise they would be
#      given a NaT index label, which a later column-wise dropna() cannot remove.
#   3. Set 'Datetime' as the index.
#   4. Drop the original 'Date' and 'Time' columns, which are now redundant.
dataset = (
    dataset
    .assign(
        Datetime=lambda frame: pd.to_datetime(
            frame['Date'] + ' ' + frame['Time'],
            format='%d/%m/%Y %H:%M:%S',
            errors='coerce',
        )
    )
    .dropna(subset=['Datetime'])
    .set_index('Datetime')
    .drop(columns=['Date', 'Time'])
)

In [5]:
# Preview the first five rows to check that the frame is now indexed by 'Datetime'.
dataset.head(5)

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


#### Null Values

In [6]:
# Report the number of missing values in each column.
# (isna() is an alias of isnull(); sep="\n" puts the counts under the heading.)
print("\nNull Values:", dataset.isna().sum(), sep="\n")


Null Values:
Global_active_power      25979
Global_reactive_power    25979
Voltage                  25979
Global_intensity         25979
Sub_metering_1           25979
Sub_metering_2           25979
Sub_metering_3           25979
dtype: int64


In [7]:
# Discard every row that has at least one missing value.
# The defaults are spelled out for clarity; `dataset` is modified in place.
dataset.dropna(axis='index', how='any', inplace=True)

In [8]:
# Re-count missing values per column after the dropna() step above.
# (isna() is an alias of isnull(); sep="\n" puts the counts under the heading.)
print("\nNull Values:", dataset.isna().sum(), sep="\n")



Null Values:
Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
Sub_metering_2           0
Sub_metering_3           0
dtype: int64


### Calculation of Unmeasured Active Energy Consumption

The active energy consumed by other unmeasured electrical equipment is calculated using the following formula:

```
(global_active_power * 1000 / 60) - sub_metering_1 - sub_metering_2 - sub_metering_3
```

Where:
- `global_active_power` is in kilowatts (kW).
- Multiplying by 1000 converts kW to watts (W).
- Dividing by 60 converts the power drawn over one minute into the energy consumed during that minute, in watt-hours (Wh).
- `sub_metering_1`, `sub_metering_2`, and `sub_metering_3` represent the energy consumption, in watt-hours (Wh), of three measured subcircuits.


In [10]:
# Energy not captured by any sub-meter: scale Global_active_power by 1000 / 60,
# then subtract the three sub-metering readings one after another.
# The operation order matches the plain arithmetic expression, so the
# floating-point results are the same.
dataset['unmeasured'] = (
    dataset['Global_active_power']
    .mul(1000)
    .div(60)
    .sub(dataset['Sub_metering_1'])
    .sub(dataset['Sub_metering_2'])
    .sub(dataset['Sub_metering_3'])
)

# Preview the first five rows, including the new column (head() defaults to n=5).
dataset.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,unmeasured
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0,52.266667
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0,72.333333
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0,70.566667
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0,71.8
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0,43.1


### Saving processed dataset

In [11]:
# Destination of the processed dataset; the index is written as the first CSV column.
savePath = "../data/processed/pro_household_power_consumption.csv"

# to_csv() does not create missing folders and raises if the target directory
# is absent, so make sure it exists before writing (e.g. on a fresh checkout).
Path(savePath).parent.mkdir(parents=True, exist_ok=True)
dataset.to_csv(savePath)