# Data Standardisation

This notebook combines exploratory data analysis (EDA) and data wrangling; the CSV files produced at the end should be standardised datasets ready for use.

In [83]:
# Import libraries
import sys
import os
import pandas as pd



# Make the parent directory importable so the `core` package can be found.
# Guarded so that re-running this cell does not append duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')
from core.data_manager import csv_to_dataframe


In [84]:
# Variables
from os import path


# Locations of the raw meter-reading CSV files, relative to this notebook.
path_water, path_power, path_gas = (
    "../data_storage/water.csv",
    "../data_storage/power.csv",
    "../data_storage/gas.csv",
)


In [85]:
# Load each utility's readings into its own DataFrame.
# NOTE(review): csv_to_dataframe is a project helper (core.data_manager);
# index_col=False is presumably forwarded to pandas.read_csv — confirm against the helper.
water_df = csv_to_dataframe(path_water, index_col=False)
gas_df = csv_to_dataframe(path_gas, index_col=False)
power_df = csv_to_dataframe(path_power, index_col=False)

In [86]:
# Preview the raw water readings as loaded.
water_df


Unnamed: 0,date_eom,date,days,id_meter,water_m3,cons,av_day
0,2011-06-30,2011-06-01,0.0,6,1.000,0.000,
1,2011-07-31,2011-07-12,41.0,6,10.000,9.000,0.219512
2,2011-08-31,,,6,,,
3,2011-09-30,,,6,,,
4,2011-10-31,,,6,,,
...,...,...,...,...,...,...,...
184,2024-02-29,2024-02-03,33.0,3,389.949,9.039,0.273909
185,2024-03-31,2024-03-02,28.0,3,397.351,7.402,0.264357
186,2024-04-30,2024-04-07,36.0,3,407.061,9.710,0.269722
187,2024-05-31,2024-05-01,24.0,3,413.556,6.495,0.270625


In [87]:
# Function to sort the data by id_meter and then by date
def sort_data(df: pd.DataFrame) -> pd.DataFrame:
    """Order meter readings by meter and date and rebuild the derived columns.

    Steps performed on a copy of ``df`` (the caller's frame is not modified):

    1. If a ``date_eom`` column exists, missing ``date`` values are filled
       from it and ``date_eom`` is dropped.
    2. ``date`` is converted to datetime and the rows are sorted by
       ``id_meter`` and then ``date``.
    3. ``days`` is recomputed as the whole number of days since the previous
       reading *of the same meter* (NaN for each meter's first reading).
    4. Missing ``av_day`` values are back-filled from the next reading *of the
       same meter*; a trailing gap with no later reading stays NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Readings with at least ``id_meter``, ``date`` and ``av_day`` columns,
        and optionally ``date_eom``.

    Returns
    -------
    pd.DataFrame
        The sorted frame with ``days`` and ``av_day`` recomputed. The original
        index labels are kept (the index is not reset).
    """
    df = df.copy()

    # If date is empty, fall back to the end-of-month date.
    if 'date_eom' in df.columns:
        df['date'] = df['date'].fillna(df['date_eom'])
        df = df.drop(columns=['date_eom'])

    # Sort only after the gaps are filled and the column is a real datetime,
    # so the order reflects the final dates.
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(by=['id_meter', 'date'])

    # Compute per meter: a plain .diff()/.bfill() over the whole frame would
    # leak values across the boundary between two consecutive meters.
    df['days'] = df.groupby('id_meter')['date'].diff().dt.days
    df['av_day'] = df.groupby('id_meter')['av_day'].bfill()

    return df

# Normalise every utility's readings with the same sorting/cleaning routine.
water_df = sort_data(df=water_df)
gas_df = sort_data(df=gas_df)
power_df = sort_data(df=power_df)

In [91]:
# Function that resamples the data independently for each meter, dropping the columns ['days', 'av_day', 'cons'], and interpolates the gaps

def resample_data(df: pd.DataFrame, freq: str, name: str,
                  output_dir: "str | None" = '../data_storage') -> pd.DataFrame:
    """Resample each meter's readings to a regular frequency and interpolate gaps.

    For every distinct ``id_meter`` the derived columns (``days``, ``av_day``,
    ``cons``, when present) are dropped, the remaining measurement columns are
    resampled on ``date`` with the mean of each bin, and empty bins are filled
    by time-weighted interpolation. A ``calc_cons`` column is then added as
    the difference between consecutive values of the first measurement column.

    Parameters
    ----------
    df : pd.DataFrame
        Readings with ``date`` and ``id_meter`` columns plus at least one
        numeric measurement column.
    freq : str
        Pandas offset alias passed to ``resample`` (e.g. ``'ME'``).
    name : str
        Label used in the output file name.
    output_dir : str or None, default '../data_storage'
        Directory where ``resampled_data_<freq>_<name>.csv`` is written.
        Pass ``None`` to skip writing the file.

    Returns
    -------
    pd.DataFrame
        Columns ``date``, ``id_meter``, the measurement column(s) and
        ``calc_cons``, sorted by ``id_meter`` then ``date``. Each meter keeps
        its own 0..n-1 index, so index labels repeat across meters.

    Raises
    ------
    ValueError
        If ``df`` has no rows (``pd.concat`` receives nothing to concatenate).
    """
    derived_cols = [col for col in ('days', 'av_day', 'cons') if col in df.columns]

    resampled_frames = []
    for meter_id, meter_df in df.groupby('id_meter'):
        # id_meter is an identifier, not a measurement: keep it out of the
        # mean/interpolation (which would turn it into a float) and re-attach
        # the group key afterwards.
        readings = (
            meter_df
            .drop(columns=derived_cols + ['id_meter'])
            .assign(date=lambda frame: pd.to_datetime(frame['date']))
            .set_index('date')
        )
        resampled = (
            readings
            .resample(freq)
            .mean()
            .interpolate(method='time')
            .reset_index()
        )
        # Placing id_meter at position 1 guarantees that position 2 is the
        # first measurement column, whatever its name (water_m3, gas_m3, ...).
        resampled.insert(1, 'id_meter', meter_id)
        resampled['calc_cons'] = resampled.iloc[:, 2].diff()
        print("Dataframe shape: ", resampled.shape)
        resampled_frames.append(resampled)

    result = pd.concat(resampled_frames).sort_values(by=['id_meter', 'date'])

    if output_dir is not None:
        result.to_csv(f'{output_dir}/resampled_data_{freq}_{name}.csv', index=False)
    return result

# Resample every utility to month-end ('ME') frequency; each call also writes its CSV.
interpolated_water_df = resample_data(df=water_df, freq='ME', name='water')
interpolated_gas_df = resample_data(df=gas_df, freq='ME', name='gas')
interpolated_power_df = resample_data(df=power_df, freq='ME', name='power')


Dataframe shape:  (66, 4)
Dataframe shape:  (53, 4)
Dataframe shape:  (41, 4)
Dataframe shape:  (118, 4)
Dataframe shape:  (41, 4)
Dataframe shape:  (79, 4)
Dataframe shape:  (40, 4)
Dataframe shape:  (41, 4)


In [81]:
# Show the resampled water readings; a bare last expression uses the
# notebook's rich table display instead of plain printed text.
interpolated_water_df

         date  id_meter    water_m3  calc_cons
0  2014-09-04         2  114.418000        NaN
1  2014-09-12         2  117.017153   2.599153
2  2014-11-28         2  142.034000  25.016847
3  2014-12-16         2  148.274000   6.240000
4  2015-01-07         2  155.073787   6.799787
..        ...       ...         ...        ...
35 2014-05-31         6  306.891192   7.870466
36 2014-06-28         6  314.000000   7.108808
37 2014-07-31         6  321.218750   7.218750
38 2014-08-31         6  328.000000   6.781250
39 2014-10-02         6  335.000000   7.000000

[189 rows x 4 columns]
