# Data Standardisation

Combining AED and data Wrangling, at the end the csv produced should be a standardised dataset ready for use.

In [None]:
# Import libraries
import sys
import os



sys.path.append('../')
os.getcwd()
from core import data_manager


In [170]:
# Variables
from os import path


path_water = "../data_storage/water.csv"
path_power = "../data_storage/power.csv"
path_gas = "../data_storage/gas.csv"


In [171]:
water_df = csv_to_dataframe(path_water, index_col=False)
gas_df = csv_to_dataframe(path_gas, index_col=False)
power_df = csv_to_dataframe(path_power, index_col=False)

In [174]:
water_df


Unnamed: 0,date,days,id_meter,water_m3,cons,av_day
39,2014-09-04,,2,114.418,0.000,0.000000
40,2014-09-12,8.0,2,,,0.324894
42,2014-11-28,77.0,2,142.034,27.616,0.324894
43,2014-12-16,18.0,2,148.274,6.240,0.346667
44,2015-01-07,22.0,2,,,0.309081
...,...,...,...,...,...,...
35,2014-05-31,31.0,6,,,0.253886
36,2014-06-28,28.0,6,314.000,98.000,0.253886
37,2014-07-31,33.0,6,,,0.218750
38,2014-08-31,31.0,6,,,0.218750


In [173]:
# Function to sort data based in the id_meter, and then in the date
def sort_data(df: pd.DataFrame):
    df = df.sort_values(by=['id_meter', 'date'])
    # if date empty fill with date_eom
    if 'date_eom' in df.columns:
        df['date'] = df['date'].fillna(df['date_eom'])
        df = df.drop(columns=['date_eom'])
    # convert date to datetime
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(by=['id_meter', 'date'])
    # day_dif column as the difference between the date and the previous date
    df['days'] = df['date'].diff().dt.days
    # # replace 'days' by 'days_dif' and drop 'days_dif' column
    # df['days'] = df['days'].fillna(df['day_dif'])
    # df = df.drop(columns=['day_dif'])
    # if av_day fill with next value
    df['av_day'] = df['av_day'].bfill()

    return df

water_df = sort_data(water_df)
gas_df = sort_data(gas_df)
power_df = sort_data(power_df)

In [186]:
# Function to resample column "cons" to a daily or montly frequency by meter_id and  interpolate the missing values (except for column av_day)
def resample_data(df: pd.DataFrame, freq: str) :
    df_temp = df.copy()
    # drop unnecessary columns ['av_day', 'days', 'water_m3', 'gas_m3', 'power_kwh', or 'cons']
    columns_drop = ['days', 'av_day', 'cons']
    columns_drop = [col for col in columns_drop if col in df_temp.columns]
    df_temp = df_temp.drop(columns=columns_drop)
    # drop av_day column
    if 'av_day' in df_temp.columns:
        df_temp = df_temp.drop(columns=['av_day'])
    df_temp['date'] = pd.to_datetime(df_temp['date'])
    df_temp = df_temp.set_index('date')
    df_temp = df_temp.groupby('id_meter').resample(freq).mean()
    df_temp = df_temp.interpolate(method='linear')
    df_temp = df_temp.drop(columns=['id_meter'])
    df_temp = df_temp.reset_index()
    # merge df_temp with the original df to keep the av_day column
    df_temp = pd.concat([df_temp, df[['av_day', 'cons']]], axis=1)
    return df_temp

interpolated_water_df = resample_data(water_df, 'D')

In [187]:
interpolated_water_df

Unnamed: 0,id_meter,date,water_m3,av_day,cons
0,2,2014-09-04,114.418000,0.219512,0.0
1,2,2014-09-05,114.742894,0.219512,9.0
2,2,2014-09-06,115.067788,0.267760,
3,2,2014-09-07,115.392682,0.267760,
4,2,2014-09-08,115.717576,0.267760,
...,...,...,...,...,...
4775,6,2014-09-28,334.125000,,
4776,6,2014-09-29,334.343750,,
4777,6,2014-09-30,334.562500,,
4778,6,2014-10-01,334.781250,,
