In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from geopy import distance
from csv import reader
from sklearn.metrics import r2_score
from datetime import datetime
from datetime import timedelta as tdelta
from datetime import time as time
from helpe_funcs import *


## I. Data import ##

### I.a) Import positions ###

In [None]:
# Read the sensor positions.  Every CSV row is expected to hold a coordinate
# pair written as "(<first> <second>)" in column 0 and the sensor ID in column 1.
csv_file_name = 'sensor_positions.csv'

# newline='' is what the csv module asks for when a file object is handed to
# csv.reader, so embedded line breaks are handled the same on every platform.
with open('./data/' + csv_file_name, 'r', newline='') as read_obj:
    csv_reader = reader(read_obj)
    # csv.reader yields [] for blank lines; skip them instead of failing on row[0].
    rows = [fields for fields in csv_reader if fields]

# Split each coordinate string once and strip the surrounding parentheses.
split_coords = [fields[0].split(' ') for fields in rows]
coords = [(float(parts[0].replace('(', '')), float(parts[1].replace(')', '')))
          for parts in split_coords]
IDs = [fields[1] for fields in rows]

# Keep the first position listed for every sensor ID and index the table by ID.
stations_df = (
    pd.DataFrame(data={'coords': coords, 'IDs': IDs})
    .drop_duplicates(subset='IDs', keep='first')
    .set_index('IDs')
)

# From here on IDs holds the unique sensor IDs, in table order.
IDs = list(stations_df.index)

# Clean up the temporaries; none of them is a loop variable, so this cannot
# raise NameError even when the CSV file is empty.
del csv_file_name, coords, csv_reader, read_obj, rows, split_coords

### I.b) Form groups based on distance between stations ###

In [None]:
# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second run.  Drop a stale 'grps' column first so the cell
# is re-runnable and make_groups always sees the same input table.
if 'grps' in stations_df.columns:
    stations_df = stations_df.drop(columns='grps')

# make_groups comes from the helper module's star import; only its use is
# visible here: it takes the ID list and the station table and returns an
# object that can be indexed with 'grp'.
# NOTE(review): presumably one group of nearby station IDs per station - confirm.
groups = make_groups(IDs, stations_df)
stations_df.insert(loc=1, column='grps', value=groups['grp'])

del groups

### I.c) Import sound pressure values ###

In [4]:
# Collect the names of all regular files in ./data whose name contains
# 'data.csv'.  os.listdir returns entries in arbitrary order, so the result is
# sorted to keep the sensor order stable between runs and machines.
dir_path = './data'
csv_file_names = sorted(
    entry
    for entry in os.listdir(dir_path)
    if 'data.csv' in entry and os.path.isfile(os.path.join(dir_path, entry))
)

# The generator variable does not leak, so there is no loop variable to delete
# (deleting it used to raise NameError when the directory was empty).
del dir_path

Read every CSV file into a list of DataFrames and record the sensor ID (taken from the file name) of each one, so the frames can be told apart later on.

In [5]:
# Load every measurement file into its own DataFrame.  list_df and sensor_names
# are filled in lock-step, so list_df[i] belongs to sensor_names[i].
list_df = []
sensor_names = []

filename = None  # pre-bound so the clean-up below also works without any files
for filename in csv_file_names:
    df = pd.read_csv('./data/' + filename, index_col=None, header=0)
    # Parse the whole column at once instead of calling strptime row by row;
    # this is much faster and also works for a file that has no data rows.
    df['Time'] = pd.to_datetime(df['Time'], format='%Y-%m-%d %H:%M:%S')
    # Characters 12..15 of the file name are used as the sensor ID.
    sensor_names.append(filename[12:16])
    list_df.append(df)

# Number of rows read per sensor file.
list_len = [len(frame) for frame in list_df]

del filename, csv_file_names

### I.d) Get rid of duplicates and fill in missing timestamps ###
-> data is transformed into uniformly sampled data (1-minute steps) with `np.nan` in place of missing values
-> start date is 2022.02.20 00:00:00, stop date is 2022.03.05 00:00:00 (both inclusive)
-> all data is organised into a DataFrame, columns=Sensor IDs, index=Timestamps

In [6]:
# Common 1-minute time grid shared by all sensors (both end points included).
start_time = datetime.strptime('2022.02.20 00:00:00', '%Y.%m.%d %H:%M:%S')
end_time = datetime.strptime('2022.03.05 00:00:00', '%Y.%m.%d %H:%M:%S')
tindex = pd.date_range(start_time, end_time, freq='1min')
data_label = 'dt_sound_level_dB'

# Wide table: one column per sensor, one row per grid timestamp.
df_data_incomplete = pd.DataFrame(index=tindex, columns=sensor_names)

# Per-sensor frames on the same grid, in the order of sensor_names.
list_df_incomplete = []

sensor_id = df = None  # pre-bound so the clean-up below works with no sensors
for sensor_id, df in zip(sensor_names, list_df):
    # Keep only rows inside the observation window and the first row for each
    # timestamp.  Every step returns a new frame: the previous version called
    # inplace methods on a filtered slice, which triggers
    # SettingWithCopyWarning and is not guaranteed to behave as intended.
    df = df[df['Time'].between(start_time, end_time)]
    df = df.drop_duplicates(subset='Time', keep='first')

    # Index by time, then align to the grid; timestamps without a measurement
    # become NaN rows.
    df = df.set_index(pd.to_datetime(df['Time'])).drop(columns=['Time'])
    df = df.reindex(tindex)

    list_df_incomplete.append(df)
    df_data_incomplete[sensor_id] = df[data_label]

del df, sensor_id

## II. Interpolation ##

### II.a) Simple interpolation ###
-> 'nearest'
-> 'linear'

In [None]:
# Fill the gaps of every per-sensor frame twice - nearest-neighbour and linear -
# and store the filled signal next to the raw one in the columns 'int_nearest'
# and 'int_linear'.  The fully interpolated frames are kept in list_df_nearest /
# list_df_linear (same order as list_df_incomplete).
list_df_other = []
list_df_linear = []
list_df_nearest = []

for df in list_df_incomplete:
    # .bfill() replaces interpolate(method='backfill'), which pandas deprecates;
    # it fills the gap before the first valid sample.
    # NOTE(review): a back-fill cannot fill a gap after the last valid sample -
    # confirm whether trailing NaNs can remain here and whether that matters.
    temp_df = df.interpolate(method='nearest').bfill()
    df['int_nearest'] = temp_df[data_label]
    list_df_nearest.append(temp_df)

for df in list_df_incomplete:
    # Linear interpolation does not fill values before the first valid sample,
    # so back-fill those afterwards.
    temp_df = df.interpolate(method='linear').bfill()
    # Round to one decimal.  The previous df.apply(np.round ...) computed the
    # rounded values but threw the result away, so nothing was ever rounded.
    df['int_linear'] = temp_df[data_label].round(1)
    list_df_linear.append(temp_df)

In [None]:
# Collect, for every sensor, the result of calculate_correlations between the
# sensor's own data and the data of the members of its group ("friends").
# calculate_correlations is not defined in this notebook (presumably it comes
# from the helper module's star import - TODO confirm what it returns).
corrs = []

# NOTE(review): the loop variable `id` shadows the Python builtin of that name.
for id in sensor_names:
    # 1) extract data from "friends": the 'grps' entry written by the grouping cell
    group = stations_df.loc[id]['grps']
    # NOTE(review): data_df is not defined in any cell of this notebook; unless
    # the helper module exports it, these two lines raise NameError.
    group_df = data_df.loc[group]['incomplete']
    single_df = data_df.loc[id]['incomplete']
    # NOTE(review): `len-2` subtracts 2 from the builtin function `len`, which
    # raises TypeError; presumably `len(group)-2` (or simply `group[1:]`) was
    # meant - confirm which group members should be skipped.
    for fid in group[1:len-2]:
        corrs.append(calculate_correlations(single_df, group_df[fid]))
        

### II.b) Attempts on advanced interpolation ###

#### II.b.1) Raw data separation and filtering ####
-> data is separated into 2 groups: workdays (`df_work_...`) and weekend days (`df_end_...`)
-> each group is then averaged per time of day to create an average workday and an average weekend day

In [68]:
# Dummy calendar day: only its time-of-day part is used, it labels the rows of
# the two "average day" tables below (one row per minute).
day_str = datetime.strptime('2023.05.01 00:00:00', '%Y.%m.%d %H:%M:%S')
day_stp = datetime.strptime('2023.05.01 23:59:59', '%Y.%m.%d %H:%M:%S')
daindex = pd.date_range(day_str, day_stp, freq='1min')

# day_of_week is 0-4 for Monday-Friday and 5-6 for Saturday/Sunday.
df_work_all = df_data_incomplete.loc[df_data_incomplete.index.day_of_week < 5].copy()
df_end_all = df_data_incomplete.loc[df_data_incomplete.index.day_of_week > 4].copy()


def _average_day(df_all, day_index):
    """Average ``df_all`` per time of day.

    Parameters
    ----------
    df_all : DataFrame with a DatetimeIndex (one column per sensor).
    day_index : DatetimeIndex covering one day; its time-of-day values select
        and label the rows of the returned profile.

    Returns
    -------
    (profile, broadcast) : ``profile`` has one row per entry of ``day_index``
        holding the mean (NaNs ignored) of all rows of ``df_all`` with that
        time of day; ``broadcast`` has the index of ``df_all`` with every row
        replaced by the mean of its time of day.
    """
    by_time = df_all.groupby(df_all.index.time)
    profile = by_time.mean().reindex(day_index.time)
    profile.index = day_index
    return profile, by_time.transform('mean')


# One grouped aggregation per table replaces the former loop over all 1440
# minutes, which wrote the means row by row into object-dtype frames.
df_work_mean, df_mean_ww = _average_day(df_work_all, daindex)
df_end_mean, df_mean_we = _average_day(df_end_all, daindex)

# Average-day signal on the full measurement timeline (weekend + workdays).
df_mean = pd.concat([df_mean_we, df_mean_ww], axis=0).sort_index()

del day_stp, day_str


#### II.b.2) Determine resampling period ####

Determine resampling period based on rmse=f(Tresample)

1) for mean interpolation
2) for linear interpolation

The data is downsampled to sampling periods of [1, 2, 5, 10, 15, 20, 30, 60] minutes and the RMSE is calculated to provide an estimate of the information loss.

1) mean interpolation - simple resampling

In [None]:
# Candidate resampling periods in minutes.
resample_pers = [1, 2, 5, 10, 15, 20, 30, 60]

list_rmses = []

# One table per part of the day: rows = sensors, columns = resampling period.
resample_vars_d = pd.DataFrame(data=np.zeros([len(sensor_names), len(resample_pers)]),
                               columns=resample_pers, index=sensor_names)
resample_vars_e = resample_vars_d.copy()
resample_vars_n = resample_vars_d.copy()

# indexer_day / indexer_evening / indexer_night come from the helper module's
# star import; presumably they return the positional row indices belonging to
# the respective part of the day - TODO confirm.
df_resvar_d = df_data_incomplete.iloc[indexer_day(df_data_incomplete)]
df_resvar_e = df_data_incomplete.iloc[indexer_evening(df_data_incomplete)]
df_resvar_n = df_data_incomplete.iloc[indexer_night(df_data_incomplete)]

for per in resample_pers:
    # 'min' is the minute alias; the older 'T' spelling is deprecated in pandas.
    rule = str(per) + 'min'
    # pandas' Resampler has no rmse() method.  When every sample of a bin is
    # replaced by the bin mean, the root-mean-square error inside the bin is
    # the population standard deviation of the bin, i.e. std(ddof=0).
    # NOTE(review): this assumes the error is measured against the bin mean, as
    # the section text ("mean interpolation - simple resampling") suggests.
    resample_vars_d[per] = df_resvar_d.resample(rule).std(ddof=0).mean()
    resample_vars_e[per] = df_resvar_e.resample(rule).std(ddof=0).mean()
    resample_vars_n[per] = df_resvar_n.resample(rule).std(ddof=0).mean()

# Resampling to the original 1-minute period loses nothing: force exact zeros.
resample_vars_d[1] = np.zeros(len(sensor_names))
resample_vars_n[1] = np.zeros(len(sensor_names))
resample_vars_e[1] = np.zeros(len(sensor_names))

# Per-period error averaged over all sensors, in the order day, evening, night.
list_rmses.append([resample_vars_d.mean(),
                   resample_vars_e.mean(),
                   resample_vars_n.mean()])

# init_ax_resamp is a helper from the star import; only its call is visible here.
fig, ax = plt.subplots()
ax = init_ax_resamp(ax, resample_pers, )
plt.show()

# resample_pers and resample_vars_d/e/n are deliberately no longer deleted
# here: the following cell still reads them and failed with NameError on a
# fresh "Run All".
    

1) mean interpolation - average days

In [None]:
# Work-in-progress cell: split the measured data (df_data_incomplete) and the
# "average day" data (df_mean) into day / evening / night rows and resample the
# latter.  indexer_day / indexer_evening / indexer_night come from the helper
# module's star import; presumably they return positional row indices for the
# respective part of the day - TODO confirm.
# NOTE(review): df_orig_* are created here but not used again in this cell.
df_orig_d = df_data_incomplete.iloc[indexer_day(df_data_incomplete)]
df_orig_e = df_data_incomplete.iloc[indexer_evening(df_data_incomplete)]
df_orig_n = df_data_incomplete.iloc[indexer_night(df_data_incomplete)]

df_mean_d = df_mean.iloc[indexer_day(df_mean)]
df_mean_e = df_mean.iloc[indexer_evening(df_mean)]
df_mean_n = df_mean.iloc[indexer_night(df_mean)]

# NOTE(review): these three empty frames are never filled or read in this cell.
df_resamp_d = pd.DataFrame()
df_resamp_e = pd.DataFrame()
df_resamp_n = pd.DataFrame()

# resample_pers and resample_vars_d/e/n are not defined in this cell; they have
# to exist already from the resampling cell above.
for per in resample_pers:
    # NOTE(review): the right-hand side is a whole resampled DataFrame (one
    # column per sensor) but it is assigned to the single column key `per`;
    # pandas is expected to reject that - confirm what should be stored here
    # (df_resamp_* look like the intended targets).
    df_mean_d[per] = df_mean_d.resample(str(per)+'T').mean()
    df_mean_e[per] = df_mean_e.resample(str(per)+'T').mean()
    df_mean_n[per] = df_mean_n.resample(str(per)+'T').mean()
 
# Column 1 (the 1-minute period) of each table is set to zeros.
resample_vars_d[1]=np.zeros(len(sensor_names)) 
resample_vars_n[1]=np.zeros(len(sensor_names)) 
resample_vars_e[1]=np.zeros(len(sensor_names))  

# NOTE(review): nothing computed in this cell is written into resample_vars_*
# apart from the zeroed column above, so this appends the values of the
# previous cell again rather than a result for the "average days" variant.
list_rmses.append([resample_vars_d.mean(),
                   resample_vars_e.mean(),
                   resample_vars_n.mean()])


# TESTING #

In [64]:
# Scratch experiment: this call raises the MergeError shown in the output below,
# because how='cross' cannot be combined with left_index / right_index.
# NOTE(review): the intended result is unclear - df_mean (built with pd.concat
# in the average-day cell) already combines df_mean_we and df_mean_ww; confirm
# whether this cell is still needed.
test = df_mean_ww.merge(df_mean_we, how='cross', left_index=True, right_index=True)


MergeError: Can not pass on, right_on, left_on or set right_index=True or left_index=True

In [None]:
# Sampling experiment: for each ensemble draw random subsets of increasing size
# and record mean / median / standard deviation of the 'mean' column.
# NOTE(review): ensemble1, ensemble2, sample_sizes, means, medians, stds,
# stat_data and stats are not defined in any cell of this notebook, so this
# cell depends on hidden kernel state.  Presumably means/medians/stds start as
# empty lists, stat_data == [means, medians, stds] and stats holds the three
# subplot titles - confirm and define them at the top of this cell.
for df in [ensemble1, ensemble2]:
    mean_list = []
    median_list = []
    std_list = []

    for size in sample_sizes:
        # size is used as a fraction of the ensemble length.
        sample_size = int(size * len(df))
        # Sampling without replacement; no random_state is set, so the drawn
        # rows differ from run to run.
        sample = df.sample(n=sample_size, replace=False)
        mean_list.append(np.mean(sample['mean']))
        median_list.append(np.median(sample['mean']))
        std_list.append(np.std(sample['mean']))

    # One list per ensemble, appended in the order ensemble1, ensemble2.
    means.append(mean_list)
    medians.append(median_list)
    stds.append(std_list)

fig, ax = plt.subplots(3, figsize=(10, 10))

for axis in range(3):
    for i, name in enumerate(['ensemble1', 'ensemble2']):
        ax[axis].plot(sample_sizes, stat_data[axis][i], label=name)

    ax[axis].set_title(stats[axis])
    ax[axis].set_xlabel('sample percentage')
    # plt.legend() only decorates the current (last) axes, which left the first
    # two subplots without a legend; add one to every subplot instead.
    ax[axis].legend()

fig.tight_layout()
plt.show()