Create a SVD low-rank model of the noise data, for each of the three time periods. The model should be based on the whole time period, and the results reported for each time period. Compare this with a "global model" which is based on the ensemble average of all stations over each of the three time periods. In additional, compare the global model to ensemble averages and SVD low-rank models for each of the four regions. In total, you will have five resulting models: one global model with all regions + four regional models using station data from each region.

Time periods:
Day: 07:00-19:00
Evening: 19:00-23:00
Night: 23:00-07:00

In [8]:
import pandas as pd
import os
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [9]:
dir_path = './data/'
folders = ['region_1_mustamäe_kristiine', 'region_2_data_kesklinn', 'region_3_kadriorg_lasnamäe', 'region_4_ülemiste']

list_df = []
sensor_names = []
region_list = []

for folder in folders:
    for file in os.listdir(dir_path + folder):
        sensor_name = file.split('-')[0]
        region_list.append(int(folder.split('_')[1]))
        sensor_names.append(file.split('-')[0])
        df = pd.read_csv(dir_path + folder + "/" + file, index_col=None, header=0)
        df['Time'] = df.apply(lambda row: datetime.strptime(row['Time'], '%Y-%m-%d %H:%M:%S'), axis=1)
        df.rename(columns={'dt_sound_level_dB': sensor_name}, inplace=True)
        list_df.append(df)

In [10]:
start_time = datetime.strptime('2022.08.01 00:00:00', '%Y.%m.%d %H:%M:%S')
end_time = datetime.strptime('2022.08.13 23:59:00', '%Y.%m.%d %H:%M:%S')

df_counter = 0

for df in list_df:
    df = df[df.Time >= start_time]
    df = df[df.Time <= end_time]

    start_row = pd.DataFrame({'Time': start_time, 'dt_sound_level_dB': df.iloc[0][sensor_names[df_counter]]}, index=[0])
    start_row['Time'] = pd.to_datetime(start_row['Time'])

    end_row = pd.DataFrame({'Time': end_time, sensor_names[df_counter]: df.iloc[-1][sensor_names[df_counter]]}, index=[0])
    end_row['Time'] = pd.to_datetime(end_row['Time'])

    df['Time'] = pd.to_datetime(df['Time'])
    df = df.set_index('Time')

    if not (df.index == end_time).any():
        df.loc[end_time] = [df.iloc[-1][sensor_names[df_counter]]]

    if not (df.index == start_time).any():
        df.loc[start_time] = [df.iloc[0][sensor_names[df_counter]]]

    df = df.sort_values(by='Time')
    df_counter += 1
    print(df.head())
    print(df.tail())
    print("--------------------------------------------------")


                     2004
Time                     
2022-08-01 00:00:00    54
2022-08-01 00:01:00    54
2022-08-01 00:03:00    58
2022-08-01 00:05:00    54
2022-08-01 00:07:00    56
                     2004
Time                     
2022-08-13 23:52:00    60
2022-08-13 23:54:00    56
2022-08-13 23:56:00    53
2022-08-13 23:58:00    57
2022-08-13 23:59:00    57
--------------------------------------------------
                     2018
Time                     
2022-08-01 00:00:00    54
2022-08-01 00:02:00    53
2022-08-01 00:04:00    54
2022-08-01 00:06:00    55
2022-08-01 00:08:00    55
                     2018
Time                     
2022-08-13 23:29:00    60
2022-08-13 23:31:00    56
2022-08-13 23:33:00    60
2022-08-13 23:35:00    59
2022-08-13 23:59:00    58
--------------------------------------------------
                     201D
Time                     
2022-08-01 00:00:00    52
2022-08-01 00:18:00    52
2022-08-01 00:20:00    53
2022-08-01 00:22:00    53
2022-08-01 00:

In [11]:
import datetime

dt = start_time
step = datetime.timedelta(minutes=1)
result = []

while dt < end_time:
    result.append(dt.strftime('%Y.%m.%d %H:%M:%S'))
    dt += step

col = {'Time': result, 'dummy': 0}
df_all_timestamps = pd.DataFrame(data=col)
df_all_timestamps['Time'] = pd.to_datetime(df_all_timestamps['Time'])
df_all_timestamps = df_all_timestamps.set_index('Time')

print(df_all_timestamps.head())

                     dummy
Time                      
2022-08-01 00:00:00      0
2022-08-01 00:01:00      0
2022-08-01 00:02:00      0
2022-08-01 00:03:00      0
2022-08-01 00:04:00      0


In [12]:
list_df_inter = []
df_counter = 0
df_final = df_all_timestamps.drop('dummy', axis=1)

for df in list_df:
    df = df_all_timestamps.merge(df, how='left', on='Time')
    df = df.drop('dummy', axis=1)

    while df[sensor_names[df_counter]].isnull().values.any():
        df = df.interpolate(method='bfill', limit=1)
        df = df.interpolate(method='ffill', limit=1)

    df = df.set_index('Time')
    df = df.resample('1T').mean()
    df_counter += 1
    # df_final = df_final.merge(df, how='left', on='Time')
    pd.concat((df_final,df),axis=0)
    print(df.head())
    print(df.tail())
    print("--------------------------------------------------")

del df_counter, df_all_timestamps, list_df, dt, start_time, end_time

                     2004
Time                     
2022-08-01 00:00:00  54.0
2022-08-01 00:01:00  54.0
2022-08-01 00:02:00  58.0
2022-08-01 00:03:00  58.0
2022-08-01 00:04:00  54.0
                     2004
Time                     
2022-08-13 23:54:00  56.0
2022-08-13 23:55:00  53.0
2022-08-13 23:56:00  53.0
2022-08-13 23:57:00  57.0
2022-08-13 23:58:00  57.0
--------------------------------------------------
                     2018
Time                     
2022-08-01 00:00:00  54.0
2022-08-01 00:01:00  53.0
2022-08-01 00:02:00  53.0
2022-08-01 00:03:00  54.0
2022-08-01 00:04:00  54.0
                     2018
Time                     
2022-08-13 23:54:00  59.0
2022-08-13 23:55:00  59.0
2022-08-13 23:56:00  59.0
2022-08-13 23:57:00  59.0
2022-08-13 23:58:00  59.0
--------------------------------------------------
                     201D
Time                     
2022-08-01 00:00:00  52.0
2022-08-01 00:01:00  52.0
2022-08-01 00:02:00  52.0
2022-08-01 00:03:00  52.0
2022-08-01 00:

In [None]:
print(df_final.head())
print(df_final.tail())

In [13]:
def calc_SVD_RMSE(df):
    U, s, Vt = np.linalg.svd(df)

    rank_range = range(1, len(df.columns) + 1)
    rmse_list = []
    amount_sing_values = []

    for rank in rank_range:
        Sigma = np.zeros((df.shape[0], df.shape[1]))
        Sigma[:rank, :rank] = np.diag(s[:rank])
        df_recon = U.dot(Sigma.dot(Vt))

        rmse = np.sqrt(np.mean((df - df_recon)**2))
        rmse_list.append(rmse.mean())
        amount_sing_values.append(Sigma[rank-1][rank-1])

    return rmse_list, amount_sing_values

In [14]:
df_final.reset_index(inplace=True)
df_without_timestamp = df_final.iloc[:, 1:len(df_final.columns)+1]

#rmse_list, amount_sing_values = calc_SVD_RMSE(df_without_timestamp)

for rank in range(len(rmse_list)):
        print(f'Rank: {rank+1} RMSE: {rmse_list[rank]} Singular Values: {amount_sing_values[rank]}')

NameError: name 'rmse_list' is not defined

In [None]:
df_without_timestamp_sub = df_without_timestamp.sub(df_without_timestamp.mean(axis=1), axis=0)

In [None]:
rmse_list_sub, amount_sing_values_sub = calc_SVD_RMSE(df_without_timestamp_sub)

for rank in range(len(rmse_list_sub)):
    print(f'Rank: {rank+1} RMSE: {rmse_list[rank]} Singular Values: {amount_sing_values_sub[rank]}')

In [None]:
amount_sing_values[0] = amount_sing_values[0] / 10

In [None]:
rank_range = range(1, len(df_final.columns))

fig, ax1 = plt.subplots()

colory1 = 'tab:red'
ax1.set_xlabel('Rank')
ax1.set_ylabel('Singular Values', color=colory1)
ax1.plot(rank_range, amount_sing_values, color=colory1)
ax1.plot(rank_range, amount_sing_values_sub, color=colory1)

ax2 = ax1.twinx()

colory2 = 'tab:blue'
ax2.set_ylabel('RMSE', color=colory2)
ax2.plot(rank_range, rmse_list, color=colory2)
ax2.plot(rank_range, rmse_list_sub, color=colory2)
ax2.tick_params(axis='y', labelcolor=colory2)

ax1.grid(True)
ax2.grid(False)

plt.show()

Presentation Content:

Choose the kth final rank, and explain why you choose that particular value.

Provide a figure showing the singular values vs. RMSE (compared with the global and regional models) for each of the three time periods, and use it to justify your kth rank.

Select at least three different stations based on their location in the city, choose each station from a different region.

Apply one or more of the SVD models made in the previous steps of the individual different stations chosen in the previous step by randomly sub-sampling your data matrix. Compare the results to your kth rank SVD model using the RMSE and the median absolute deviation.