<a href="https://colab.research.google.com/github/fahaerte/LSIES-FinalAssignment/blob/main/Task3_SVDs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Execute the following cell to connect your google drive to the colab. Needed to load the data stored on your personal google drive. In case you use a local environment just comment out the following lines of code.

In [None]:
# Environment variables
drive = True
install_packages = False

# If necessary, install packages
if install_packages:
  !pip install Basemap basemap-data-hires
  !pip install pandas matplotlib numpy
  !pip install sklearn
  !pip install datetime

# Imports
import os
import pandas as pd
import numpy as np
from csv import reader
from sklearn.metrics import r2_score
from datetime import datetime
from datetime import timedelta as tdelta
from datetime import time as time
import warnings

# Mount drive if needed
if drive:
  from google.colab import drive
  drive.mount('/content/drive')
  %cd /content/drive/MyDrive/Colab\ Notebooks/LSIES

from Helper_funcs import *
from SensorPositions_Func import *
from SVD_Func import *

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Depending on the environemnt, set directory to data
if drive:
  dir_path = '/content/drive/MyDrive/data/'
else :
  dir_path = './data/'

folders = ['region_1_mustamäe_kristiine', 'region_2_data_kesklinn', 'region_3_kadriorg_lasnamäe', 'region_4_ülemiste']
csv_file_name_sensor_positions = 'sensor_positions.csv'
columns_sensors_positions = ['sensor_name', 'latitude', 'longitude', 'region']

In [2]:
#Import data
df_data_incomplete = import_sensor_data(dir_path)

#interpolate data using 'nearest' method
list_df_other = []
list_df_linear = []
list_df_nearest = []

temp_df = df_data_incomplete.interpolate(method='nearest')
df_data_nearest = temp_df.interpolate(limit_area=None, method='backfill')

temp_df = df_data_incomplete.interpolate(method='linear')
df_data_linear = temp_df.interpolate(limit_area=None, method='backfill')
df_data_linear.apply(lambda x: np.round(x, 1), 1);

In [3]:
# Compute indices of timestamps
day_index = np.where(np.logical_and(df_data_linear.index.time > pd.to_datetime('07:00').time(),
                     df_data_linear.index.time < pd.to_datetime('19:00').time()))[0]

evening_index = np.where(np.logical_and(df_data_linear.index.time >= pd.to_datetime('19:00').time(),
                     df_data_linear.index.time <= pd.to_datetime('23:00').time()))[0]
                     
night_index = np.where(np.logical_or(
    (df_data_linear.index.time > pd.to_datetime('23:00').time()),
    (df_data_linear.index.time <= pd.to_datetime('07:00').time())))[0]

In [4]:
# Compute SVDs, RMSE and SingValues for every region
sensor_positions_df = import_sensor_positions(dir_path, csv_file_name_sensor_positions)

d = {'sensor names':sensor_positions_df['sensor_name'],'region':sensor_positions_df['region']}
sensors = pd.DataFrame(d)
sensors_region1 = sensors[sensors['region'] == 1]['sensor names']
sensors_region2 = sensors[sensors['region'] == 2]['sensor names']
sensors_region3 = sensors[sensors['region'] == 3]['sensor names']
sensors_region4 = sensors[sensors['region'] == 4]['sensor names']

del d, sensor_positions_df, csv_file_name_sensor_positions

In [None]:
# Compute global SVDs
df_without_timestamp = df_data_linear.reset_index().iloc[:, 1:len(df_data_linear.columns)+1]
U_global, s_global, Vt_global = np.linalg.svd(df_without_timestamp)
del df_without_timestamp

In [None]:
# Compute RMSE and SingVal based on time period for Global SVD
rmse_list_day, amount_sing_values_day = calc_RMSE_sinVal_based_on_time_period(U_global, s_global, Vt_global, df_data_linear, day_index)
rmse_list_evening, amount_sing_values_evening = calc_RMSE_sinVal_based_on_time_period(U_global, s_global, Vt_global, df_data_linear, evening_index)
rmse_list_night, amount_sing_values_night = calc_RMSE_sinVal_based_on_time_period(U_global, s_global, Vt_global, df_data_linear, night_index)

In [None]:
# Plot RMSE and Singular Values of global SVD
plot_rmse_sinval(rmse_list_day, amount_sing_values_day, 'Global model - Day period')
plot_rmse_sinval(rmse_list_evening, amount_sing_values_evening, 'Global model - Evening period')
plot_rmse_sinval(rmse_list_night, amount_sing_values_night, 'Global model - Night period')

In [None]:
# Collect garbage
del rmse_list_day, amount_sing_values_day, rmse_list_evening, amount_sing_values_evening, rmse_list_night, amount_sing_values_night
del temp_df, df_data_incomplete

In [None]:
# Compute SVDs for region 1
U_reg1, s_reg1, Vt_reg1 = np.linalg.svd(df_data_linear[sensors_region1])

In [None]:
# Region 1
rmse_list_day_reg1, amount_sing_values_day_reg1 = calc_RMSE_sinVal_based_on_time_period(U_reg1, s_reg1, Vt_reg1, df_data_linear[sensors_region1], day_index)
rmse_list_evening_reg1, amount_sing_values_evening_reg1 = calc_RMSE_sinVal_based_on_time_period(U_reg1, s_reg1, Vt_reg1, df_data_linear[sensors_region1], evening_index)
rmse_list_night_reg1, amount_sing_values_night_reg1 = calc_RMSE_sinVal_based_on_time_period(U_reg1, s_reg1, Vt_reg1, df_data_linear[sensors_region1], night_index)

# Plot region 1
plot_rmse_sinval(rmse_list_day_reg1, amount_sing_values_day_reg1, 'Region 1 model - Day period')
plot_rmse_sinval(rmse_list_evening_reg1, amount_sing_values_evening_reg1, 'Region 1 model - Evening period')
plot_rmse_sinval(rmse_list_night_reg1, amount_sing_values_night_reg1, 'Region 1 model - Night period')

del rmse_list_day_reg1, rmse_list_evening_reg1, rmse_list_night_reg1, amount_sing_values_day_reg1, amount_sing_values_evening_reg1, amount_sing_values_night_reg1

In [None]:
# Compute SVDs for region 2
U_reg2, s_reg2, Vt_reg2 = np.linalg.svd(df_data_linear[sensors_region2])

In [None]:
# Region 2
rmse_list_day_reg2, amount_sing_values_day_reg2 = calc_RMSE_sinVal_based_on_time_period(U_reg2, s_reg2, Vt_reg2, df_data_linear[sensors_region2], day_index)
rmse_list_evening_reg2, amount_sing_values_evening_reg2 = calc_RMSE_sinVal_based_on_time_period(U_reg2, s_reg2, Vt_reg2, df_data_linear[sensors_region2], evening_index)
rmse_list_night_reg2, amount_sing_values_night_reg2 = calc_RMSE_sinVal_based_on_time_period(U_reg2, s_reg2, Vt_reg2, df_data_linear[sensors_region2], night_index)

# Plot region 2
plot_rmse_sinval(rmse_list_day_reg2, amount_sing_values_day_reg2, 'Region 2 model - Day period')
plot_rmse_sinval(rmse_list_evening_reg2, amount_sing_values_evening_reg2, 'Region 2 model - Evening period')
plot_rmse_sinval(rmse_list_night_reg2, amount_sing_values_night_reg2, 'Region 2 model - Night period')

del rmse_list_day_reg2, rmse_list_evening_reg2, rmse_list_night_reg2, amount_sing_values_day_reg2, amount_sing_values_evening_reg2, amount_sing_values_night_reg2

In [None]:
# Compute SVDs for region 3
U_reg3, s_reg3, Vt_reg3 = np.linalg.svd(df_data_linear[sensors_region3])

In [None]:
# Region 3
rmse_list_day_reg3, amount_sing_values_day_reg3 = calc_RMSE_sinVal_based_on_time_period(U_reg3, s_reg3, Vt_reg3, df_data_linear[sensors_region3], day_index)
rmse_list_evening_reg3, amount_sing_values_evening_reg3 = calc_RMSE_sinVal_based_on_time_period(U_reg3, s_reg3, Vt_reg3, df_data_linear[sensors_region3], evening_index)
rmse_list_night_reg3, amount_sing_values_night_reg3 = calc_RMSE_sinVal_based_on_time_period(U_reg3, s_reg3, Vt_reg3, df_data_linear[sensors_region3], night_index)

# Plot region 3
plot_rmse_sinval(rmse_list_day_reg3, amount_sing_values_day_reg3, 'Region 3 model - Day period')
plot_rmse_sinval(rmse_list_evening_reg3, amount_sing_values_evening_reg3, 'Region 3 model - Evening period')
plot_rmse_sinval(rmse_list_night_reg3, amount_sing_values_night_reg3, 'Region 3 model - Night period')

del rmse_list_day_reg3, rmse_list_evening_reg3, rmse_list_night_reg3, amount_sing_values_day_reg3, amount_sing_values_evening_reg3, amount_sing_values_night_reg3

In [None]:
# Compute SVDs for region 4
U_reg4, s_reg4, Vt_reg4 = np.linalg.svd(df_data_linear[sensors_region4])

In [None]:
# Region 4
rmse_list_day_reg4, amount_sing_values_day_reg4 = calc_RMSE_sinVal_based_on_time_period(U_reg4, s_reg4, Vt_reg4, df_data_linear[sensors_region4], day_index)
rmse_list_evening_reg4, amount_sing_values_evening_reg4 = calc_RMSE_sinVal_based_on_time_period(U_reg4, s_reg4, Vt_reg4, df_data_linear[sensors_region4], evening_index)
rmse_list_night_reg4, amount_sing_values_night_reg4 = calc_RMSE_sinVal_based_on_time_period(U_reg4, s_reg4, Vt_reg4, df_data_linear[sensors_region4], night_index)

# Plot region 4
plot_rmse_sinval(rmse_list_day_reg4, amount_sing_values_day_reg4, 'Region 4 model - Day period')
plot_rmse_sinval(rmse_list_evening_reg4, amount_sing_values_evening_reg4, 'Region 4 model - Evening period')
plot_rmse_sinval(rmse_list_night_reg4, amount_sing_values_night_reg4, 'Region 4 model - Night period')

del rmse_list_day_reg4, rmse_list_evening_reg4, rmse_list_night_reg4, amount_sing_values_day_reg4, amount_sing_values_evening_reg4, amount_sing_values_night_reg4

In [None]:
# Pick 4 stations and apply the different SVD models
stat_list = [
    {'id': '237B', 'region': 1, 'real_data': df_data_linear['237B']},
    {'id': '23B1', 'region': 2, 'real_data': df_data_linear['23B1']},
    {'id': '22CC', 'region': 3, 'real_data': df_data_linear['22CC']},
    {'id': '2329', 'region': 4, 'real_data': df_data_linear['2329']}
]

# Compute global SVDs
df_without_timestamp = df_data_linear.reset_index().iloc[:, 1:len(df_data_linear.columns)+1]
U_global, s_global, Vt_global = np.linalg.svd(df_without_timestamp)
s_global = keep_low_rank_model_of_svd(df=df_data_linear, s=s_global, rank=110) # RMSE approx 1.5 with a rank of 110

for stat in stat_list:
  stat['recon_global'] = apply_svd(df_data_linear, stat['id'], U_global, s_global, Vt_global)
  stat = compare_results(stat, 'comp_to_global', stat['real_data'], stat['recon_global'])

del df_without_timestamp, U_global, s_global, Vt_global

# Compute SVDs for region 1
U_reg1, s_reg1, Vt_reg1 = np.linalg.svd(df_data_linear[sensors_region1])
s_reg1 = keep_low_rank_model_of_svd(df=df_data_linear[sensors_region1], s=s_reg1, rank=40) # RMSE approx 1 with a rank of 40
stat_list[0]['recon_reg'] = apply_svd(df_data_linear[sensors_region1], stat_list[0]['id'], U_reg1, s_reg1, Vt_reg1)
stat_list[0] = compare_results(stat_list[0], 'comp_to_recon_reg', stat_list[0]['real_data'], stat_list[0]['recon_reg'])

del U_reg1, s_reg1, Vt_reg1

# Compute SVDs for region 2
U_reg2, s_reg2, Vt_reg2 = np.linalg.svd(df_data_linear[sensors_region2])
s_reg2 = keep_low_rank_model_of_svd(df=df_data_linear[sensors_region2], s=s_reg2, rank=83) # RMSE approx 1 with a rank of 83
stat_list[1]['recon_reg'] = apply_svd(df_data_linear[sensors_region2], stat_list[1]['id'], U_reg2, s_reg2, Vt_reg2)
stat_list[1] = compare_results(stat_list[1], 'comp_to_recon_reg', stat_list[1]['real_data'], stat_list[1]['recon_reg'])

del U_reg2, s_reg2, Vt_reg2

# Compute SVDs for region 3
U_reg3, s_reg3, Vt_reg3 = np.linalg.svd(df_data_linear[sensors_region3])
s_reg3 = keep_low_rank_model_of_svd(df=df_data_linear[sensors_region3], s=s_reg3, rank=35) # RMSE approx 1 with a rank of 35
stat_list[2]['recon_reg'] = apply_svd(df_data_linear[sensors_region3], stat_list[2]['id'], U_reg3, s_reg3, Vt_reg3)
stat_list[2] = compare_results(stat_list[2], 'comp_to_recon_reg', stat_list[2]['real_data'], stat_list[2]['recon_reg'])

del U_reg3, s_reg3, Vt_reg3

# Compute SVDs for region 4
U_reg4, s_reg4, Vt_reg4 = np.linalg.svd(df_data_linear[sensors_region4])
s_reg4 = keep_low_rank_model_of_svd(df=df_data_linear[sensors_region4], s=s_reg4, rank=6) # RMSE approx 1 with a rank of 6
stat_list[3]['recon_reg'] = apply_svd(df_data_linear[sensors_region4], stat_list[3]['id'], U_reg4, s_reg4, Vt_reg4)
stat_list[3] = compare_results(stat_list[3], 'comp_to_recon_reg', stat_list[3]['real_data'], stat_list[3]['recon_reg'])

del U_reg4, s_reg4, Vt_reg4

In [6]:
# Print values
for stat in stat_list:
  fstring = f"SensorId, {stat['id']} in region {stat['region']}: RMSE: {stat['comp_to_recon_reg_rmse']} MAD: {stat['comp_to_recon_reg_mad']}"
  print(fstring)



SensorId, 237B in region 1: RMSE: 0.7183918793822568 MAD: 4.297528286525896
SensorId, 23B1 in region 2: RMSE: 0.668416066262428 MAD: 3.496353779639662
SensorId, 22CC in region 3: RMSE: 1.7926991137765367 MAD: 4.2969234810456385
SensorId, 2329 in region 4: RMSE: 1.039761680276122 MAD: 2.519483145556034
