# **Mount Google Drive file system**

In [None]:
# Colab-only step: mount Google Drive at /content/drive/ so that the
# dataset folders referenced later in the notebook are reachable.
from google.colab import drive

drive.mount(mountpoint='/content/drive/')

Mounted at /content/drive/


# **Import libraries**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import os
import joblib
import sys
from datetime import datetime
import numpy as np
import stat

# **Global definitions**

In [None]:
# Folder scanned by the loading cell: each sub-folder is treated as one dataset
# (trailing slash is required because paths are built by string concatenation).
input_dir = "/content/drive/MyDrive/AD_SMART_CITY/C/Input/"
# Folder under which the result CSVs are written (trailing slash required, as above).
output_dir = "/content/drive/MyDrive/AD_SMART_CITY/C/Output/"

# **Load the Data**

In [None]:
# Nested dictionaries indexed as data[time_frame][day] -> DataFrame.
#   fault_free_data : readings loaded from <dataset>/board_0/<day file>
#   faulty_data     : readings loaded from <dataset>/board_13/<day file>
#   reference_data  : rows of <dataset>/arpa.csv restricted to the day's time span
fault_free_data = {}
faulty_data = {}
reference_data = {}

for dataset in os.listdir(input_dir):
  dataset_dir = input_dir + dataset
  if not os.path.isdir(dataset_dir) or dataset == ".ipynb_checkpoints":
    continue

  # The time frame label is the last "_"-separated token of the folder name.
  time_frame = dataset.split("_")[-1]
  fault_free_data[time_frame] = {}
  faulty_data[time_frame] = {}
  reference_data[time_frame] = {}

  # Keep regular files only, so stray directories (e.g. notebook checkpoints)
  # inside board_0 are not handed to read_csv. Sorted for a deterministic order.
  board_0_dir = dataset_dir + "/board_0/"
  day_files = sorted(entry for entry in os.listdir(board_0_dir) if os.path.isfile(board_0_dir + entry))
  if not day_files:
    continue

  # arpa.csv does not depend on the day: read it and convert its timestamps once
  # per dataset instead of once per day file.
  # NOTE(review): strptime() yields naive datetimes, so .timestamp() interprets
  # them in the kernel's local time zone - confirm this matches the board clocks.
  dataset_reference = pd.read_csv(dataset_dir + "/arpa.csv")
  dataset_reference["timestamp"] = [
      int(datetime.strptime(text, "%Y-%m-%d %H:%M:%S").timestamp())
      for text in dataset_reference["timestamp"]
  ]

  for day_file in day_files:
    day = day_file.split(".")[0]
    fault_free_data[time_frame][day] = pd.read_csv(board_0_dir + day_file)
    faulty_data[time_frame][day] = pd.read_csv(dataset_dir + "/board_13/" + day_file)

    # The first and last board_0 timestamps delimit the day (inclusive bounds).
    start_timestamp = fault_free_data[time_frame][day]["timestamp"].iloc[0]
    end_timestamp = fault_free_data[time_frame][day]["timestamp"].iloc[-1]

    in_day = (dataset_reference["timestamp"] >= start_timestamp) & (dataset_reference["timestamp"] <= end_timestamp)
    reference_data[time_frame][day] = dataset_reference[in_day]


# **Aggregate normal data**

In [None]:
# Collapse the four per-sensor PM2.5 columns of every fault-free frame into a
# single row-wise mean column "pm25", keeping only "timestamp" and "pm25".
fault_free_sensor_labels = ['pm25_0', 'pm25_1', 'pm25_2', 'pm25_3']

for time_frame in fault_free_data:
  for day in fault_free_data[time_frame]:
    day_frame = fault_free_data[time_frame][day]

    # This cell drops the sensor columns it reads, so running it a second time
    # used to fail with a KeyError. Frames that already hold the aggregate and
    # no longer have any sensor column are left untouched.
    already_aggregated = "pm25" in day_frame.columns and not any(
        label in day_frame.columns for label in fault_free_sensor_labels
    )
    if already_aggregated:
      continue

    # Build a fresh frame (explicit copy) so that columns added by later cells
    # are written to an independent object rather than to a slice of the raw data.
    aggregated_frame = day_frame[["timestamp"]].copy()
    aggregated_frame["pm25"] = day_frame[fault_free_sensor_labels].mean(axis=1)
    fault_free_data[time_frame][day] = aggregated_frame

KeyError: "None of [Index(['pm25_0', 'pm25_1', 'pm25_2', 'pm25_3'], dtype='object')] are in the [columns]"

# **Calibration training**

In [None]:
# Fit one linear calibration model per time frame, mapping the aggregated
# fault-free reading ("pm25") to the reference value ("arpa"), then add the
# calibrated reading to every fault-free day as "pm25_calibrated".
calibration_models = {}

for time_frame in fault_free_data:
  days = list(fault_free_data[time_frame])

  # Concatenate all days in a single call (growing a DataFrame inside a loop
  # copies the accumulated data on every iteration).
  merged_fault_free_data = pd.concat([fault_free_data[time_frame][day] for day in days], axis = 0, ignore_index = True)
  merged_reference_data = pd.concat([reference_data[time_frame][day] for day in days], axis = 0, ignore_index = True)

  # merge_asof requires both inputs to be sorted on the join key.
  merged_fault_free_data = merged_fault_free_data.sort_values("timestamp").reset_index(drop=True)
  merged_reference_data = merged_reference_data.sort_values("timestamp").reset_index(drop=True)

  # Pair each board sample with the nearest reference sample whose timestamp is
  # at most 60 timestamp units away; unmatched samples get NaN and are dropped.
  # NOTE(review): reference timestamps are epoch seconds; this assumes the board
  # timestamps use the same unit - confirm.
  merged_fault_free_reference_data = pd.merge_asof(merged_fault_free_data,merged_reference_data,on='timestamp',direction='nearest',tolerance=60)
  merged_fault_free_reference_data = merged_fault_free_reference_data.dropna()

  X = merged_fault_free_reference_data['pm25'].values.reshape(-1, 1)
  y = merged_fault_free_reference_data['arpa'].values

  calibration_models[time_frame] = LinearRegression()
  calibration_models[time_frame].fit(X,y)

  for day in days:
    fault_free_data[time_frame][day]['pm25_calibrated'] = calibration_models[time_frame].predict(fault_free_data[time_frame][day]['pm25'].values.reshape(-1, 1))


# **Filter and aggregate faulty data**




In [None]:
# For every faulty-board day: smooth each individual sensor with a moving
# average, rule out smoothed values below a threshold, and store the row-wise
# mean of the surviving sensors in a new "pm25" column. The raw sensor columns
# of faulty_data are left unchanged.
individual_sensor_labels = ["pm25_0", "pm25_1", "pm25_2", "pm25_3"]

window_size = 30 # length (in rows) of the rolling window for computing the moving average
threshold = 5.0 # smoothed sensor values below this are ruled out

for time_frame in faulty_data:
  for day in faulty_data[time_frame]:
    aggregated_filtered_faulty_data = faulty_data[time_frame][day].copy()

    # smoothen the individual sensors (min_periods=1: the first rows average
    # over however many samples are available so far)
    smoothed_sensors = aggregated_filtered_faulty_data[individual_sensor_labels].rolling(window=window_size, min_periods=1).mean()

    # thresholding-based rule-out of individual sensors; the mask is applied to
    # the sensor columns only, so timestamps and any other (possibly
    # non-numeric) columns are never compared against the threshold
    aggregated_filtered_faulty_data[individual_sensor_labels] = smoothed_sensors.where(smoothed_sensors >= threshold, np.nan)

    # row-wise aggregation of the individual sensors, excluding those with NaN values
    # (a row where every sensor was ruled out yields NaN)
    faulty_data[time_frame][day]["pm25"] = aggregated_filtered_faulty_data[individual_sensor_labels].mean(axis=1, skipna=True)


# **Calibration inference (on both the filtered aggregate and the unfiltered, non-aggregated individual sensors)**

In [None]:
# Apply the per-time-frame calibration model to the faulty board: once to each
# raw individual sensor ("<label>_calibrated") and once to the filtered
# aggregate ("pm25_calibrated").
individual_sensor_labels = ["pm25_0", "pm25_1", "pm25_2", "pm25_3"]


def _predict_ignoring_nan(model, values):
  """Return model predictions for a 1-D sequence of readings, keeping NaN as NaN.

  The filtered aggregate is NaN on rows where every sensor was ruled out, and
  scikit-learn estimators refuse NaN input, so only the finite entries are
  passed to ``model.predict``; the remaining positions stay NaN.
  """
  values = np.asarray(values, dtype=float)
  calibrated = np.full(values.shape, np.nan)
  valid = ~np.isnan(values)
  if valid.any():
    calibrated[valid] = model.predict(values[valid].reshape(-1, 1))
  return calibrated


for time_frame in faulty_data:
  model = calibration_models[time_frame]
  for day in faulty_data[time_frame]:
    board_frame = faulty_data[time_frame][day]
    for label in individual_sensor_labels:
      board_frame[label + "_calibrated"] = _predict_ignoring_nan(model, board_frame[label].values)
    board_frame["pm25_calibrated"] = _predict_ignoring_nan(model, board_frame["pm25"].values)


# **Save results**

In [None]:
# Write one CSV per day and board under <output_dir>/11_<time_frame>/board_0/
# and <output_dir>/11_<time_frame>/board_13/.
for time_frame in fault_free_data:
  result_dir = output_dir + "11_" + time_frame
  board_0_dir = result_dir + "/board_0/"
  board_13_dir = result_dir + "/board_13/"

  # Create each folder independently of the others: previously the board
  # sub-folders were only created when the parent folder did not exist yet, so
  # a partially created output tree made the writes below fail.
  os.makedirs(board_0_dir, exist_ok=True)
  os.makedirs(board_13_dir, exist_ok=True)

  for day in fault_free_data[time_frame]:
    fault_free_data[time_frame][day].to_csv(board_0_dir + day + ".csv", index=False)
    faulty_data[time_frame][day].to_csv(board_13_dir + day + ".csv", index=False)