In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d, CubicSpline

In [40]:
# Load the merged dataset, then discard its first column.
# NOTE(review): the first column is presumably a saved row index written by
# an earlier export step -- confirm against the code that produced merge.csv.
df = pd.read_csv("processed_data_1/merge.csv")
first_column_label = df.columns[0]
df = df.drop(columns=first_column_label)

In [41]:
# Share of missing (NaN) entries in every column, expressed in percent:
# the mean of the boolean null-mask is the missing fraction.
missing_percent = df.isnull().mean().mul(100)

# Report the per-column result (header line, then the Series).
print("Percentage of missing data per column:", missing_percent, sep="\n")


Percentage of missing data per column:
date            0.000000
node_id         1.673998
sensor_name     1.673998
no_wrk_aux      1.673998
no2_wrk_aux     1.673998
o3_wrk_aux      1.673998
temp            1.673998
rh              1.673998
no2_ref         0.000000
t_since_depl    0.000000
dtype: float64


In [42]:
# Three independent copies of the loaded frame, one per interpolation
# strategy, so each method fills gaps starting from the same untouched data.
df_linear, df_second, df_spline = (df.copy() for _ in range(3))

In [47]:
# First-order (linear) gap filling of the sensor channels.
# Per the pandas docs, method='linear' ignores the index and treats the rows
# as equally spaced.
# NOTE(review): interpolation runs down the whole frame without grouping by
# node_id -- confirm that neighbouring rows belong to the same node.
columns_to_interpolate = [
    'no_wrk_aux',
    'no2_wrk_aux',
    'o3_wrk_aux',
    'temp',
    'rh',
]

df_linear[columns_to_interpolate] = (
    df_linear[columns_to_interpolate]
    .interpolate(method='linear', axis=0)
)

# Percentage of entries still missing in each column after filling.
missing_percent = df_linear.isnull().mean().mul(100)

print("Percentage of missing data per column:", missing_percent, sep="\n")
print("Mean no2_wrk_aux:", df_linear["no2_wrk_aux"].mean(), sep="\n")


Percentage of missing data per column:
date            0.000000
node_id         1.673998
sensor_name     1.673998
no_wrk_aux      0.000000
no2_wrk_aux     0.000000
o3_wrk_aux      0.000000
temp            0.000000
rh              0.000000
no2_ref         0.000000
t_since_depl    0.000000
dtype: float64
Mean no2_wrk_aux:
0.015905219939710162


In [48]:
# Second-order (quadratic polynomial) gap filling of the sensor channels.
# Per the pandas docs, method='polynomial' is delegated to SciPy and uses the
# numerical values of the index as x-coordinates.
columns_to_interpolate = [
    'no_wrk_aux',
    'no2_wrk_aux',
    'o3_wrk_aux',
    'temp',
    'rh',
]

df_second[columns_to_interpolate] = (
    df_second[columns_to_interpolate]
    .interpolate(method='polynomial', order=2, axis=0)
)

# Percentage of entries still missing in each column after filling.
missing_percent = df_second.isnull().mean().mul(100)

print("Percentage of missing data per column:", missing_percent, sep="\n")
print("Mean no2_wrk_aux:", df_second["no2_wrk_aux"].mean(), sep="\n")


Percentage of missing data per column:
date            0.000000
node_id         1.673998
sensor_name     1.673998
no_wrk_aux      0.000000
no2_wrk_aux     0.000000
o3_wrk_aux      0.000000
temp            0.000000
rh              0.000000
no2_ref         0.000000
t_since_depl    0.000000
dtype: float64
Mean no2_wrk_aux:
0.0158877537963075


In [46]:
# Cubic (order-3) spline gap filling of the sensor channels.
# Per the pandas docs, method='spline' is passed to
# scipy.interpolate.UnivariateSpline and uses the numerical index values.
# NOTE(review): UnivariateSpline fits a smoothing spline unless s=0 is
# given -- confirm that a smoothing fit is what is wanted for gap filling.
columns_to_interpolate = [
    'no_wrk_aux',
    'no2_wrk_aux',
    'o3_wrk_aux',
    'temp',
    'rh',
]

df_spline[columns_to_interpolate] = (
    df_spline[columns_to_interpolate]
    .interpolate(method='spline', order=3, axis=0)
)

# Percentage of entries still missing in each column after filling.
missing_percent = df_spline.isnull().mean().mul(100)

print("Percentage of missing data per column:", missing_percent, sep="\n")
print("Mean no2_wrk_aux:", df_spline["no2_wrk_aux"].mean(), sep="\n")

Percentage of missing data per column:
date            0.000000
node_id         1.673998
sensor_name     1.673998
no_wrk_aux      0.000000
no2_wrk_aux     0.000000
o3_wrk_aux      0.000000
temp            0.000000
rh              0.000000
no2_ref         0.000000
t_since_depl    0.000000
dtype: float64
Mean no2_wrk_aux:
0.015886713679383303
