In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d, CubicSpline

In [32]:
df = pd.read_csv("data/merged_data/merge.csv")
df = df.drop(df.columns[0], axis=1)

In [33]:
# Calculate percentage of missing data for each column
missing_percent = df.isna().mean() * 100

# Print the result
print("Percentage of missing data per column:")
print(missing_percent)


# Find a place where there are two missing points
print("Point before:", df.iloc[7470])
print("Missing point 1:", df.iloc[7471])
print("Missing point 2:", df.iloc[7472])
print("Point after:", df.iloc[7473])


Percentage of missing data per column:
date            0.000000
node_id         1.673998
sensor_name     1.673998
no_wrk_aux      1.673998
no2_wrk_aux     1.673998
o3_wrk_aux      1.673998
temp            1.673998
rh              1.673998
no2_ref         0.000000
t_since_depl    0.000000
dtype: float64
Point before: date             2023-11-30 21:00:00-05:00
node_id                              250.0
sensor_name     Myron J Francis Elementary
no_wrk_aux                        0.028038
no2_wrk_aux                       0.022489
o3_wrk_aux                        0.034419
temp                              8.394274
rh                               55.949033
no2_ref                               15.5
t_since_depl                          8013
Name: 7470, dtype: object
Missing point 1: date            2023-11-30 22:00:00-05:00
node_id                               NaN
sensor_name                           NaN
no_wrk_aux                            NaN
no2_wrk_aux                           NaN

In [34]:
df_linear = df.copy()
df_second = df.copy()
df_spline = df.copy()

In [35]:
# First order interpolation

columns_to_interpolate = ['no_wrk_aux', 'no2_wrk_aux', 'o3_wrk_aux', 'temp', 'rh']

df_linear[columns_to_interpolate] = df_linear[columns_to_interpolate].interpolate(method='linear', axis=0) # make sure that this is just between the most recent and next points

# Calculate percentage of missing data for each column
missing_percent = df_linear.isna().mean() * 100

print("Percentage of missing data per column:")
print(missing_percent)
print("Mean no2_wrk_aux:")
print(np.mean(df_linear["no2_wrk_aux"]))

# Check to make sure that the linear interpolation is reasonable across two missing points
print("Point before:", df_linear.iloc[7470])
print("Linear interpolated point 1:", df_linear.iloc[7471])
print("Linear interpolated point 2:", df_linear.iloc[7472])
print("Point after:", df_linear.iloc[7473])


Percentage of missing data per column:
date            0.000000
node_id         1.673998
sensor_name     1.673998
no_wrk_aux      0.000000
no2_wrk_aux     0.000000
o3_wrk_aux      0.000000
temp            0.000000
rh              0.000000
no2_ref         0.000000
t_since_depl    0.000000
dtype: float64
Mean no2_wrk_aux:
0.015905219939710162
Point before: date             2023-11-30 21:00:00-05:00
node_id                              250.0
sensor_name     Myron J Francis Elementary
no_wrk_aux                        0.028038
no2_wrk_aux                       0.022489
o3_wrk_aux                        0.034419
temp                              8.394274
rh                               55.949033
no2_ref                               15.5
t_since_depl                          8013
Name: 7470, dtype: object
Linear interpolated point 1: date            2023-11-30 22:00:00-05:00
node_id                               NaN
sensor_name                           NaN
no_wrk_aux                      

In [36]:
# Second order interpolation

columns_to_interpolate = ['no_wrk_aux', 'no2_wrk_aux', 'o3_wrk_aux', 'temp', 'rh']

#df_second[columns_to_interpolate] = df_second[columns_to_interpolate].interpolate(method='polynomial',order=2, axis=0) # could be higher or lower
df_second[columns_to_interpolate] = df_second[columns_to_interpolate].interpolate(method='quadratic', axis=0) # could be higher or lower


# Calculate percentage of missing data for each column
missing_percent = df_second.isna().mean() * 100

print("Percentage of missing data per column:")
print(missing_percent)
print("Mean no2_wrk_aux:")
print(np.mean(df_second["no2_wrk_aux"]))

# Check to make sure that the quadratic interpolation is reasonable across two missing points
print("Point before:", df_second.iloc[7470])
print("Quandratic interpolated point 1:", df_second.iloc[7471])
print("Quandratic interpolated point 2:", df_second.iloc[7472])
print("Point after:", df_second.iloc[7473])


Percentage of missing data per column:
date            0.000000
node_id         1.673998
sensor_name     1.673998
no_wrk_aux      0.000000
no2_wrk_aux     0.000000
o3_wrk_aux      0.000000
temp            0.000000
rh              0.000000
no2_ref         0.000000
t_since_depl    0.000000
dtype: float64
Mean no2_wrk_aux:
0.0158877537963075
Point before: date             2023-11-30 21:00:00-05:00
node_id                              250.0
sensor_name     Myron J Francis Elementary
no_wrk_aux                        0.028038
no2_wrk_aux                       0.022489
o3_wrk_aux                        0.034419
temp                              8.394274
rh                               55.949033
no2_ref                               15.5
t_since_depl                          8013
Name: 7470, dtype: object
Quandratic interpolated point 1: date            2023-11-30 22:00:00-05:00
node_id                               NaN
sensor_name                           NaN
no_wrk_aux                    

In [37]:
# Spline interpolation

columns_to_interpolate = ['no_wrk_aux', 'no2_wrk_aux', 'o3_wrk_aux', 'temp', 'rh']

df_spline[columns_to_interpolate] = df_spline[columns_to_interpolate].interpolate(method='spline',order=3, axis=0)

# Calculate percentage of missing data for each column
missing_percent = df_spline.isna().mean() * 100

print("Percentage of missing data per column:")
print(missing_percent)
print("Mean no2_wrk_aux:")
print(np.mean(df_spline["no2_wrk_aux"]))

# Check to make sure that the spline interpolation is reasonable across two missing points
print("Point before:", df_spline.iloc[7470])
print("Spline interpolated point 1:", df_spline.iloc[7471])
print("Spline interpolated point 2:", df_spline.iloc[7472])
print("Point after:", df_spline.iloc[7473])


Percentage of missing data per column:
date            0.000000
node_id         1.673998
sensor_name     1.673998
no_wrk_aux      0.000000
no2_wrk_aux     0.000000
o3_wrk_aux      0.000000
temp            0.000000
rh              0.000000
no2_ref         0.000000
t_since_depl    0.000000
dtype: float64
Mean no2_wrk_aux:
0.015886713679383303
Point before: date             2023-11-30 21:00:00-05:00
node_id                              250.0
sensor_name     Myron J Francis Elementary
no_wrk_aux                        0.028038
no2_wrk_aux                       0.022489
o3_wrk_aux                        0.034419
temp                              8.394274
rh                               55.949033
no2_ref                               15.5
t_since_depl                          8013
Name: 7470, dtype: object
Spline interpolated point 1: date            2023-11-30 22:00:00-05:00
node_id                               NaN
sensor_name                           NaN
no_wrk_aux                      

In [38]:
# Save dfs
df_linear.to_csv("data/interpolated_data/interp1.csv")
df_second.to_csv("data/interpolated_data/interp2.csv")
df_spline.to_csv("data/interpolated_data/interp3.csv")