## Imports

In [87]:
import numpy as np
import pandas as pd

## Load CSV Data

In [2]:
def load_csv_data(filepath):
    """Read the CSV file at ``filepath`` and return it as a pandas DataFrame.

    Args:
        filepath: Location of the CSV file, forwarded unchanged to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(filepath)

# Location of the raw CSV, relative to the notebook's working directory.
filepath = "../data/aws_1hour.csv"
# Keep the untouched raw frame under its own name so later cells can derive from it.
data_df = load_csv_data(filepath=filepath)

## Parse Data

In [59]:
# Build the parsed frame from the raw one in a single chain, so re-running
# this cell always starts again from `data_df`.
parsed_data_df = (
    data_df
    # Remove unnecessary columns (wind speed is removed since most of its values are nan).
    .drop(columns=["FID", "the_geom", "code", "qc_flags", "wind_speed"])
    # Convert the timestamp strings to datetime values.
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
    # Keep only the first row seen for each timestamp.
    .drop_duplicates(subset="timestamp")
    # Index by time and put the rows in chronological order.
    .set_index("timestamp")
    .sort_index()
)
parsed_data_df

Unnamed: 0_level_0,air_pressure,air_temperature,relative_humidity,precipitation
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2003-12-01 00:00:00,987.84,8.32,96.37,1.20
2003-12-01 01:00:00,987.31,8.27,96.48,1.10
2003-12-01 02:00:00,986.98,8.35,96.90,0.50
2003-12-01 03:00:00,986.56,8.86,97.23,0.10
2003-12-01 04:00:00,986.25,9.69,95.93,0.10
...,...,...,...,...
2024-11-19 11:00:00,979.99,8.57,96.55,2.72
2024-11-19 12:00:00,980.17,8.42,94.12,0.48
2024-11-19 13:00:00,981.22,6.59,93.31,1.18
2024-11-19 14:00:00,982.19,5.11,93.40,0.99


## Missing Values

In [60]:
# Number of NaN entries in each column.
parsed_data_df.isna().sum(axis=0)

air_pressure         19
air_temperature      23
relative_humidity    19
precipitation        19
dtype: int64

In [61]:
# Discard every row that contains at least one NaN, then re-count the NaNs
# per column to confirm that none remain.
parsed_data_df = parsed_data_df.dropna(how="any")
parsed_data_df.isna().sum()

air_pressure         0
air_temperature      0
relative_humidity    0
precipitation        0
dtype: int64

## Missing Timestamps

In [63]:
# Bounds of the observed period. min()/max() give the same values as the
# first/last entry of a sorted index, but do not depend on the sort having run.
start_date = parsed_data_df.index.min()
end_date = parsed_data_df.index.max()

# Every hourly timestamp that should exist between the two bounds.
expected_index = pd.date_range(start=start_date, end=end_date, freq="1h")
full_timestamps_df = pd.DataFrame(expected_index)

# Count the expected timestamps that are actually absent from the data.
# Subtracting row counts would only be right if every observed timestamp
# lay exactly on the hourly grid; a set difference holds regardless.
n_missing = len(expected_index.difference(parsed_data_df.index))
print(f"There are {n_missing} missing timestamps")

There are 147 missing timestamps


### Interpolate Missing Timestamps

In [74]:
# Turn the single column of expected timestamps into an index named
# "timestamp", so the frame can be joined against the observations.
# The guard makes the cell safe to re-run: once the frame is indexed it has
# no columns left, so renaming the columns again would raise.
if full_timestamps_df.index.name != "timestamp":
    full_timestamps_df = (
        full_timestamps_df
        .set_axis(["timestamp"], axis="columns")
        .set_index("timestamp")
    )
full_timestamps_df

2003-12-01 00:00:00
2003-12-01 01:00:00
2003-12-01 02:00:00
2003-12-01 03:00:00
2003-12-01 04:00:00
...
2024-11-19 11:00:00
2024-11-19 12:00:00
2024-11-19 13:00:00
2024-11-19 14:00:00
2024-11-19 15:00:00


In [112]:
# The outer join adds one all-NaN row for every expected timestamp that has
# no observation; interpolation then fills those rows from their neighbours.
# method="time" weights each filled value by the real time distance to the
# surrounding observations. The default ("linear") ignores the index and
# assumes equally spaced rows, which is only correct on a perfectly regular grid.
# NOTE(review): precipitation is interpolated like the other columns — confirm
# that this is acceptable for the downstream use.
parsed_data_df = (
    parsed_data_df
    .join(full_timestamps_df, how="outer")
    .interpolate(method="time")
)
parsed_data_df.isna().sum()

air_pressure         0
air_temperature      0
relative_humidity    0
precipitation        0
dtype: int64

In [113]:
# Write the preprocessed frame to disk; the timestamp index is written as
# the first CSV column.
parsed_data_df.to_csv(path_or_buf="../data/preprocessed_aws_1hour.csv", index=True)

In [116]:
# Read the saved file back as a sanity check. Parsing the index on load
# restores a DatetimeIndex; a plain set_index("timestamp") would leave the
# timestamps as strings and break time-based slicing and resampling.
new_df = pd.read_csv(
    "../data/preprocessed_aws_1hour.csv",
    index_col="timestamp",
    parse_dates=True,
)
assert isinstance(new_df.index, pd.DatetimeIndex), "timestamp index was not parsed as datetimes"
new_df

Unnamed: 0_level_0,air_pressure,air_temperature,relative_humidity,precipitation
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2003-12-01 00:00:00,987.84,8.32,96.37,1.20
2003-12-01 01:00:00,987.31,8.27,96.48,1.10
2003-12-01 02:00:00,986.98,8.35,96.90,0.50
2003-12-01 03:00:00,986.56,8.86,97.23,0.10
2003-12-01 04:00:00,986.25,9.69,95.93,0.10
...,...,...,...,...
2024-11-19 11:00:00,979.99,8.57,96.55,2.72
2024-11-19 12:00:00,980.17,8.42,94.12,0.48
2024-11-19 13:00:00,981.22,6.59,93.31,1.18
2024-11-19 14:00:00,982.19,5.11,93.40,0.99
