Loading data:

In [2]:
import numpy as np
import pandas as pd
import json

# Run parameters (frequencies, resampling and interpolation settings) come
# from a JSON file that sits next to the notebook.
with open("proj5_params.json") as params_file:
    params = json.load(params_file)

# Raw time series as read from CSV; no parsing or indexing is applied here.
timeseries = pd.read_csv("proj5_timeseries.csv")

In [3]:
import re

# Normalise every column name in a single pass: each character outside
# [0-9a-zA-Z] becomes "_", then the whole name is lower-cased.
timeseries = timeseries.rename(
    columns={
        col: re.sub(r"[^0-9a-zA-Z]", "_", col).lower()
        for col in timeseries.columns
    }
)

# The first column is parsed as datetimes, promoted to the index, and the
# frame is then conformed to the grid given by params["original_frequency"].
first_col = timeseries.columns[0]
timeseries = (
    timeseries
    .assign(**{first_col: pd.to_datetime(timeseries[first_col], format="mixed")})
    .set_index(first_col)
    .asfreq(params["original_frequency"])
)

In [4]:
import pickle

# Serialise the indexed, frequency-aligned frame to proj5_ex01.pkl.
with open("proj5_ex01.pkl", "wb") as out_file:
    pickle.dump(timeseries, out_file)

In [5]:
# Conform the series to the target frequency. `asfreq` only selects the rows
# that fall on the new grid (missing grid points become NaN); it does not
# aggregate or fill.
target_frequency = params["target_frequency"]
timeseries_2 = timeseries.asfreq(target_frequency)

# Serialise the re-gridded frame to proj5_ex02.pkl.
with open("proj5_ex02.pkl", "wb") as out_file:
    pickle.dump(timeseries_2, out_file)

In [6]:
# Build the resampling rule, e.g. periods=2 and units="W" give "2W".
downsample_periods = params["downsample_periods"]
downsample_units = params["downsample_units"]
period_str = f"{downsample_periods}{downsample_units}"

# A window's sum is only kept when that window holds at least
# `downsample_periods` non-missing values in the given column; otherwise the
# result is NaN. `min_count` expresses exactly this rule, so a separate
# `count()` pass and a boolean-mask assignment are not needed.
expected_count = downsample_periods
downsampled = timeseries.resample(period_str).sum(min_count=expected_count)

downsampled.to_pickle("proj5_ex03.pkl")

In [7]:
# Make sure SciPy is importable, installing it into the running interpreter's
# environment if it is missing.
# NOTE(review): presumably needed for the scipy-backed interpolation methods
# used by the upsampling cell - confirm; a pinned environment would be
# preferable to installing at run time.
try:
    import scipy
except ImportError:
    # Only a missing/unimportable package warrants an install attempt; any
    # other exception is a real error and should surface unchanged.
    import importlib
    import subprocess
    import sys

    # Use the kernel's own interpreter so pip targets the right environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "scipy"])
    # The import system caches directory listings; refresh them so the
    # freshly installed package is found by the retry below.
    importlib.invalidate_caches()
    import scipy

In [8]:
# Resampling rule for the finer grid, e.g. periods=6 and units="h" give "6h".
rule = f'{params["upsample_periods"]}{params["upsample_units"]}'

# Put the series on the finer grid; grid points without an original
# observation are NaN until interpolated below.
upsampled = timeseries.resample(rule).asfreq()

# pandas requires an explicit `order` for both "polynomial" and "spline";
# every other method is called with order=None, as before.
interpolation_method = params["interpolation"]
interpolation_order = (
    params["interpolation_order"]
    if interpolation_method in ("polynomial", "spline")
    else None
)
upsampled_interpolated = upsampled.interpolate(
    method=interpolation_method,
    order=interpolation_order,
)

# Rescale numeric values by (new step / original step).
# NOTE(review): Timedelta(1, unit=...) assumes params["original_frequency"]
# is a bare unit such as "D" or "h", not a multiple like "15min" - confirm.
original_td = pd.Timedelta(1, unit=params["original_frequency"])
new_td = pd.Timedelta(params["upsample_periods"], unit=params["upsample_units"])
scaling_factor = new_td / original_td

numeric_cols = upsampled_interpolated.select_dtypes(include='number').columns
upsampled_interpolated[numeric_cols] *= scaling_factor

upsampled_interpolated.to_pickle("proj5_ex04.pkl")

In [53]:
# Target grid step for the sensor readings, e.g. periods=10 and units="s"
# give "10s".
sensor = f"{params['sensors_periods']}{params['sensors_units']}"

# Long-format readings become one column per device_id, holding 'value',
# with rows keyed by the frame's existing index.
readings = pd.read_pickle('proj5_sensors.pkl')
per_device = readings.pivot(columns='device_id', values='value')

# Regular grid spanning the first to the last reading, both rounded to the
# target step.
rounded_index = per_device.index.round(sensor)
freq = pd.date_range(rounded_index.min(), rounded_index.max(), freq=sensor)

# Interpolate on the union of original and grid timestamps so the grid
# points get values, then keep only the grid rows and drop any row that
# still has a gap.
# NOTE(review): method='linear' ignores the index and treats rows as equally
# spaced; method='time' would weight by the actual timestamp gaps - confirm
# which is intended.
combined_index = freq.union(per_device.index)
wide_df = (
    per_device
    .reindex(combined_index)
    .interpolate(method='linear')
    .reindex(freq)
    .dropna()
)

wide_df.to_pickle("proj5_ex05.pkl")