In [155]:
import csv
import datetime
import math
import pandas as pd
import random

In [144]:
# Sample series on 2020-10-01 with a nominal 1-hour resolution.
# The hours 14, 16, 17 and 19 are absent, so the series contains gaps.
dates = [
    datetime.datetime(2020, 10, 1, hour, 0)
    for hour in (10, 11, 12, 13, 15, 18, 20, 21)
]

# One reading per timestamp in `dates`, in the same order.
values = [0.312, 0.121, 0.1372, 0.73221, 0.17, 0.18281, 0.12, 0.1727]

In [130]:
# Sample series with a nominal 1-hour resolution that crosses midnight
# (the "swapped" variant): three late-evening points on 2020-10-01 followed by
# five points on 2020-10-02, with gaps of several hours in between.
dates = [
    datetime.datetime(2020, 10, day, hour, 0)
    for day, hour in (
        (1, 21), (1, 22), (1, 23),
        (2, 7), (2, 8), (2, 10), (2, 12), (2, 15),
    )
]

# One reading per timestamp in `dates`, in the same order.
values = [0.312, 0.121, 0.1372, 0.73221, 0.17, 0.18281, 0.12, 0.1727]

In [82]:
# Sample series on 2020-10-01 between 10:10 and 10:21 with a nominal
# 1-minute resolution; minutes 14, 16, 17 and 19 are absent.
dates = [
    datetime.datetime(2020, 10, 1, 10, minute, 0)
    for minute in (10, 11, 12, 13, 15, 18, 20, 21)
]

# One reading per timestamp in `dates`, in the same order.
values = [0.312, 0.121, 0.1372, 0.73221, 0.17, 0.18281, 0.12, 0.1727]

In [154]:
# Sample series starting at 2020-10-01 10:10:00 with a nominal 10-second
# resolution. Timestamps are written as second offsets from the first point;
# the jumps of more than 10 seconds are the gaps in the series.
dates = [
    datetime.datetime(2020, 10, 1, 10, 10, 0) + datetime.timedelta(seconds=offset)
    for offset in (0, 10, 20, 30, 50, 80, 100, 130)
]

# One reading per timestamp in `dates`, in the same order.
values = [0.312, 0.121, 0.1372, 0.73221, 0.17, 0.18281, 0.12, 0.1727]

In [161]:
def generate_rnd_dataset(start, end, resolution, min_step, max_step, seed=None):
    """Generate a random time series with irregular gaps.

    The first two points are always exactly one ``resolution`` apart. Every
    later point follows its predecessor by a random whole number of
    ``resolution`` steps drawn uniformly from ``[min_step, max_step]``.

    Args:
        start: Timestamp of the first data point.
        end: Exclusive upper bound; no generated timestamp reaches ``end``.
        resolution: Base spacing of the series (e.g. a ``timedelta``).
        min_step: Smallest gap between points, in multiples of ``resolution``.
        max_step: Largest gap between points, in multiples of ``resolution``.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the result is reproducible and the global random state is
            left untouched. When ``None`` the module-level generator is used.

    Returns:
        A ``(dates, values)`` tuple of equally long lists; each value is a
        random float in ``[0.0, 1.0)``.

    Raises:
        ValueError: If ``min_step < 1``, ``max_step < min_step`` or
            ``resolution`` does not move timestamps forward.
    """
    # A step of zero would emit duplicate timestamps (or never terminate), and
    # a non-positive resolution would never reach `end`.
    if min_step < 1:
        raise ValueError("min_step must be at least 1")
    if max_step < min_step:
        raise ValueError("max_step must not be smaller than min_step")
    if not start + resolution > start:
        raise ValueError("resolution must be positive")

    rng = random if seed is None else random.Random(seed)

    dates = []
    values = []

    curr = start
    first = True

    while curr < end:
        dates.append(curr)
        values.append(rng.random())

        if first:
            # Keep the first gap at exactly one resolution step: the spacing of
            # the first two points defines the series' base resolution.
            step = 1
            first = False
        else:
            step = rng.randint(min_step, max_step)

        curr += resolution * step

    return (dates, values)

In [162]:
# Two years of random data (2020-01-01 up to, but excluding, 2022-01-01) on an
# hourly grid, with gaps of 1 to 10 hours between consecutive points.
dates, values = generate_rnd_dataset(
    start=datetime.datetime(2020, 1, 1),
    end=datetime.datetime(2022, 1, 1),
    resolution=datetime.timedelta(hours=1),
    min_step=1,
    max_step=10,
)

In [163]:
# Wrap the samples in a Series indexed by timestamp; despite its name,
# `input_df` is a pandas Series, not a DataFrame.
input_df = pd.Series(values, index=pd.to_datetime(dates))

# Requested output window: one value per minute for the whole of 2021-01-01
# (the end bound is the following midnight).
resolution = datetime.timedelta(seconds=60)
start_date = datetime.datetime(2021, 1, 1, 0, 0)
end_date = datetime.datetime(2021, 1, 2, 0, 0)

In [165]:
def export_csv_2_col(data, file_name):
    """Write ``(date, value)`` pairs to a semicolon-delimited CSV file.

    Args:
        data: Iterable of two-element rows; each row is unpacked into a date
            and a value and written as one CSV line.
        file_name: Path of the file to create (an existing file is
            overwritten).
    """
    # newline="" hands line-ending control to the csv module. Without it the
    # writer's "\r\n" is translated again on Windows, producing a blank line
    # after every row.
    with open(file_name, "w", newline="") as f:
        writer = csv.writer(f, delimiter=";")
        writer.writerows([date, value] for date, value in data)

def export_csv(data, file_name):
    """Write a sequence of values to a single-column CSV file.

    Args:
        data: Iterable of values; each one becomes its own CSV row.
        file_name: Path of the file to create (an existing file is
            overwritten).
    """
    # newline="" hands line-ending control to the csv module. Without it the
    # writer's "\r\n" is translated again on Windows, producing a blank line
    # after every row.
    with open(file_name, "w", newline="") as f:
        writer = csv.writer(f, delimiter=";")
        writer.writerows([value] for value in data)

def _fill_gaps(points, step_size):
    """Linearly interpolate samples that are missing from a regular series.

    Args:
        points: Chronologically ordered ``(timestamp, value)`` tuples with at
            least two entries.
        step_size: Expected spacing between neighbouring samples.

    Returns:
        A new list with every input point plus one interpolated point for each
        missing multiple of ``step_size``.

    Raises:
        ValueError: If timestamps are not strictly increasing, or a gap is not
            a whole multiple of ``step_size``.
    """
    # total_seconds() is used throughout: `.seconds` silently drops whole days,
    # which breaks any gap (or resolution) of 24 hours or more.
    step_seconds = step_size.total_seconds()
    if step_seconds <= 0:
        raise ValueError("input data must be sorted by strictly increasing date")

    filled = [points[0], points[1]]
    for prev, curr in zip(points[1:], points[2:]):
        gap_seconds = (curr[0] - prev[0]).total_seconds()
        if gap_seconds <= 0:
            raise ValueError("input data must be sorted by strictly increasing date")

        steps, remainder = divmod(gap_seconds, step_seconds)
        if remainder != 0:
            raise ValueError("inconsistent distance between data points")

        steps = int(steps)
        value_step = (curr[1] - prev[1]) / steps
        # steps == 1 means the points are already adjacent: nothing to insert
        for j in range(1, steps):
            filled.append((prev[0] + (j * step_size), prev[1] + (j * value_step)))

        filled.append(curr)

    return filled


def _to_minute_resolution(points, resolution_seconds):
    """Convert a gap-free series to one sample per minute.

    Coarser series are linearly interpolated, finer series are averaged per
    minute, and series that already have minute resolution are returned as is.

    Args:
        points: Gap-free ``(timestamp, value)`` tuples (at least two).
        resolution_seconds: Spacing of ``points`` in seconds.

    Returns:
        List of ``(timestamp, value)`` tuples at minute resolution.
    """
    if resolution_seconds > 60:
        sub_steps = math.floor(resolution_seconds / 60)
        one_minute = datetime.timedelta(seconds=60)

        minute_points = []
        for (prev_date, prev_value), (_, next_value) in zip(points, points[1:]):
            value_step = (next_value - prev_value) / sub_steps

            minute_points.append((prev_date, prev_value))
            for k in range(1, sub_steps):
                minute_points.append((prev_date + (k * one_minute), prev_value + (k * value_step)))

        # every segment above is half-open, so the final sample has to be
        # added explicitly or it would be lost
        minute_points.append(points[-1])
        return minute_points

    if resolution_seconds < 60:
        minute_points = []
        bucket_start, bucket_sum = points[0]
        bucket_count = 1
        prev_date = bucket_start

        for date, value in points[1:]:
            # neighbouring samples are less than a minute apart, so a changed
            # minute field means a new minute bucket has started
            if date.minute != prev_date.minute:
                minute_points.append((bucket_start, bucket_sum / bucket_count))
                bucket_start, bucket_sum, bucket_count = date, 0, 0

            bucket_sum += value
            bucket_count += 1
            prev_date = date

        # The open bucket always holds at least one sample. Flush it no matter
        # what its sum is, otherwise zero or negative readings are dropped.
        minute_points.append((bucket_start, bucket_sum / bucket_count))
        return minute_points

    return points


def _average_by_weekday_month(points):
    """Build an average day profile for every ``(weekday, month)`` pair.

    Args:
        points: ``(timestamp, value)`` tuples at minute resolution.

    Returns:
        Dict mapping ``(weekday, month)`` to a ``(averages, counts)`` tuple of
        two lists with one slot per minute of the day (24 * 60 entries).
        Minutes without any sample keep the value 0.
    """
    minutes_per_day = 24 * 60
    profiles = dict()

    for date, value in points:
        key = (date.weekday(), date.month)
        if key not in profiles:
            profiles[key] = ([0] * minutes_per_day, [0] * minutes_per_day)

        sums, counts = profiles[key]
        minute_of_day = date.hour * 60 + date.minute
        sums[minute_of_day] += value
        counts[minute_of_day] += 1

    for sums, counts in profiles.values():
        for minute, count in enumerate(counts):
            # with zero or one sample the stored sum already is the average
            if count > 1:
                sums[minute] /= count

    return profiles


def _closest_profile(profiles, weekday, month):
    """Return the profile for ``(weekday, month)`` or the nearest available one.

    Distance is ``|weekday difference| + |month difference|``; on a tie the
    profile that was inserted first wins.
    NOTE(review): the distance is not circular (December and January count as
    11 months apart, Sunday and Monday as 6 days apart) - confirm this is
    intended.
    """
    exact = profiles.get((weekday, month))
    if exact is not None:
        return exact

    closest_key = min(profiles, key=lambda key: abs(key[0] - weekday) + abs(key[1] - month))
    return profiles[closest_key]


def transform_data(input_df, resolution, start_date, end_date,
                   minute_res_path="./input_data_minute_res.csv",
                   result_path="./result.csv"):
    """Project a measured time series onto an arbitrary target time frame.

    The input is gap-filled by linear interpolation, converted to minute
    resolution and averaged into one day profile per ``(weekday, month)`` pair.
    For every output timestamp the value is read from the profile of its own
    ``(weekday, month)`` pair, or from the closest available pair.

    Args:
        input_df: pandas Series of values indexed by strictly increasing
            timestamps. The spacing of the first two entries defines the input
            resolution; every later gap must be a whole multiple of it.
        resolution: Output step as a ``timedelta``; must be positive and at
            most 60 seconds.
        start_date: First output timestamp (inclusive).
        end_date: End of the output time frame (exclusive).
        minute_res_path: CSV path for the minute-resolution input data, or
            ``None`` to skip that export.
        result_path: CSV path for the result, or ``None`` to skip that export.

    Returns:
        List of ``(timestamp, value)`` tuples, one per output step. This is the
        same data that is written to ``result_path``.

    Raises:
        AssertionError: If fewer than two data points are given or
            ``resolution`` is out of range.
        ValueError: If the input timestamps are not strictly increasing or are
            spaced inconsistently.
    """
    assert len(input_df) >= 2, "must provide at least two datapoints"
    assert resolution.total_seconds() > 0, "resolution must be positive"
    assert resolution.total_seconds() <= 60, "resolutions lower than one minute not supported"

    # convert to a list of dates and corresponding values
    # (Series.iteritems() was removed in pandas 2.0; items() works everywhere)
    input_data = list(input_df.items())

    # the spacing of the first two points defines the input resolution
    input_resolution = input_data[1][0] - input_data[0][0]

    interp_input_data = _fill_gaps(input_data, input_resolution)
    input_data_minute_res = _to_minute_resolution(interp_input_data, input_resolution.total_seconds())

    if minute_res_path is not None:
        export_csv_2_col(input_data_minute_res, minute_res_path)

    # averages for every unique (weekday, month) pair that we have available
    available_datapoints = _average_by_weekday_month(input_data_minute_res)

    # build the result by finding the closest (weekday, month) pair for every required day
    result_data = []

    curr_date = start_date
    curr_day_data = None
    prev_weekday = None

    while curr_date < end_date:
        curr_weekday = curr_date.weekday()
        # the output step is at most one minute, so the weekday changes
        # exactly when a new calendar day begins
        if curr_weekday != prev_weekday:
            prev_weekday = curr_weekday
            curr_day_data = _closest_profile(available_datapoints, curr_weekday, curr_date.month)

        minute_of_day = curr_date.hour * 60 + curr_date.minute
        result_data.append((curr_date, curr_day_data[0][minute_of_day]))

        curr_date += resolution

    if result_path is not None:
        export_csv_2_col(result_data, result_path)

    return result_data

# Run the transformation for the configured window; the results are written to
# CSV files by transform_data. The trailing semicolon keeps the notebook from
# echoing whatever the call returns.
transform_data(
    input_df=input_df,
    resolution=resolution,
    start_date=start_date,
    end_date=end_date,
);