In [None]:
import pandas as pd
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
import numpy as np
from scipy import interpolate

# %matplotlib widget
%matplotlib inline

In [None]:
# Base name shared by the two parquet files this notebook reads.
FILE_NAME = 'all'
OUTPUT_DATA_FILE = FILE_NAME + '_data.parquet'
OUTPUT_METADATA_FILE = FILE_NAME + '_meta.parquet'

# Placeholder value; WINDOW_SIZE is reassigned in a later cell before its first use.
WINDOW_SIZE = 4

In [None]:
# Load the main table and its companion metadata table from the parquet files
# named by the configuration constants.
data = pd.read_parquet(OUTPUT_DATA_FILE)
metadata = pd.read_parquet(OUTPUT_METADATA_FILE)

In [None]:
# Show the loaded data table using the notebook's rich DataFrame repr.
data

In [None]:
# Show the loaded metadata table using the notebook's rich DataFrame repr.
metadata

In [None]:
# ax = data.plot.hist(by='creation_datetime')
# Convert every creation timestamp to numpy.datetime64; the resulting list is
# what the histogram cell bins.
datetimes = [stamp.to_datetime64() for stamp in data['creation_datetime']]

In [None]:
# Histogram of creation timestamps (64 bins), drawn in xkcd style, written to
# hist.png and then shown inline.
with plt.xkcd(scale=0.5):
    fig = plt.figure(figsize=(1920/80, 1080/80))
    # `plot` keeps the (counts, bin_edges, patches) tuple returned by hist().
    plot = fig.gca().hist(datetimes, bins=64)
    fig.savefig('hist.png')
    plt.show()

In [None]:
# Ten most frequent values of the `author` column (value_counts sorts by
# descending count, head keeps the first ten rows).
data['author'].value_counts().head(10)

In [None]:
# Ten most frequent values of the `author_id` column (value_counts sorts by
# descending count, head keeps the first ten rows).
data['author_id'].value_counts().head(10)

In [None]:
# Print column names, dtypes, non-null counts and memory usage for `data`.
data.info()

In [None]:
# Print column names, dtypes, non-null counts and memory usage for `metadata`.
metadata.info()

In [None]:
# import numpy as np
# # Generating a sample of 10000 timestamps and selecting 500 to randomize them
# df = pd.DataFrame(np.random.choice(pd.date_range(start=pd.to_datetime('2015-01-14'),periods = 10000, freq='S'), 500),  columns=['date'])
# # Setting the date as the index since pd.Grouper(freq=...) groups on the index by default; the date column is not dropped so that it can still be counted
# df.set_index('date', drop=False, inplace=True)
# # Getting the histogram (pd.TimeGrouper has been removed from pandas; pd.Grouper is its replacement)
# df.groupby(pd.Grouper(freq='10Min')).count().plot(kind='bar')

In [None]:
# plt.plot(data['creation_datetime'].sort_values().diff()[1:].dt.total_seconds())
# plt.show()

In [None]:
# Size the rolling window in samples: the number of requests expected in one
# week at the assumed average request rate.
AVG_REQ_PERSEC = 0.06
WINDOW_LENGTH_SEC = 7 * 24 * 60 * 60
WINDOW_SIZE = int(round(WINDOW_LENGTH_SEC * AVG_REQ_PERSEC))

# print(WINDOW_SIZE)

# Creation timestamps in ascending order, re-indexed 0..N-1.
sorted_datetimes = data['creation_datetime'].sort_values(ignore_index=True)

# Seconds between consecutive timestamps, smoothed with a Blackman-Harris
# weighted rolling mean. The leading NaN gap is dropped before smoothing and
# the first WINDOW_SIZE smoothed values are dropped afterwards, so entry k
# lines up with sorted_datetimes[WINDOW_SIZE + 1 + k].
rolling_sec_per_req = (
    sorted_datetimes
    .diff()
    .iloc[1:]
    .dt.total_seconds()
    .rolling(WINDOW_SIZE, win_type='blackmanharris')
    .mean()
    .iloc[WINDOW_SIZE:]
    .to_numpy()
)

# Reciprocal of the smoothed gap: requests per second.
rolling_req_per_sec = 1.0 / rolling_sec_per_req

In [None]:
# Totals of the two smoothed series.
for label, series in (('rolling_sec_per_req', rolling_sec_per_req),
                      ('rolling_req_per_sec', rolling_req_per_sec)):
    print(f'sum {label}: {np.sum(series)}')

# Overall span of the data and the resulting average message rate.
timeline = sorted_datetimes.to_numpy()
earliest, latest = timeline[0], timeline[-1]
print(f'From {earliest} to {latest}')

# Span expressed as a float number of seconds.
timespan = (latest - earliest) / np.timedelta64(1, 's')
message_count = len(sorted_datetimes)
print(f'Total time: {timespan}')
print(f'Total messages: {message_count}')
print(f'Avg msg/sec: {message_count/timespan}')

In [None]:
# Plot the smoothed request rate over time in xkcd style, save it to test.png
# and show it inline.
with plt.xkcd(scale=0.5):
    event_times = sorted_datetimes.to_numpy()

    fig = plt.figure(figsize=(3840/80, 2160/80))
    ax = fig.gca()
    # The rate series starts WINDOW_SIZE + 1 samples into the timeline.
    ax.plot(event_times[WINDOW_SIZE+1:], rolling_req_per_sec)
    ax.set_ylim(0, np.max(rolling_req_per_sec))
    ax.set_xlim(np.min(event_times[WINDOW_SIZE:]), np.max(event_times[WINDOW_SIZE:]))

    fig.savefig('test.png')
    plt.show()

In [None]:
# Resample the irregular inter-arrival times onto a uniform grid with one
# point every OUTPUT_TS seconds, measured from the first timestamp.
OUTPUT_TS = 15*60.0

total_seconds = (sorted_datetimes.iloc[-1] - sorted_datetimes.iloc[0]).total_seconds()

# Seconds elapsed since the first timestamp, one entry per row (kept as a
# plain list so later cells can index and display it as before).
input_points = (sorted_datetimes - sorted_datetimes.iloc[0]).dt.total_seconds().tolist()
# Uniform grid offsets in seconds: 0, OUTPUT_TS, 2*OUTPUT_TS, ...
# [(sorted_datetimes[0] + pd.Timedelta(pd.offsets.Minute(i))).total_seconds() for i in range(total_minutes)]
output_points = [i*OUTPUT_TS for i in range(int(round(total_seconds/OUTPUT_TS)))]
# Gap to the previous timestamp in seconds. Entry 0 is NaN because the first
# row has no predecessor.
input_data = sorted_datetimes.diff().dt.total_seconds()

# Interpolate from the second sample onward. Feeding the leading NaN to
# np.interp makes every grid point that falls before the second timestamp NaN,
# which then propagates through the rolling mean and np.max downstream. With
# the NaN skipped, grid points before the second timestamp take the first real
# gap (np.interp clamps to the first fp value on the left).
if len(input_points) > 1:
    output_data = np.interp(output_points[1:], input_points[1:], input_data.iloc[1:].to_numpy())
else:
    # A single timestamp has no gaps to interpolate; keep an all-NaN result of
    # the right length instead of letting np.interp fail on empty sample points.
    output_data = np.full(len(output_points[1:]), np.nan)
# output_data = interpolate.interp1d(input_points, input_data, kind='cubic')(output_points[1:])
# output_data = interpolate.UnivariateSpline(input_points, input_data)(output_points[1:])

# Wall-clock timestamps of the grid points that carry interpolated values.
start_datetime = sorted_datetimes.iloc[0].to_pydatetime()
output_timestamps = [start_datetime + timedelta(seconds=i) for i in output_points[1:]]

In [None]:
# Number of uniform grid samples in a 30-day smoothing window.
WINDOW_LENGTH_SEC = 30*24*60*60
WINDOW_SIZE_UNIFORM = int(round(WINDOW_LENGTH_SEC/OUTPUT_TS))
print(WINDOW_SIZE_UNIFORM)

# Instantaneous rate (1 / interpolated gap), smoothed with a Blackman-Harris
# weighted rolling mean over the uniform grid.
output_rate = (
    pd.Series(1.0/output_data)
    .rolling(WINDOW_SIZE_UNIFORM, win_type='blackmanharris')
    .mean()
    .to_numpy()
)

with plt.xkcd(scale=0.5):
    fig = plt.figure(figsize=(1920/80, 1080/80))
    ax = fig.gca()

    # Only samples from index WINDOW_SIZE_UNIFORM onward are drawn.
    smoothed_tail = output_rate[WINDOW_SIZE_UNIFORM:]
    ax.plot(output_timestamps[WINDOW_SIZE_UNIFORM:], smoothed_tail)

    ax.set_ylim(0, np.max(smoothed_tail))
    ax.set_xlim(output_timestamps[0], output_timestamps[-1])

    ax.set_title(f'Averaged Smoothed Message Rate Over History ({WINDOW_LENGTH_SEC} sec window)')
    ax.set_xlabel('Datetime')
    ax.set_ylabel('msg/sec (avg)')

    fig.savefig('test.png')
    plt.show()

In [None]:
# NOTE(review): `inp` is not assigned in any cell visible in this notebook, so
# this raises NameError on a fresh kernel unless it is defined elsewhere —
# confirm which array was meant, or remove the cell.
inp.shape

In [None]:
# Elapsed seconds at position 29157 of input_points.
# NOTE(review): the index is hard-coded; it raises IndexError on shorter data — confirm why this position matters.
input_points[29157]

In [None]:
# Elapsed seconds at position 29158 of input_points (the entry after 29157).
# NOTE(review): the index is hard-coded; it raises IndexError on shorter data — confirm why this position matters.
input_points[29158]

In [None]:
# Rebuild the uniform grid offsets with a 60-second step. Only OUTPUT_TS and
# output_points are rebound here; output_data and output_timestamps are not
# recomputed in this cell.
OUTPUT_TS = 60.0

grid_length = int(round(total_seconds / OUTPUT_TS))
output_points = [step * OUTPUT_TS for step in range(grid_length)]

In [None]:
# NOTE(review): `tmp` is not assigned in any cell visible in this notebook, so
# this raises NameError on a fresh kernel unless it is defined elsewhere —
# confirm which object was meant, or remove the cell.
tmp[1:]

In [None]:
# Number of elapsed-seconds entries (one per timestamp in sorted_datetimes).
len(input_points)

In [None]:
# Check which type results from adding a two-minute pandas Timedelta to the
# first timestamp.
type(sorted_datetimes[0] + pd.Timedelta(pd.offsets.Minute(2)))

In [None]:
# The index range used to build output_points, evaluated with the current
# OUTPUT_TS; its repr shows how many grid points there are.
range(int(round(total_seconds/OUTPUT_TS)))

In [None]:
# Preview the first ten elapsed-seconds values. input_points is a plain Python
# list with one entry per timestamp, so displaying it whole would print every
# element.
input_points[:10]

In [None]:
# Preview the first ten uniform grid offsets (seconds). output_points is a
# plain Python list, so displaying it whole would print every element.
output_points[:10]

In [None]:
# First ten timestamps of the uniform output grid.
output_timestamps[:10]

In [None]:
# Last ten timestamps of the uniform output grid.
output_timestamps[-10:]

In [None]:
# Latest creation timestamp (last element of the ascending series).
sorted_datetimes.iloc[-1]

In [None]:
# Earliest creation timestamp (first element of the ascending series).
sorted_datetimes.iloc[0]

In [None]:
# Last ten uniform grid offsets, in seconds from the first timestamp.
output_points[-10:]

In [None]:
# for i in range(100):
#     print(data.content[i])
# Print ten randomly sampled `content` values, numbered from 1.
for num, i in enumerate(data.content.sample(n=10), start=1):
    print(f'{num} - {i}')