In [3]:
import multiprocessing
import timeit

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from sktime.datatypes._series._check import (
    _index_equally_spaced,
    check_pddataframe_series,
    is_in_valid_index_types,
)
from sktime.datatypes._utilities import get_time_index
from sktime.utils.validation.series import is_integer_index


def tmpFunc(df):
    df["c"] = df.a + df.b
    return df


def applyParallel(dfGrouped, func):
    retLst = Parallel(n_jobs=4)(delayed(func)(group) for name, group in dfGrouped)
    return retLst


def np_all_old_df(X):
    diffs = np.diff(X)
    all_equal = np.all(diffs == diffs[0])
    return all_equal


obj_raw = pd.read_pickle("data.pickle")

obj_1 = obj_raw.reset_index()

obj_list = [obj_1, obj_1, obj_1, obj_1, obj_1, obj_1, obj_1, obj_1, obj_1]

for index, df in enumerate(obj_list):
    df = obj_list[index].copy()
    df["time_series"] = df["time_series"] + "flag" + str(index)
    obj_list[index] = df

obj = pd.concat(obj_list)
obj["date"] = obj["date"].dt.to_timestamp()
obj_indexed = obj.set_index(["time_series", "date"])

# obj_indexed.reset_index().groupby("time_series", as_index=False,group_keys=False).apply(lambda x: [check_pddataframe_series(x.set_index("date"), return_metadata = True)])

inst_inds = obj_indexed.index.get_level_values(0).unique()
inst_inds = np.unique(obj_indexed.index.get_level_values(0))

datetime_groupby = timeit.timeit(
    lambda: obj.groupby("time_series", as_index=False)["date"].apply(
        lambda x: np_all_old_df(x)
    ),
    number=1,
)
datetime_parallel = timeit.timeit(
    lambda: applyParallel(
        obj.groupby("time_series", as_index=False)["date"], np_all_old_df
    ),
    number=1,
)
datetime_list = timeit.timeit(
    lambda: [np_all_old_df(obj_indexed.loc[i].index) for i in inst_inds], number=1
)

datetime_df_groupby_1 = timeit.timeit(
    lambda: obj_indexed.groupby(
        level="time_series", group_keys=False, as_index=True
    ).apply(lambda df: check_pddataframe_series(df.droplevel(0), return_metadata=True)),
    number=1,
)
datetime_df_groupby_2 = timeit.timeit(
    lambda: obj_indexed.reset_index(-1)
    .groupby(level="time_series")
    .apply(
        lambda df: pd.DataFrame(
            check_pddataframe_series(
                df.set_index(obj_indexed.index.names[-1]), return_metadata=True
            )
        )
    ),
    number=1,
)
datetime_df_list = timeit.timeit(
    lambda: [
        check_pddataframe_series(obj_indexed.loc[i], return_metadata=True)
        for i in inst_inds
    ],
    number=1,
)


import polars as pl

objpl = pl.DataFrame(obj)
polars_test = timeit.timeit(
    lambda: objpl.groupby("time_series")
    .agg(pl.col("date").diff(null_behavior="drop").unique().len())
    .filter(pl.col("date") != 1),
    number=1,
)

a = 0

# timeit.timeit(lambda: obj_indexed.groupby(level="time_series", as_index=False).apply(lambda df: np_all_new(df.index.get_level_values(-1))), number =1)

obj = pd.concat(obj_list)
obj_indexed = obj.set_index(["time_series", "date"])

# obj_indexed.reset_index().groupby("time_series", as_index=False,group_keys=False).apply(lambda x: [check_pddataframe_series(x.set_index("date"), return_metadata = True)])

inst_inds = obj_indexed.index.get_level_values(0).unique()
inst_inds = np.unique(obj_indexed.index.get_level_values(0))

period_groupby = timeit.timeit(
    lambda: obj.groupby("time_series", as_index=False)["date"].apply(
        lambda x: np_all_old_df(x)
    ),
    number=1,
)
period_parallel = timeit.timeit(
    lambda: applyParallel(
        obj.groupby("time_series", as_index=False)["date"], np_all_old_df
    ),
    number=1,
)
period_list = timeit.timeit(
    lambda: [np_all_old_df(obj_indexed.loc[i].index) for i in inst_inds], number=1
)


period_df_groupby_1 = timeit.timeit(
    lambda: [
        check_pddataframe_series(obj_indexed.loc[i], return_metadata=True)
        for i in inst_inds
    ],
    number=1,
)
period_df_groupby_2 = timeit.timeit(
    lambda: obj_indexed.reset_index(-1)
    .groupby(level="time_series")
    .apply(
        lambda df: pd.DataFrame(
            check_pddataframe_series(
                df.set_index(obj_indexed.index.names[-1]), return_metadata=True
            )
        )
    ),
    number=1,
)
period_df_list = timeit.timeit(
    lambda: obj_indexed.groupby(
        level="time_series", group_keys=False, as_index=True
    ).apply(lambda df: check_pddataframe_series(df.droplevel(0), return_metadata=True)),
    number=1,
)

In [4]:
# diff check only with datetimeindex
print(datetime_groupby)
print(datetime_parallel)
print(datetime_list)

0.9469833999755792
2.2422068000014406
20.45972859999165


In [5]:
# diff check only with periodindex
print(period_groupby)
print(period_parallel)
print(period_list)

85.40576159997727
27.369318099983502
98.94617319997633


In [9]:
# full check_pddataframe_series check only  with datetimeindex
print(datetime_df_groupby_1)
print(datetime_df_groupby_2)
print(datetime_df_list)


2.3897417999978643
3.557807600009255
21.432788200007053


In [7]:
# full check_pddataframe_series with periodindex
print(period_df_groupby_1)
print(period_df_groupby_1)
print(period_df_list)

103.6553393000213
103.6553393000213
83.65653779997956


In [8]:
# diff check only with polars library
print(polars_test)

0.17094579999684356
