In [None]:
# Data source: Google Takeout export — https://takeout.google.com/

In [None]:
import pandas as pd

from colassigner import get_all_cols
from infostop import Infostop
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KDTree

from export_data import dump_data, parse_ping_df, parse_sem_df
from stops import DaySetup, PingFeatures, proc_device_pings, StopFeatures

In [None]:
# dump_data("takeout-20220326T104444Z-001.zip")

In [None]:
# Load the raw location pings from parquet and show the (rows, columns) shape.
# The former argument-less `.assign()` only made a redundant copy of the frame.
raw_ping_df = pd.read_parquet("pings.parquet")
raw_ping_df.shape

In [None]:
# Preview only the columns in which fewer than half of the values are missing.
raw_ping_df.loc[:, raw_ping_df.isna().mean().lt(0.5)].head()

In [None]:
# Load the semantic-location table from parquet and show its (rows, columns) shape.
raw_semantic_df = pd.read_parquet("semantic.parquet")
raw_semantic_df.shape

In [None]:
# Transposed preview: the first five rows, one column per row.
raw_semantic_df.head().T

In [None]:
# Parse both raw tables; the semantic rows are sorted by "startTimestamp" before parsing.
semantic_df = parse_sem_df(raw_semantic_df.sort_values("startTimestamp"))
ping_df = parse_ping_df(raw_ping_df)

In [None]:
# Transposed preview of the first five parsed semantic rows.
semantic_df.head().T

In [None]:
# Preview of the first five parsed pings.
ping_df.head()

In [None]:
# Summary statistics of the differences between consecutive ping timestamps (in row order).
ping_df[PingFeatures.datetime].diff().describe()

In [None]:
def get_career(df):
    """Summarise the home and work places found in a stop table.

    Keeps only the rows flagged as home or work, groups them by the work flag
    and the destination label, and aggregates each group into one row.

    Parameters
    ----------
    df : pandas.DataFrame
        Stop table holding the ``StopFeatures`` columns ``is_home``,
        ``is_work``, ``interval.start``, ``destination_label`` and ``info``.

    Returns
    -------
    pandas.DataFrame
        One row per (work flag, destination label) pair with the work-flag
        column plus ``place`` (first ``info`` value of the group), ``start``
        and ``end`` (smallest / largest start date as a string) and ``count``
        (number of stops), sorted by the work flag and then by ``end``.
    """
    home_or_work = df[[StopFeatures.is_home, StopFeatures.is_work]].any(axis=1)
    return (
        df.loc[home_or_work, :]
        # The first 10 characters of the stringified start value; for
        # datetime values this is the YYYY-MM-DD date part.
        .assign(
            sdate=lambda stops: stops[StopFeatures.interval.start].astype(str).str[:10]
        )
        .groupby([StopFeatures.is_work, StopFeatures.destination_label])
        .agg(
            place=pd.NamedAgg(StopFeatures.info, "first"),
            start=pd.NamedAgg("sdate", "min"),
            end=pd.NamedAgg("sdate", "max"),
            count=pd.NamedAgg("sdate", "count"),
        )
        .reset_index()
        .drop(StopFeatures.destination_label, axis=1)
        # Sort on the same column key that was used for grouping rather than
        # on a hard-coded "is_work" literal.
        .sort_values([StopFeatures.is_work, "end"])
    )

In [None]:
# Home/work summary computed from the parsed semantic table.
get_career(semantic_df)

In [None]:
# Semantic rows whose place description contains "Pilis" (rows without a description are dropped first).
(
    semantic_df.dropna(subset=[StopFeatures.info])
    .loc[lambda stops: stops[StopFeatures.info].str.contains("Pilis"), :]
)

In [None]:
# Monthly visit count and total days spent at one destination label, plotted as lines.
# NOTE(review): pandas >= 2.2 deprecates the "M" resample alias in favour of "ME";
# switch once the minimum supported pandas version allows it.
(
    semantic_df.loc[
        semantic_df[StopFeatures.destination_label].eq("ChIJi_oEmDN6akcRb4fM_7BFGR8"),
        :,
    ]
    .assign(
        count=1,
        # Stay length: seconds -> hours -> days.
        days=lambda visits: visits[StopFeatures.interval.end]
        .sub(visits[StopFeatures.interval.start])
        .dt.total_seconds()
        .div(60 ** 2)
        .div(24),
    )
    .set_index(StopFeatures.interval.start)
    .resample("1M")[["count", "days"]]
    .sum()
    .plot(figsize=(14, 7))
)

In [None]:
# Training subset: pings from 2016 whose location columns all lie strictly inside (-90, 90).
# NOTE(review): the same ±90 bound is applied to every column returned by
# get_all_cols(PingFeatures.loc); if that includes a longitude, valid values
# beyond ±90 are discarded as well — confirm this is intended.
train_ping_df = ping_df.loc[
    lambda pings: (pings[PingFeatures.datetime].dt.year == 2016)
    & (pings[get_all_cols(PingFeatures.loc)].abs() < 90).all(axis=1),
    :,
]

In [None]:
# Infostop parameters.
# NOTE(review): distances are presumably metres and durations seconds —
# confirm against the Infostop documentation.
r1, r2 = 40, 120
min_staying_time = 5 * 60
max_time_between = 24 * 60 * 60
min_size = 2

# Hours of the day passed to DaySetup.
work_start, work_end = 9, 17
home_arrive, home_depart = 20, 8

model = Infostop(
    r1=r1,
    r2=r2,
    min_staying_time=min_staying_time,
    max_time_between=max_time_between,
    min_size=min_size,
)

dayconf = DaySetup(work_start, work_end, home_arrive, home_depart)



In [None]:
# Process the training pings with the Infostop model and the day configuration.
out_df = proc_device_pings(train_ping_df, model, dayconf)

In [None]:
# Display the processed output.
out_df

In [None]:
# Mean of the centre columns for every distinct place description in the semantic table.
sem_locs = (
    semantic_df.groupby(StopFeatures.info)[get_all_cols(StopFeatures.center)]
    .mean()
)

In [None]:
# Display the per-place mean centres.
sem_locs

In [None]:
def _nearest_sem_address(centers, known_locs):
    """Return, for each row of ``centers``, the index label of the closest row of ``known_locs``.

    Closeness is the squared Euclidean distance over the raw coordinate
    values. Columns are matched by position, so both frames must list the
    coordinate columns in the same order.
    """
    # ``sum`` skips NaN, so a reference place with a missing coordinate would
    # get an understated distance (0 when every coordinate is missing) and win
    # every ``idxmin``. Only fully populated reference rows are compared.
    complete_locs = known_locs.dropna()
    return [
        ((complete_locs - center.values.reshape(1, -1)) ** 2).sum(axis=1).idxmin()
        for _, center in centers.iterrows()
    ]


# Attach to every labelled stop the description of the semantic place nearest
# to the mean centre of its destination label. Rows labelled "-1" are excluded
# (presumably stops without a cluster — confirm).
place_df = out_df.loc[lambda df: df[StopFeatures.destination_label] != "-1", :].assign(
    info=lambda df: df.groupby(StopFeatures.destination_label)[
        get_all_cols(StopFeatures.center)
    ]
    .mean()
    .assign(sem_address=lambda _df: _nearest_sem_address(_df, sem_locs))
    .loc[:, "sem_address"]
    # Broadcast the per-label result back to one value per stop row.
    .reindex(df[StopFeatures.destination_label].values)
    .values
)

In [None]:
# Display the stops together with their matched place description.
place_df

In [None]:
# NOTE(review): this cell repeats the display of place_df from the previous cell.
place_df

In [None]:
# Home/work summary computed from the detected stops and their matched place descriptions.
get_career(place_df)

In [None]:
# Reference: Infomap network clustering — https://www.mapequation.org/infomap/