## Notebook Magic

In [None]:
%matplotlib inline
%load_ext autoreload

## Imports

In [None]:
import os
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

## Get Pedidos Table

In [None]:
# Read the pedidos parquet file from S3 into a DataFrame.
pedidos_uri = "s3://iefp-unemployment/intermediate/clean/pedidos.parquet"
df = pd.read_parquet(pedidos_uri)

## Transform

In [None]:
# Sanity check: (rows, columns) of the table as loaded.
df.shape

In [None]:
# Narrow the table to the columns used below, then keep only the rows whose
# movement code is one of the four handled by the journey logic.
keep_columns = [
    "ute_id",
    "data_movimento",
    "tipo_movimento",
    "motivo_inscricao",
    "motivo_anulacao",
    "estado",
    "categoria",
]
movement_codes = [11, 21, 31, 43]

df = df.loc[:, keep_columns]
df = df.loc[df["tipo_movimento"].isin(movement_codes)]

In [None]:
# Order movements per user by date (ascending is the sort_values default);
# the cumulative journey counting later depends on this row order.
df = df.sort_values(by=["ute_id", "data_movimento"])

In [None]:
# Preview the filtered, sorted movements.
df.head()

In [None]:
# Flag the movements that open or close a journey for each user (ute_id).
#
# A row STARTS a journey when tipo_movimento is 11 or 43 and either
# categoria is 1/2 or motivo_inscricao is 17.
# A row ENDS a journey when tipo_movimento is 21 or 31, or when it is 43 with
# categoria 3/4.
# NOTE(review): the business meaning of these codes is not visible in this
# notebook -- confirm against the IEFP codebook.
#
# Bug fix: `&` binds tighter than `==`, so the previous
# `(tipo == 11) & df.motivo_inscricao == 17` was evaluated as
# `((tipo == 11) & df.motivo_inscricao) == 17`. The comparison with 17 is now
# parenthesised so the motivo_inscricao rule actually applies.
is_entry_movement = df["tipo_movimento"].isin([11, 43])
is_start = is_entry_movement & (
    df["categoria"].isin([1, 2]) | (df["motivo_inscricao"] == 17)
)
is_end = df["tipo_movimento"].isin([21, 31]) | (
    (df["tipo_movimento"] == 43) & df["categoria"].isin([3, 4])
)

# "end" is written last so it wins if a row satisfies both rules.
df.loc[is_start, "journey"] = "start"
df.loc[is_end, "journey"] = "end"

# Journey number = 1 + number of end rows seen *before* this row for the same
# user; subtracting the row's own flag keeps an end row inside the journey it
# closes. Assumes rows are already in chronological order per user.
ends = is_end.astype(int)
df["journey_count"] = ends.groupby(df["ute_id"]).cumsum() - ends + 1

In [None]:
# One row per (user, journey); every remaining column is split into a
# "start" and an "end" variant, taking the first value found for each.
df_j = pd.pivot_table(
    df,
    index=["ute_id", "journey_count"],
    columns="journey",
    aggfunc="first",
)

In [None]:
# Collapse the two-level pivot header into single names such as
# "data_movimento_start", then move the index levels back into columns.
flat_names = []
for level_values in df_j.columns:
    flat_names.append("_".join(level_values).strip())
df_j.columns = flat_names

df_j = df_j.reset_index()
print(df_j.shape)

In [None]:
# Preview the flattened journey table.
df_j.head()

In [None]:
# Keep only journeys that have both a start date and an end date.
df_j = df_j.dropna(subset=["data_movimento_end", "data_movimento_start"])
print(df_j.shape)

In [None]:
# Keep journeys whose starting movement has estado "ACT"; the shape is printed
# before and after to show how many rows the filter removes.
print(df_j.shape)
starts_as_act = df_j["estado_start"].eq("ACT")
df_j = df_j.loc[starts_as_act]
print(df_j.shape)

In [None]:
# Select the journey fields of interest and give them English names.
# The mapping's key order defines the output column order.
column_names = {
    "ute_id": "user_id",
    "journey_count": "journey_count",
    "data_movimento_start": "register_date",
    "data_movimento_end": "exit_date",
    "tipo_movimento_start": "register_movement",
    "tipo_movimento_end": "exit_movement",
    "categoria_start": "register_category",
    "categoria_end": "exit_category",
    "motivo_inscricao_start": "register_reason",
    "motivo_anulacao_end": "exit_reason",
}
df_j = df_j[list(column_names)].rename(columns=column_names)
df_j.head()

### Recount journeys

In [None]:
# Rows were dropped by the filters above, so renumber the surviving journeys
# 1, 2, 3, ... per user: reset every row to 1 and take a running sum.
df_j = df_j.assign(journey_count=1)
df_j = df_j.assign(
    journey_count=df_j.groupby("user_id")["journey_count"].cumsum()
)
df_j.head()

In [None]:
# Preview the journey table again (same display as the previous cell).
df_j.head()

## Check special cases

#### One-day journeys

In [None]:
# Count journeys that start and end on the same calendar day.
starts_and_ends_same_day = (
    df_j["register_date"].dt.date == df_j["exit_date"].dt.date
)
df_j[starts_and_ends_same_day].shape

In [None]:
# Show a few journeys that start and end on the same calendar day.
one_day_mask = df_j["register_date"].dt.date == df_j["exit_date"].dt.date
df_j[one_day_mask].head()

#### End/Start on the same day

In [None]:
# Exit date of the same user's previous journey (NaT for a user's first one).
# Bug fix: the shift is done within each user_id. A plain shift(1) copied the
# previous *row's* exit date, so the first journey of every user was compared
# against the last exit date of a different user.
# Assumes each user's rows are in journey order.
df_j["prev_exit"] = df_j.groupby("user_id")["exit_date"].shift(1)

In [None]:
# Count journeys that begin on the same calendar day as the date in prev_exit.
reenters_same_day = df_j["prev_exit"].dt.date == df_j["register_date"].dt.date
df_j[reenters_same_day].shape

In [None]:
# Show a few journeys that begin on the same calendar day as prev_exit.
same_day_reentry = df_j["prev_exit"].dt.date == df_j["register_date"].dt.date
df_j[same_day_reentry].head()

In [None]:
# Spot check: show every movement row of one user (ute_id 604).
df[df.ute_id == 604]

## Journey stats

In [None]:
# Distribution of the number of journeys per user (ten most frequent counts).
journeys_per_user = df_j.groupby("user_id")["journey_count"].max()
journeys_per_user.value_counts().head(10)

In [None]:
# Mean number of journeys per user.
max_journey_by_user = df_j.groupby("user_id")["journey_count"].max()
max_journey_by_user.mean()

In [None]:
# Median number of journeys per user.
last_journey_number = df_j.groupby("user_id")["journey_count"].max()
last_journey_number.median()

## Filter data

In [None]:
# NOTE(review): in the visible notebook, `df_t` is first assigned in the later
# "Transform with dates" cell, so on a fresh kernel (Restart & Run All) this
# cell raises NameError. It appears to rely on out-of-order execution -- move
# it below that cell or remove it.
# Keeps rows with a positive journey_count, then rows that have a value in
# data_movimento_21 or data_movimento_31; the shape is printed after each step.
print(df_t.shape)
df_t = df_t[df_t["journey_count"] > 0]
print(df_t.shape)
df_t = df_t[(df_t["data_movimento_21"].notna()) | (df_t["data_movimento_31"].notna())]
print(df_t.shape)

## Transform with dates

In [None]:
# NOTE(review): `transform` is neither defined nor imported anywhere in the
# visible notebook, so this cell fails on a fresh kernel unless the helper is
# provided elsewhere -- confirm where it lives. The "first" argument is
# presumably an aggregation choice; verify against the helper's definition.
df_t = transform(df, "first")
df_t.head()

In [None]:
# Rows where data_movimento_11 and data_movimento_21 fall on the same calendar
# day. Note this displays the whole filtered frame, not a preview.
df_t[df_t["data_movimento_11"].dt.date == df_t["data_movimento_21"].dt.date]