## Notebook Magic

In [None]:
%matplotlib inline
%load_ext autoreload

## Imports

In [None]:
import os
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

from IPython.display import display
# Never truncate the column list when rendering a frame; display width is set to 0.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 0)

### TODO: Do we need values from other movement codes?
### TODO: What about multiple registers (21) on the same day?
### TODO: What about cases where multiple exits (31, 21) follow each other?
### TODO: What about cases where the exit is a successful vocational training?
### In more recent data these people are simply moved to 'busy'

## Get Pedidos Table

In [None]:
# Load the cleaned pedidos table from the intermediate S3 location.
df = pd.read_parquet(
    path="s3://iefp-unemployment/intermediate/clean/pedidos.parquet"
)

In [None]:
# Order movements chronologically within each user.
df = df.sort_values(by=["ute_id", "data_movimento"])

In [None]:
# Count rows by movement code: 21/31 are treated as exits, 11 as registrations.
is_end = df["tipo_movimento"].isin([21, 31])
is_start = df["tipo_movimento"] == 11
end_count = int(is_end.sum())
start_count = int(is_start.sum())
print(start_count)
print(end_count)

In [None]:
# Registrations only (movement code 11), reduced to one row per user per calendar day.
# Filter first and copy afterwards: this avoids duplicating the whole table in memory
# and gives an independent frame, so the column assignment below does not trigger
# a SettingWithCopyWarning.
df_dem = df[df["tipo_movimento"] == 11].copy()
df_dem["data_movimento"] = df_dem["data_movimento"].dt.date
df_dem = df_dem.groupby(["ute_id", "data_movimento"]).first()
print(df_dem.shape)
# NOTE(review): after the groupby the keys (ute_id, data_movimento) live in the index,
# and drop_duplicates() only compares columns - rows of different users/days with
# identical attributes are collapsed here. Confirm this is intended.
df_dem = df_dem.drop_duplicates()
print(df_dem.shape)
df_dem.describe(include='all')

In [None]:
# Keep registrations (11) and the two exit movements (21, 31).
# The movement codes are compared against integers everywhere else in this notebook,
# so the filter uses integers as well: string codes do not match a numeric column
# and would leave the frame empty.
df_t = df.loc[df["tipo_movimento"].isin([11, 21, 31]),
              ["ute_id", "data_movimento", "tipo_movimento", "motivo_inscricao",
               "motivo_anulacao"]].copy()

# Each registration opens a new journey; the per-user running sum numbers the journeys.
# Relies on the rows already being in per-user chronological order.
df_t["journey_start"] = df_t["tipo_movimento"] == 11
df_t['journey_count'] = df_t.groupby("ute_id")["journey_start"].cumsum()
# String codes are needed so they can be joined into flat column names after the pivot.
df_t["tipo_movimento"] = df_t["tipo_movimento"].astype(np.int64).astype('str')

# Note: What about multiple registers (21) at the same day?
# Note: What about cases where multiple exits (31, 21) after each other?
# Note: What about cases where exit is successful vocational training? In newer times these people just go to 'busy'
# One row per (user, journey); for every value column keep the minimum per movement code.
# The string alias "min" is equivalent to np.min and avoids the pandas callable warning.
df_t = df_t.pivot_table(index=['ute_id', 'journey_count'], columns='tipo_movimento', aggfunc="min")
# Flatten the (value, code) column MultiIndex into names such as "data_movimento_11".
df_t.columns = ['_'.join(col).strip() for col in df_t.columns.values]
df_t = df_t.reset_index()

In [None]:
# Flattened pivot column -> readable journey field name (order defines the output layout).
column_names = {
    'ute_id': 'user_id',
    'journey_count': 'journey_count',
    'data_movimento_11': 'register_date',
    'data_movimento_21': 'exit_date_21',
    'data_movimento_31': 'exit_date_31',
    'motivo_anulacao_31': 'exit_reason',
    'motivo_inscricao_11': 'register_reason',
}
cols = list(column_names)
t_cols = list(column_names.values())
# Select the journey columns and rename them in one step.
df_t = df_t[cols].rename(columns=column_names)

In [None]:
# Preview the first 20 journeys.
df_t.head(n=20)

### Multiple types of exit codes (21 and 31) and same day registration

In [None]:
# Keep journeys that started with a registration and carry BOTH exit dates (21 and 31);
# the shape is printed before and after to show how many rows survive.
print(df_t.shape)
has_registration = df_t["journey_count"] > 0
has_both_exits = df_t["exit_date_21"].notna() & df_t["exit_date_31"].notna()
df_t = df_t[has_registration & has_both_exits]
print(df_t.shape)

In [None]:
# Number of distinct users among the remaining journeys.
# (A head() call placed before this line was never rendered, because only the last
# expression of a cell is displayed, so it has been dropped.)
df_t['user_id'].nunique()

In [None]:
# Preview the first 20 journeys that have both exit dates.
df_t.head(n=20)

#### Register reason 91: REINSCRIÇÃO COMO EMPREGADO NA SEQUÊNCIA DE COLOCAÇÃO - CANDIDATURA EXTERNA
#### Register reason 91 (English translation): RE-REGISTRATION AS EMPLOYED FOLLOWING PLACEMENT - EXTERNAL APPLICATION

##### Check with IEFP. Possible explanation: successful placement is recorded as a register and then successful exit.

##### We can't see any placements or courses for these users, though, so this might be a mistake.

### Testing Nova journey definition
We define an entry as a registration (movement 11) or a change in category from employment/busy to unemployment (a movement 43 from categoria 3, 4 or 5 to categoria 1 or 2),
and we define an exit as a registration cancellation (mov 31), a successful presentation (mov 21) or a change in category to categoria 3, 4 or 5,
but only if the exit is not followed by a re-entry within 28 days.

User's category codes:

- 1 - Unemployed, first job
- 2 - Unemployed, new job
- 3 - Employed
- 4 - Employed part-time
- 5 - Busy
- 6 - Unemployed, first job, unavailable to work
- 7 - Unemployed, new job, unavailable to work
- 8 - Employed, unavailable to work

In [None]:
# Frequency of each category code across all movements (missing values excluded).
df["categoria"].value_counts(dropna=True)

In [None]:
# Make sure movements are in ascending chronological order within each user.
df = df.sort_values(by=["ute_id", "data_movimento"])

In [None]:
# Quick look at the first rows of the sorted table.
df.head(n=5)

In [None]:
# Small 100-row sample of the sorted table for quick experiments.
df_small = df.head(100)

In [None]:
# Users that have at least one category-change movement (code 43).
user_cat = df.loc[df['tipo_movimento'] == 43, 'ute_id'].unique()
# Accumulator for the per-user journeys built in a later cell.
journeys = []
# Keep the count as the cell's last expression so that it is actually displayed.
len(user_cat)

In [None]:
# Restrict the analysis to a fixed window of 2,000 users with a movement 43.
# The window is taken from the full user list derived from `df` rather than from
# `user_cat` itself, so re-running this cell yields the same users instead of
# slicing the already-sliced array down to nothing.
user_cat = df.loc[df['tipo_movimento'] == 43, 'ute_id'].unique()[-202000:-200000]

#### Create list of movement and category tuples

In [None]:
# Check dates of user journeys

# For the first 200 selected users, print the second column of their last row.
# NOTE(review): positional column 1 is presumably `data_movimento` - confirm against
# the frame's column order, or select the column by name.
for user in user_cat[:200]:
    user_table = df.loc[df['ute_id'] == user]
    print(user_table.iloc[-1, 1])

In [None]:
# Build one journey per selected user: the ordered list of
# (tipo_movimento, categoria) pairs taken from that user's rows.
# The list is rebuilt from scratch, so re-running the cell cannot append duplicates,
# and the rows are grouped in a single pass instead of re-filtering `df` for every
# user and walking the rows with iterrows().
selected_rows = df.loc[df['ute_id'].isin(user_cat),
                       ['ute_id', 'tipo_movimento', 'categoria']]
pairs_by_user = {
    user: list(zip(user_table['tipo_movimento'], user_table['categoria']))
    for user, user_table in selected_rows.groupby('ute_id', sort=False)
}
# Keep the journeys in the same order as `user_cat`; a user without rows gets an empty journey.
journeys = [pairs_by_user.get(user, []) for user in user_cat]

In [None]:
# Eyeball the first 50 journeys, one per line.
for sample_journey in journeys[:50]:
    print(sample_journey)

#### How many journeys end in a 43?

In [None]:
# Start with an empty collection of journeys that end in a movement 43.
end_state = list()

In [None]:
# Number of journeys built (one per selected user).
len(journeys)

In [None]:
# Journeys whose last movement is a category change (code 43).
# Rebuilding the list (instead of appending to it) keeps the count correct when the
# cell is run more than once; empty journeys are skipped rather than raising.
end_state = [journey for journey in journeys if journey and journey[-1][0] == 43]
print(len(end_state))

Out of 2000 journeys, 444 end in a 43.

In [None]:
# Final (tipo_movimento, categoria) pair of every journey that ends in a 43.
# The list is rebuilt rather than appended to: appending would mix these pairs with
# the whole journeys collected earlier under the same name, which breaks the
# `journey[1]` lookups that follow.
end_state = [journey[-1] for journey in journeys if journey and journey[-1][0] == 43]

In [None]:
# Categoria recorded on the final movement of every journey that ends in a 43.
# It is defined before it is measured (the previous order raised a NameError on a
# fresh kernel) and is read straight from `journeys`, so it contains only category
# values regardless of what `end_state` currently holds.
end_cat = [journey[-1][1] for journey in journeys if journey and journey[-1][0] == 43]
len(end_cat)

In [None]:
# Flatten all journeys into one list holding the categoria of every movement.
all_cat = [categoria for journey in journeys for _, categoria in journey]
len(all_cat)

In [None]:
# sns.distplot is deprecated in seaborn; histplot is its replacement. discrete=True
# draws one bar per integer category code instead of arbitrary continuous bins.
# NOTE(review): all_cat holds the categoria of every movement of the selected users,
# not only of movement-43 rows - confirm that the title describes the intended data.
sns.histplot(all_cat, discrete=True)
plt.xticks([1,2,3,4,5])
plt.title("Categoria at Category Change - Mov 43")
plt.show()

In [None]:
# Distribution of the category reached by journeys whose last movement is a 43.
# histplot replaces the deprecated distplot; discrete=True gives one bar per code.
sns.histplot(end_cat, discrete=True)
plt.xlabel("categoria")
plt.title("Categoria of the final movement - journeys ending in a 43")
plt.show()

In [None]:
# Inspect the first 100 collected end states.
end_state[:100]

#### Sense check individual journeys to see patterns in exit codes/category changes

In [None]:
# Rows recorded with categoria 3, skipping the first 6000 matches.
df.loc[df['categoria'] == 3].iloc[6000:]

In [None]:
# Users picked for manual inspection of their full movement history.
# One loop replaces eleven copy-pasted single-user cells; display() is required
# because only the last expression of a cell is rendered automatically.
INSPECT_USER_IDS = [
    14456, 29494, 43283, 42968, 43392, 60767,
    60372, 74636, 74470, 91085, 90858,
]
for user_id in INSPECT_USER_IDS:
    display(df[df['ute_id'] == user_id])