# Generation of the Candidate Flows of Human Mobility

In [10]:
import pandas as pd
import numpy as np

import os
import datetime
from datetime import date, timedelta
from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
%matplotlib inline

# Folder that holds the daily trip files read by read_INE_trips_date_fn.
# The location can be overridden with the INE_MOBILITY_DATA_PATH environment
# variable, so the notebook is not tied to a single machine; when the variable
# is not set, the original hard-coded location is used.
data_path = os.environ.get(
    'INE_MOBILITY_DATA_PATH',
    os.path.join('/', 'home', 'fterroso', 'data', 'Spain-INE-mobility-study-2020', 'muestra1'),
)

### Target Mobility Areas (MAs) 

In [11]:
# Identifiers of the target mobility areas. read_INE_trips_date_fn keeps the
# trips whose 'origen' and/or 'destino' value belongs to this list.
target_regions_ids = [
    '3003001',
    '3003002',
    '3003003',
]

## General functions

In [12]:
# %load 00_general_functions.py
#!/usr/bin/env python

# In[1]:


def generate_days_fn(init_date, final_date):
    """Return the consecutive days from ``init_date`` to ``final_date``.

    Both boundaries are included in the result. Each element is obtained by
    adding a whole number of days to ``init_date``, so the elements have the
    same type as ``init_date``.

    Parameters
    ----------
    init_date : datetime.date or datetime.datetime
        First day of the range.
    final_date : datetime.date or datetime.datetime
        Last day of the range.

    Returns
    -------
    list
        One entry per day; empty when ``final_date`` is before ``init_date``.
    """
    # +1 so that the final day is part of the range as well
    n_days = (final_date - init_date).days + 1
    return [init_date + timedelta(days=offset) for offset in range(n_days)]


# In[2]:


def read_INE_trips_date_fn(date_, trips_type='all', flow_type= 'all'):
    """Load the trips file of one day and keep the trips of the target areas.

    The file ``<YYYYMMDD>_maestra_1_mitma_distrito.txt`` is read from the
    module-level ``data_path`` folder as a ``|``-separated table. Rows are then
    filtered against the module-level ``target_regions_ids`` list.

    Parameters
    ----------
    date_ : datetime.date or datetime.datetime
        Day to load; only used to build the file name.
    trips_type : str, default 'all'
        'inter' keeps the rows whose 'origen' differs from 'destino',
        'intra' keeps the rows whose 'origen' equals 'destino'.
        Any other value applies no filter.
    flow_type : str, default 'all'
        'incoming' keeps the rows whose 'destino' is a target area,
        'outgoing' keeps the rows whose 'origen' is a target area,
        'all' keeps the rows matching either condition.
        Any other value applies no filter.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with 'periodo' left-padded with zeros to two
        characters and every missing value replaced by 0.
    """
    date_str = date_.strftime('%Y%m%d')
    file_path = os.path.join(data_path, f'{date_str}_maestra_1_mitma_distrito.txt')
    df_date = pd.read_csv(file_path, sep='|',
                          dtype={'origen': str, 'destino': str, 'fecha': str, 'periodo': str})

    if flow_type == 'incoming':
        df_date = df_date[df_date['destino'].isin(target_regions_ids)]
    elif flow_type == 'outgoing':
        df_date = df_date[df_date['origen'].isin(target_regions_ids)]
    elif flow_type == 'all':
        df_date = df_date[(df_date['destino'].isin(target_regions_ids)) |
                          (df_date['origen'].isin(target_regions_ids))]

    if trips_type == 'inter':
        df_date = df_date[df_date['origen'] != df_date['destino']]  # only keep trips between areas
    elif trips_type == 'intra':
        df_date = df_date[df_date['origen'] == df_date['destino']]  # only keep trips within the areas

    # Convert the period column to a two-digit string. `assign` builds a new
    # frame instead of writing into the filtered selection (which triggers
    # SettingWithCopyWarning), and `.str.zfill` leaves missing values untouched
    # so that they reach the `fillna` below instead of raising AttributeError.
    df_date = df_date.assign(periodo=df_date['periodo'].str.zfill(2))

    return df_date.fillna(0)  # set nan as 0



Target time period: from 2020-09-15 to 2020-10-31, both days included.

In [13]:
# Label of the studied period; it is part of the name of the exported CSV files
time_period = 'sept_oct'

# First and last day of the studied period (midnight of each day)
from_date = datetime.datetime(2020, 9, 15)
to_date = datetime.datetime(2020, 10, 31)

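See the shared Excel file for the definition of each criterion. Every criterion maps a column of the trips data to the list of values accepted for that column; a trip must satisfy all the columns of a criterion to be part of its flow.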

In [14]:
# Labels accepted for the 'distancia' column, grouped by the first band included.
_BANDS_FROM_002 = ['002-005', '005-010', '010-050', '050-100', '100+']
_BANDS_FROM_005 = _BANDS_FROM_002[1:]
_BANDS_FROM_010 = _BANDS_FROM_002[2:]

# Each criterion maps a column name to the values accepted for that column.
# Every criterion receives its own copy of the band list, so no list object is
# shared between two criteria.
criteria = {
    'criterion_1': {
        'destino': ['3003001'],
        'distancia': list(_BANDS_FROM_010),
        'actividad_destino': ['trabajo'],
    },
    'criterion_2': {
        'destino': ['3003002'],
        'distancia': list(_BANDS_FROM_010),
        'actividad_destino': ['trabajo'],
    },
    'criterion_3': {
        'origen': ['3003001'],
        'destino': ['3003001'],
        'distancia': list(_BANDS_FROM_002),
        'actividad_destino': ['trabajo'],
    },
    'criterion_4': {
        'origen': ['3003002'],
        'destino': ['3003002'],
        'distancia': list(_BANDS_FROM_002),
        'actividad_destino': ['trabajo'],
    },
    'criterion_5': {
        'destino': ['3003001'],
        'distancia': list(_BANDS_FROM_005),
        'actividad_origen': ['casa'],
        'actividad_destino': ['trabajo'],
    },
    'criterion_6': {
        'destino': ['3003002'],
        'distancia': list(_BANDS_FROM_005),
        'actividad_origen': ['casa'],
        'actividad_destino': ['trabajo'],
    },
    'criterion_7': {
        'origen': ['3003001'],
        'distancia': list(_BANDS_FROM_010),
        'actividad_destino': ['casa'],
    },
    'criterion_8': {
        'origen': ['3003002'],
        'distancia': list(_BANDS_FROM_010),
        'actividad_destino': ['casa'],
    },
    'criterion_9': {
        'destino': ['3003001'],
        'distancia': list(_BANDS_FROM_010),
    },
    'criterion_10': {
        'origen': ['3003001'],
        'distancia': list(_BANDS_FROM_010),
    },
}


In [15]:
def generate_timestamp(row):
    """Build the ``datetime`` described by the day and hour fields of ``row``.

    Parameters
    ----------
    row : mapping
        Any object indexable by key (e.g. a DataFrame row) holding 'fecha',
        a day formatted as ``%Y%m%d``, and 'periodo', an hour parsed with ``%H``.

    Returns
    -------
    datetime.datetime
        The timestamp at the given hour of the given day.
    """
    day_and_hour = f"{row['fecha']} {row['periodo']}"
    return datetime.datetime.strptime(day_and_hour, '%Y%m%d %H')

In [16]:
def generate_subflow_fn(from_date, to_date, criterion):
    """Build the hourly series of trips that satisfy ``criterion``.

    Every day between ``from_date`` and ``to_date`` (both included) is loaded
    with ``read_INE_trips_date_fn``; the rows are filtered column by column
    with ``criterion`` and their 'viajes' values are summed per
    ('fecha', 'periodo') pair.

    Parameters
    ----------
    from_date : datetime.datetime
        First day of the range; it is also the first timestamp of the result.
    to_date : datetime.datetime
        Last day of the range.
    criterion : dict
        Maps a column name to the list of values accepted for that column.

    Returns
    -------
    pandas.DataFrame
        One row per hour of the range (24 per day) with a single 'n_viajes'
        column. Hours without any matching row hold NaN; this includes every
        hour when the range is empty or no row matches the criterion.
    """
    target_days = generate_days_fn(from_date, to_date)

    # `day` instead of `date`: avoids shadowing the imported datetime.date
    daily_frames = []
    for day in tqdm_notebook(target_days, leave= False):
        day_df = read_INE_trips_date_fn(day)
        for column, accepted_values in criterion.items():
            day_df = day_df[day_df[column].isin(accepted_values)]
        daily_frames.append(day_df)

    # Index with the hour-based time series
    index = from_date + pd.to_timedelta(np.arange(len(target_days)*24), 'h')
    df_ = pd.DataFrame(np.nan, index=index, columns=['n_viajes'])

    # pd.concat raises on an empty list (empty day range)
    if not daily_frames:
        return df_

    criterion_df = pd.concat(daily_frames, axis=0)
    # A row-wise apply on an empty frame returns a DataFrame that cannot be
    # stored in the single 'timestamp' column, so stop here when nothing matched
    if criterion_df.empty:
        return df_

    sum_trips_c_df = criterion_df.groupby(['fecha', 'periodo'])['viajes'].sum().reset_index()
    sum_trips_c_df['timestamp'] = sum_trips_c_df.apply(generate_timestamp, axis=1)
    sum_trips_c_df = sum_trips_c_df.set_index('timestamp')

    # Aligned on the timestamps: hours missing from the sums stay as NaN
    df_['n_viajes'] = sum_trips_c_df['viajes']

    return df_

In [17]:
# Generate the hourly series of every criterion, store each one as a CSV file
# and keep all of them in `flows` (same order as `criteria`).
output_dir = os.path.join('data', 'INE_subflows')
# Create the target folder before any flow is computed; otherwise to_csv fails
# on the first file when the folder does not exist.
os.makedirs(output_dir, exist_ok=True)

flows = []
for c in tqdm_notebook(criteria):
    flow_df = generate_subflow_fn(from_date, to_date, criteria[c])
    flow_df.to_csv(os.path.join(output_dir, 'flow_{}_raw_{}.csv'.format(c, time_period)))
    flows.append(flow_df)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))




In [18]:
# Last cell of the notebook: prints a completion message.
print("That's all folks")

That's all folks
