In [7]:
import pandas as pd
import os
import datetime
from datetime import timedelta
from tqdm.notebook import tqdm


In [8]:
# Root folder of the mobility files and the date window to load
# (both endpoints are included when the days are enumerated).
data_path = os.path.join('data', 'mobility')
i_date = datetime.datetime(2020, 4, 1)
e_date = datetime.datetime(2020, 10, 31)

In [9]:
def generate_days_fn(init_date, final_date):
    """Return every day from ``init_date`` to ``final_date``, both included.

    The result is a list of ``init_date`` shifted by 0, 1, 2, ... whole days.
    It is empty when ``final_date`` falls more than a day before ``init_date``.
    """
    n_days = (final_date - init_date).days + 1
    return [init_date + timedelta(days=offset) for offset in range(n_days)]

In [10]:
def read_INE_trips_date_fn(date_, trips_type='all', criteria=None, base_path=None):
    """Load one day's trips file and return the filtered rows.

    The file is read from
    ``<root>/<YYYYMM>_maestra1_mitma_distrito/<YYYYMMDD>_maestra_1_mitma_distrito.txt.gz``
    as a gzip-compressed, ``|``-separated table. The columns ``origen``,
    ``destino``, ``fecha`` and ``periodo`` are read as strings.

    Parameters
    ----------
    date_ : datetime.date or datetime.datetime
        Day to load; only its ``strftime`` output is used to build the path.
    trips_type : str, default 'all'
        ``'inter'`` keeps rows whose ``origen`` differs from ``destino``;
        ``'intra'`` keeps rows where they are equal; any other value keeps
        every row.
    criteria : dict or None, default None
        Optional ``{column: value}`` equality filters, applied one after another.
    base_path : str or None, default None
        Root folder holding the monthly sub-folders. When ``None`` the
        notebook-level ``data_path`` is used.

    Returns
    -------
    pandas.DataFrame
        The selected rows with ``periodo`` left-padded with zeros to two
        characters and every remaining missing value replaced by 0.
    """
    root = data_path if base_path is None else base_path
    date_str = date_.strftime('%Y%m%d')
    year_month_str = date_.strftime('%Y%m')
    file_path = os.path.join(root,
                             f'{year_month_str}_maestra1_mitma_distrito',
                             f'{date_str}_maestra_1_mitma_distrito.txt.gz')
    df_date = pd.read_csv(file_path, sep='|', compression='gzip',
                          dtype={'origen': str, 'destino': str, 'fecha': str, 'periodo': str})

    if trips_type == 'inter':
        df_date = df_date[df_date['origen'] != df_date['destino']]  # only keep trips between areas
    elif trips_type == 'intra':
        df_date = df_date[df_date['origen'] == df_date['destino']]  # only keep trips within the areas

    if criteria is not None:
        for col, value in criteria.items():
            df_date = df_date[df_date[col] == value]

    # Pad the period to two characters. `.str.zfill` leaves missing values
    # untouched (a plain `x.zfill` would raise on NaN), and `assign` builds a
    # new frame instead of writing into a filtered slice, which avoids
    # SettingWithCopyWarning.
    df_date = df_date.assign(periodo=df_date['periodo'].str.zfill(2))
    return df_date.fillna(0)  # set nan as 0

In [11]:
def generate_daily_time_series_fn(from_date, to_date, trips_type='all', filter_criteria= None):
    """Load every day between ``from_date`` and ``to_date`` and stack the results.

    Each day (both endpoints included) is read with ``read_INE_trips_date_fn``
    using the given ``trips_type`` and ``filter_criteria``, with a progress bar
    over the days. The per-day frames are concatenated row-wise, keeping each
    frame's own index.
    """
    daily_frames = [
        read_INE_trips_date_fn(day, trips_type, filter_criteria)
        for day in tqdm(generate_days_fn(from_date, to_date))
    ]
    return pd.concat(daily_frames, axis=0)

In [None]:
# Build one time series per origin/destination activity pairing, each
# restricted to the '100+' distance bucket over the configured date window.

# 'otros' -> 'otros'
criteria_1 = dict(actividad_origen='otros', actividad_destino='otros', distancia='100+')
flow_oo = generate_daily_time_series_fn(i_date, e_date, trips_type='all', filter_criteria=criteria_1)

# 'casa' -> 'otros'
criteria_2 = dict(actividad_origen='casa', actividad_destino='otros', distancia='100+')
flow_ho = generate_daily_time_series_fn(i_date, e_date, trips_type='all', filter_criteria=criteria_2)

# 'otros' -> 'casa'
criteria_3 = dict(actividad_origen='otros', actividad_destino='casa', distancia='100+')
flow_oh = generate_daily_time_series_fn(i_date, e_date, trips_type='all', filter_criteria=criteria_3)

  0%|          | 0/214 [00:00<?, ?it/s]

In [None]:
# Preview the first rows of the 'otros' -> 'otros' flow.
flow_oo.head()

In [None]:
# Draw the per-date sum of 'viajes' for each flow on one shared axis.
# The grid is switched on by the last plot call.
flow_plots = [
    (flow_oo, {'label': 'oo'}),
    (flow_ho, {'label': 'ho'}),
    (flow_oh, {'label': 'oh', 'grid': True}),
]
ax = None  # the first plot call creates the axis, later calls reuse it
for flow, plot_options in flow_plots:
    ax = flow.groupby('fecha')['viajes'].sum().plot(ax=ax, legend=True, **plot_options)

In [None]:
# Preview the 'otros' -> 'otros' flow. The frame is named `flow_oo`;
# `flow_o_o` is not defined anywhere and raised NameError.
flow_oo.head()

In [15]:
# Load a single-day sample file ('|'-separated, uncompressed).
# Zone identifiers such as '01001_AM' or '06153' are read as strings so that
# leading zeros survive and the columns are not parsed with mixed types
# (the likely cause of the warning this cell used to emit).
mob_data = pd.read_csv(
    os.path.join('data', 'mobility', '202003_maestra1_mitma_distrito', '20200302_maestra_1.txt'),
    sep='|',
    dtype={'origen': str, 'destino': str},
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [11]:
def get_timestamp_fn(row):
    """Combine a row's ``fecha`` and ``periodo`` into a ``datetime``.

    The two values are joined with a space and parsed as ``'%Y%m%d %H'``,
    i.e. an eight-digit date followed by the hour.
    """
    stamp = '{} {}'.format(row['fecha'], row['periodo'])
    return datetime.datetime.strptime(stamp, '%Y%m%d %H')

In [23]:
#mob_data['date'] = mob_data.apply(get_timestamp_fn, axis=1)

In [16]:
# Peek at the first rows of mob_data.
mob_data.head()

Unnamed: 0,fecha,origen,destino,actividad_origen,actividad_destino,residencia,edad,periodo,distancia,viajes,viajes_km
0,20200302,01001_AM,01001_AM,casa,otros,1,,0,002-005,6.023,26.668
1,20200302,01001_AM,01001_AM,casa,otros,1,,1,005-010,12.238,96.666
2,20200302,01001_AM,01001_AM,casa,otros,1,,2,005-010,6.119,60.455
3,20200302,01001_AM,01001_AM,casa,otros,1,,2,010-050,4.431,56.484
4,20200302,01001_AM,01001_AM,casa,otros,1,,3,005-010,8.164,56.366


In [19]:
# List the distinct values found in the 'distancia' column.
mob_data['distancia'].unique()

array(['002-005', '005-010', '010-050', '0005-002', '050-100', '100+'],
      dtype=object)

In [20]:
# Split the sample into three flows by origin/destination activity,
# all restricted to the '100+' distance bucket.
origin_activity = mob_data['actividad_origen']
destination_activity = mob_data['actividad_destino']
in_100_plus_bucket = mob_data['distancia'] == '100+'

# 'otros' -> 'otros'
flow1 = mob_data[(origin_activity == 'otros') & (destination_activity == 'otros') & in_100_plus_bucket]
# 'casa' -> 'otros'
flow2 = mob_data[(origin_activity == 'casa') & (destination_activity == 'otros') & in_100_plus_bucket]
# 'otros' -> 'casa'
flow3 = mob_data[(origin_activity == 'otros') & (destination_activity == 'casa') & in_100_plus_bucket]

In [21]:
# Preview the rows kept in flow1 ('otros' -> 'otros', distancia '100+').
flow1.head()

Unnamed: 0,fecha,origen,destino,actividad_origen,actividad_destino,residencia,edad,periodo,distancia,viajes,viajes_km
1106,20200302,01001_AM,06060_AM,otros,otros,21,,1,100+,8.72,4681.82
1107,20200302,01001_AM,06095_AM,otros,otros,8,,7,100+,4.406,2711.512
1108,20200302,01001_AM,06153,otros,otros,3,,0,100+,6.192,3226.328
1109,20200302,01001_AM,0820505,otros,otros,28,,6,100+,4.729,1925.504
1112,20200302,01001_AM,09056,otros,otros,40,,10,100+,2.321,236.609


In [25]:
# Count the non-null 'viajes' entries per date; this is a row count, not the sum of the 'viajes' values.
flow1.groupby('fecha')['viajes'].count()

fecha
20200302    72387
Name: viajes, dtype: int64