In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd

import dask

import os

import matplotlib.pyplot as plt
%matplotlib inline

data_path = '/home/fterroso/data/'

  shapely_geos_version, geos_capi_version_string


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import datetime
e_date = datetime.datetime.strptime('2020-06-20', '%Y-%m-%d') #datetime.datetime.now()
i_date = datetime.datetime.strptime('2020-03-22', '%Y-%m-%d')

from datetime import date, timedelta

delta = e_date - i_date       # as timedelta

target_days = []
for i in range(delta.days + 1):
    day = i_date + timedelta(days=i)
    target_days.append(day)

## 1. Generate raw trips

In [4]:
def generate_trips_in_dates_fn(dates):
    
    gdf_1 = gpd.read_file(os.path.join('data','tw_in_ma_{}.geojson'.format(dates[0].strftime('%Y_%m_%d'))), parse_dates=['timestamp'], driver="GeoJSON")
    gdf_2 = gpd.read_file(os.path.join('data','tw_in_ma_{}.geojson'.format(dates[1].strftime('%Y_%m_%d'))), parse_dates=['timestamp'], driver="GeoJSON")
                
    df_ = pd.concat([gdf_1, gdf_2] , axis=0) 
    
    tw_user_ids = df_['user_id'].unique()

    user_trips = []

    
    for u in tw_user_ids.tolist():
        u_gdf = df_[df_['user_id']==u]

        n_rows = u_gdf.shape[0]

        if n_rows>1:
            locs = np.arange(0,n_rows)
            for i,j in zip(locs, locs[1:]):
                to = u_gdf.iloc[i]
                td = u_gdf.iloc[j]

                if to['ID_GRUPO'] != td['ID_GRUPO']: 

                    user_trips.append((to['user_id'],
                                       #time_length,
                                       to['tw_id'], 
                                       to['timestamp'], 
                                       to['CPRO'],
                                       to['NPRO'],
                                       to['ID_GRUPO'],
                                       to['LITERAL_GRUPO'],   
                                       to['POB_GRUPO'],
                                       to['geometry'],
                                       td['tw_id'], 
                                       td['timestamp'], 
                                       td['CPRO'],
                                       td['NPRO'],
                                       td['ID_GRUPO'],
                                       td['LITERAL_GRUPO'],
                                       td['POB_GRUPO'],
                                       td['geometry']))
    return user_trips

In [5]:
from dask.distributed import Client, progress
client = Client(threads_per_worker=4, n_workers=10)
client

0,1
Client  Scheduler: tcp://127.0.0.1:33843  Dashboard: http://127.0.0.1:44943/status,Cluster  Workers: 10  Cores: 40  Memory: 135.09 GB


In [8]:
lazy_results= []
for i_date,f_date in zip(target_days, target_days[1:]):
    lazy_result = dask.delayed(generate_trips_in_dates_fn)((i_date,f_date))
    lazy_results.append(lazy_result)

In [9]:
futures = dask.persist(*lazy_results) 

In [10]:
client.cluster.scale(10) 

In [11]:
results = dask.compute(*futures)

In [12]:
len(results)

90

In [13]:
flat_list = [item for sublist in results for item in sublist]

In [14]:
trips_df = pd.DataFrame.from_records(flat_list, columns='user_id tw_id_o timestamp_o CPRO_o NPRO_O CGRUPO_o NGRUPO_o POB_GRUPO_o geom_o tw_id_d timestamp_d CPRO_d NPRO_d CGRUPO_d NGRUPO_d POB_GRUPO_d geom_d'.split())

In [15]:
trips_df.head()

Unnamed: 0,user_id,tw_id_o,timestamp_o,CPRO_o,NPRO_O,CGRUPO_o,NGRUPO_o,POB_GRUPO_o,geom_o,tw_id_d,timestamp_d,CPRO_d,NPRO_d,CGRUPO_d,NGRUPO_d,POB_GRUPO_d,geom_d
0,1258006388,1242014584019529728,Mon Mar 23 09:05:27 +0000 2020,41,Sevilla,084S,Sevilla (SCD Number 4-A),18.048,POINT (241155.7030033822 4141397.433281829),1242190058716835842,Mon Mar 23 20:42:43 +0000 2020,41,Sevilla,025S,Alcalá del Río,12.029,POINT (238412.4162063967 4160382.205794079)
1,1258006388,1242190058716835842,Mon Mar 23 20:42:43 +0000 2020,41,Sevilla,025S,Alcalá del Río,12.029,POINT (238412.4162063967 4160382.205794079),1242239109504151564,Mon Mar 23 23:57:38 +0000 2020,41,Sevilla,084S,Sevilla (SCD Number 4-A),18.048,POINT (241155.7030033822 4141397.433281829)
2,231380112,1241989424113684481,Mon Mar 23 07:25:28 +0000 2020,17,Girona,25GI,Besalú y otros municipios,5.231,POINT (970453.5231286159 4687892.360277078),1242064620078104578,Mon Mar 23 12:24:16 +0000 2020,17,Girona,37GI,Banyoles,19.826,POINT (977549.3669210005 4679816.684196526)
3,231380112,1242070404300226574,Mon Mar 23 12:47:15 +0000 2020,17,Girona,37GI,Banyoles,19.826,POINT (977549.3669210005 4679816.684196526),1242155003655196674,Mon Mar 23 18:23:25 +0000 2020,17,Girona,25GI,Besalú y otros municipios,5.231,POINT (970453.5231286159 4687892.360277078)
4,1201548956,1242065686043688967,Mon Mar 23 12:28:30 +0000 2020,8,Barcelona,097B,Parets del Vallès,19.082,POINT (936126.6582087477 4614643.045649199),1242078693377409025,Mon Mar 23 13:20:11 +0000 2020,8,Barcelona,030B,Castellterçol y otros municipios,6.165,POINT (921480.9852253482 4635413.947512716)


In [16]:
trips_df.to_csv(os.path.join('data', 'raw_trips.csv'))

In [17]:
trips_df.shape

(348289, 17)

In [20]:
trips_df[trips_df['CGRUPO_o'].str.contains('CÑ')]

Unnamed: 0,user_id,tw_id_o,timestamp_o,CPRO_o,NPRO_O,CGRUPO_o,NGRUPO_o,POB_GRUPO_o,geom_o,tw_id_d,timestamp_d,CPRO_d,NPRO_d,CGRUPO_d,NGRUPO_d,POB_GRUPO_d,geom_d
577,430990029,1241779193714786304,Sun Mar 22 17:30:05 +0000 2020,15,"Coruña, A",80CÑ,Santiago de Compostela (distrito 05),29.935,POINT (46967.71956829401 4762455.573577675),1241780625176133633,Sun Mar 22 17:35:46 +0000 2020,15,"Coruña, A",79CÑ,Santiago de Compostela (distrito 04),27.464,POINT (48425.40954271855 4762550.915876452)
578,430990029,1241780625176133633,Sun Mar 22 17:35:46 +0000 2020,15,"Coruña, A",79CÑ,Santiago de Compostela (distrito 04),27.464,POINT (48425.40954271855 4762550.915876452),1241782458758152193,Sun Mar 22 17:43:04 +0000 2020,15,"Coruña, A",80CÑ,Santiago de Compostela (distrito 05),29.935,POINT (46967.71956829401 4762455.573577675)
909,1207700559437991936,1242065249794228226,Mon Mar 23 12:26:46 +0000 2020,15,"Coruña, A",75CÑ,Santa Comba,9.426,POINT (27522.18273098975 4781346.519581038),1242065591407706114,Mon Mar 23 12:28:08 +0000 2020,15,"Coruña, A",06CÑ,"Val do Dubra y Baña, A",7.326,POINT (39104.66730818296 4778805.205026471)
910,1207700559437991936,1242065591407706114,Mon Mar 23 12:28:08 +0000 2020,15,"Coruña, A",06CÑ,"Val do Dubra y Baña, A",7.326,POINT (39104.66730818296 4778805.205026471),1242134948766789639,Mon Mar 23 17:03:44 +0000 2020,15,"Coruña, A",75CÑ,Santa Comba,9.426,POINT (27522.18273098975 4781346.519581038)
1088,1111654830,1241771761768828930,Sun Mar 22 17:00:33 +0000 2020,15,"Coruña, A",77CÑ,Santiago de Compostela (distrito 02),15.630,POINT (50066.82238617858 4765237.500385303),1242085299041243136,Mon Mar 23 13:46:26 +0000 2020,36,Pontevedra,18PO,"Estrada, A",20.479,POINT (51714.89522708505 4739848.664553064)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348221,1122116556006264832,1274434396574101506,Sat Jun 20 20:10:12 +0000 2020,15,"Coruña, A",75CÑ,Santa Comba,9.426,POINT (27522.18273098975 4781346.519581038),1274434884577222657,Sat Jun 20 20:12:08 +0000 2020,15,"Coruña, A",85CÑ,Zas,4.472,POINT (18594.92307254175 4787413.754752998)
348222,1122116556006264832,1274434884577222657,Sat Jun 20 20:12:08 +0000 2020,15,"Coruña, A",85CÑ,Zas,4.472,POINT (18594.92307254175 4787413.754752998),1274472163916034049,Sat Jun 20 22:40:17 +0000 2020,15,"Coruña, A",75CÑ,Santa Comba,9.426,POINT (27522.18273098975 4781346.519581038)
348231,270637461,1274406074850906114,Sat Jun 20 18:17:40 +0000 2020,15,"Coruña, A",28CÑ,Boiro,18.838,POINT (17292.9442288467 4740625.26531064),1274433992356487168,Sat Jun 20 20:08:36 +0000 2020,15,"Coruña, A",66CÑ,Padrón,8.384,POINT (39511.36662905441 4749552.230806322)
348232,270637461,1274433992356487168,Sat Jun 20 20:08:36 +0000 2020,15,"Coruña, A",66CÑ,Padrón,8.384,POINT (39511.36662905441 4749552.230806322),1274435368021118976,Sat Jun 20 20:14:04 +0000 2020,36,Pontevedra,04PO,Valga y Catoira,9.204,POINT (36356.24772749023 4741868.947244653)


In [18]:
print('That all folks')

That all folks
