In [20]:
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import geopandas as gpd
import numpy as np
from tqdm import tqdm_notebook
from shapely.geometry import Point, LineString
import os

from datetime import datetime, timedelta

import matplotlib.pyplot as plt
%matplotlib inline

import folium
import geoplot


data_path = '/home/fterroso/data/'
tweets_path = '/home/fterroso/projects/twitter-crawler/streaming_tweets/'

Read geodataframe generated in 3d

In [21]:
prov_trips_gdf = gpd.read_file(os.path.join('data', 'province_consecutive_enriched.geojson'), 
                          driver='GeoJSON', 
                          encoding='utf-8').to_crs({'init': 'epsg:25830'})

In [22]:
prov_trips_gdf.head()

Unnamed: 0,user_id,tw_id_o,timestamp_o,PROV_o,tw_id_d,timestamp_d,PROV_d,dist_km,time_length,speed_km_h,geometry
0,550261599,1271251707662487553,2020-06-12T01:23:20,Lleida,1271300070848073728,2020-06-12T04:35:31,Madrid,385.841286,11531,120.460379,LINESTRING (801799.4858383657 4613560.58154440...
1,550261599,1235822698179080192,2020-03-06T07:01:06,Barcelona,1235848984700149760,2020-03-06T08:45:33,Madrid,509.802161,6267,292.849494,"LINESTRING (936098.348589811 4599731.93930922,..."
2,550261599,1263866722945134598,2020-05-22T16:18:02,Madrid,1263890111114051585,2020-05-22T17:50:59,Girona,594.25245,5577,383.594911,LINESTRING (441970.6228863231 4474288.59112672...
3,550261599,1249736667495370753,2020-04-13T16:30:15,Madrid,1249757472350285825,2020-04-13T17:52:55,Murcia,373.985274,4960,271.440924,LINESTRING (441970.6228863231 4474288.59112672...
4,550261599,1254589140030550017,2020-04-27T01:52:14,Madrid,1254636101488107521,2020-04-27T04:58:51,Barcelona,505.474929,11197,162.517616,LINESTRING (441970.6228863231 4474288.59112672...


In [23]:
prov_trips_gdf['timestamp_o']= pd.to_datetime(prov_trips_gdf['timestamp_o'])
prov_trips_gdf['timestamp_d']= pd.to_datetime(prov_trips_gdf['timestamp_d'])
prov_trips_gdf['time_length']= prov_trips_gdf['time_length'].apply(lambda x: timedelta(seconds=x))

In [24]:
prov_trips_gdf.head()

Unnamed: 0,user_id,tw_id_o,timestamp_o,PROV_o,tw_id_d,timestamp_d,PROV_d,dist_km,time_length,speed_km_h,geometry
0,550261599,1271251707662487553,2020-06-12 01:23:20,Lleida,1271300070848073728,2020-06-12 04:35:31,Madrid,385.841286,03:12:11,120.460379,LINESTRING (801799.4858383657 4613560.58154440...
1,550261599,1235822698179080192,2020-03-06 07:01:06,Barcelona,1235848984700149760,2020-03-06 08:45:33,Madrid,509.802161,01:44:27,292.849494,"LINESTRING (936098.348589811 4599731.93930922,..."
2,550261599,1263866722945134598,2020-05-22 16:18:02,Madrid,1263890111114051585,2020-05-22 17:50:59,Girona,594.25245,01:32:57,383.594911,LINESTRING (441970.6228863231 4474288.59112672...
3,550261599,1249736667495370753,2020-04-13 16:30:15,Madrid,1249757472350285825,2020-04-13 17:52:55,Murcia,373.985274,01:22:40,271.440924,LINESTRING (441970.6228863231 4474288.59112672...
4,550261599,1254589140030550017,2020-04-27 01:52:14,Madrid,1254636101488107521,2020-04-27 04:58:51,Barcelona,505.474929,03:06:37,162.517616,LINESTRING (441970.6228863231 4474288.59112672...


Read province file generated in notebook 2a

In [25]:
province_gdf = gpd.read_file(os.path.join('data', 'provinces_with_code.geojson'), encoding='utf-8').to_crs({'init': 'epsg:25830'})
province_gdf.head()

Unnamed: 0,NATCODE,NAMEUNIT,CODEUNIT,geometry
0,34104600000,València/Valencia,46,(POLYGON ((653037.6610938488 4429500.236400451...
1,34084500000,Toledo,45,"POLYGON ((294238.4652867644 4416957.42816667, ..."
2,34024400000,Teruel,44,"POLYGON ((765333.0112961649 4512252.350615062,..."
3,34094300000,Tarragona,43,(POLYGON ((813560.4348489635 4504876.753000001...
4,34074200000,Soria,42,"POLYGON ((583890.5889343297 4603311.305096235,..."


---------------------------

In [26]:
time_periods = {'P1':(0,5),
               'P2':(6,9),
               'P3':(10,16),
               'P4':(17,23),}

In [27]:
prov_trips_gdf.head(3)

Unnamed: 0,user_id,tw_id_o,timestamp_o,PROV_o,tw_id_d,timestamp_d,PROV_d,dist_km,time_length,speed_km_h,geometry
0,550261599,1271251707662487553,2020-06-12 01:23:20,Lleida,1271300070848073728,2020-06-12 04:35:31,Madrid,385.841286,03:12:11,120.460379,LINESTRING (801799.4858383657 4613560.58154440...
1,550261599,1235822698179080192,2020-03-06 07:01:06,Barcelona,1235848984700149760,2020-03-06 08:45:33,Madrid,509.802161,01:44:27,292.849494,"LINESTRING (936098.348589811 4599731.93930922,..."
2,550261599,1263866722945134598,2020-05-22 16:18:02,Madrid,1263890111114051585,2020-05-22 17:50:59,Girona,594.25245,01:32:57,383.594911,LINESTRING (441970.6228863231 4474288.59112672...


In [28]:
prov_trips_gdf['hour'] = prov_trips_gdf['timestamp_o'].apply(lambda x: x.hour)

In [29]:
import datetime

def generate_OD_matrix_fn(trips_df, 
                          time_period=None,
                          i_date=datetime.datetime.strptime('2020-02-27', '%Y-%m-%d'), 
                          e_date = datetime.datetime.now()):
    
    t_trips = trips_df[(trips_df['timestamp_o']>= i_date)& (trips_df['timestamp_o']<= e_date)]    
    
    
    if time_period:
        t_trips['hour'] = t_trips['timestamp_o'].apply(lambda x: x.hour)
        h_range = time_periods[time_period]
        t_trips = t_trips[(t_trips['hour']>= h_range[0])& (t_trips['hour']<= h_range[1])]    

    print("Number of evaluated trips: {}".format(t_trips.shape[0]))
    
    trips_od_df = t_trips.groupby('PROV_o PROV_d'.split()).agg(['count'])
    sum_ = trips_od_df.sum(level=0, axis=0)
    norm_trips_od_df= trips_od_df.div(sum_, axis=1, level=0)
    
    trips_od_df = trips_od_df[trips_od_df.columns[0][0]]
    norm_trips_od_df= norm_trips_od_df[norm_trips_od_df.columns[0][0]]
    
    trips_od_df['n_trips_norm'] = norm_trips_od_df['count']
    trips_od_df.rename(columns={'count':'n_trips'},inplace=True)
        
    
    return trips_od_df
    """
    norm_trips_od_df= trips_od_df.div(sum_, axis=1, level=0)
    return {'od_matrix': trips_od_df[trips_od_df.columns[0][0]], 'norm_od_matrix': norm_trips_od_df[norm_trips_od_df.columns[0][0]]}
    """

## Intra and inter-province trips

In [30]:
intra_inter_prov_OD = generate_OD_matrix_fn(prov_trips_gdf)
intra_inter_prov_OD.head()

Number of evaluated trips: 18034


Unnamed: 0_level_0,Unnamed: 1_level_0,n_trips,n_trips_norm
PROV_o,PROV_d,Unnamed: 2_level_1,Unnamed: 3_level_1
A Coruña,Alacant/Alicante,1,0.005618
A Coruña,Almería,1,0.005618
A Coruña,Araba/Álava,1,0.005618
A Coruña,Asturias,2,0.011236
A Coruña,Barcelona,8,0.044944


## Only inter-province trips

Global

In [31]:
inter_prov_trips_df = prov_trips_gdf[prov_trips_gdf['PROV_o']!=prov_trips_gdf['PROV_d']]
inter_prov_OD = generate_OD_matrix_fn(inter_prov_trips_df)

Number of evaluated trips: 18034


In [32]:
inter_prov_OD.to_csv(os.path.join('data','global_tw_od.csv'))

Global - hour periods

In [33]:
for p in time_periods.keys():
    inter_prov_OD = generate_OD_matrix_fn(inter_prov_trips_df, time_period=p)
    inter_prov_OD.to_csv(os.path.join('data','global_tw_od_{}.csv'.format(p)))

Number of evaluated trips: 2162
Number of evaluated trips: 4546
Number of evaluated trips: 8173
Number of evaluated trips: 3153


Only July

In [34]:
july_inter_prov_OD = generate_OD_matrix_fn(
    inter_prov_trips_df,
    i_date=datetime.datetime.strptime('2020-07-01', '%Y-%m-%d'))

Number of evaluated trips: 3758


In [35]:
july_inter_prov_OD.to_csv(os.path.join('data','july_tw_od_normalized.csv'))

Only July- hour periods

In [36]:
for p in time_periods.keys():
    july_inter_prov_p_OD = generate_OD_matrix_fn(inter_prov_trips_df, 
                                                 time_period=p, 
                                                 i_date=datetime.datetime.strptime('2020-07-01', '%Y-%m-%d'))
    july_inter_prov_p_OD.to_csv(os.path.join('data','july_tw_od_{}.csv'.format(p)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Number of evaluated trips: 393
Number of evaluated trips: 1060
Number of evaluated trips: 1757
Number of evaluated trips: 548


Pre-lockdown

In [37]:
inter_prov_OD = generate_OD_matrix_fn(
    inter_prov_trips_df,
    e_date=datetime.datetime.strptime('2020-03-12', '%Y-%m-%d'))


Number of evaluated trips: 3772


Post-lockdown

In [38]:
inter_prov_OD = generate_OD_matrix_fn(
    inter_prov_trips_df,
    i_date=datetime.datetime.strptime('2020-06-21', '%Y-%m-%d'))


Number of evaluated trips: 5261
