In [58]:
import pandas as pd
import geopandas as gpd
from tqdm import tqdm_notebook

import os
from geoalchemy2 import WKTElement


import matplotlib.pyplot as plt
%matplotlib inline

# Base directory that holds the 'Spain-TWT-dataset_march_july_2020' folder read by
# the tweet-processing cells. With an empty string, os.path.join() produces paths
# relative to the current working directory.
data_path = ''

In [59]:
import warnings
warnings.filterwarnings("ignore")

## Read mobility areas (MAs)

In [60]:
# Load the mobility-area polygons. The path is built from the literal 'data'
# folder, so it is resolved against the working directory, not `data_path`.
ma_geojson_path = os.path.join('data', 'mov_areas_enriched.geojson')
ma_gdf = gpd.read_file(ma_geojson_path)

In [61]:
# Preview the first rows of the mobility-area table (head() defaults to five rows).
ma_gdf.head()

Unnamed: 0,ID_GRUPO,Shape_Area,CPRO,NPRO,POB_GRUPO,LITERAL_GRUPO,geometry
0,001A,82378410.0,3,Alicante,7.903,"Montesinos, Los y Algorfa","POLYGON ((696312.177 4215979.057, 696312.181 4..."
1,001B,145293100.0,8,Barcelona,14.529,Sant Joan de Vilatorrada y otros municipios,"POLYGON ((898992.611 4638407.292, 898992.603 4..."
2,001M,475652600.0,28,Madrid,5.149,Fuentidueña de Tajo y otros municipios,"POLYGON ((494216.778 4445259.734, 494216.385 4..."
3,001S,1436231000.0,41,Sevilla,6.733,"Real de la Jara, El y otros municipios","POLYGON ((240629.599 4209416.214, 240629.597 4..."
4,001V,11264220.0,46,Valencia/Valéncia,6.277,Daimús y otros municipios,"POLYGON ((748392.315 4317155.513, 748392.289 4..."


In [62]:
# (number of rows, number of columns) of the mobility-area table.
ma_gdf.shape

(3214, 7)

In [63]:
# Column names, dtypes and non-null counts of the mobility-area table.
ma_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   ID_GRUPO       3214 non-null   object  
 1   Shape_Area     3214 non-null   float64 
 2   CPRO           3214 non-null   int64   
 3   NPRO           3214 non-null   object  
 4   POB_GRUPO      3214 non-null   float64 
 5   LITERAL_GRUPO  3214 non-null   object  
 6   geometry       3214 non-null   geometry
dtypes: float64(2), geometry(1), int64(1), object(3)
memory usage: 175.9+ KB


# Read Tweets

In [64]:
import datetime
# First and last day of the study window, parsed as naive datetimes at midnight.
# (The end date can be replaced by datetime.datetime.now() to run up to today.)
i_date, e_date = (
    datetime.datetime.strptime(day_text, '%Y-%m-%d')
    for day_text in ('2020-03-20', '2020-06-20')
)

In [65]:
from datetime import date, timedelta

# Length of the study window as a timedelta.
delta = e_date - i_date

# One datetime per calendar day from i_date to e_date, both ends included
# (hence the "+ 1").
target_days = [i_date + timedelta(days=offset) for offset in range(delta.days + 1)]

### Point-based tweets

In [66]:
def map_point_tweets_with_target_areas_fn(gdf_t, target_areas_gdf):
    """Assign point-geometry tweets to the target areas they intersect.

    Parameters
    ----------
    gdf_t : geopandas.GeoDataFrame
        Tweets to map. Presumably both frames share the same CRS; this
        function does not reproject (verify against the caller).
    target_areas_gdf : geopandas.GeoDataFrame
        Areas the tweets are joined against.

    Returns
    -------
    geopandas.GeoDataFrame
        Result of an inner spatial join (one row per tweet/area intersection)
        with the attributes of the matching area appended and the helper
        columns ``index_right``, ``lat``, ``lon`` and ``text`` removed.
    """
    # NOTE(review): newer geopandas releases name this keyword `predicate`;
    # `op` is kept so the call matches the other sjoin calls in this notebook.
    joined_gdf = gpd.sjoin(gdf_t, target_areas_gdf, how="inner", op='intersects')

    # errors='ignore' keeps the cleanup from failing when an input file lacks
    # one of the optional columns, mirroring the guard used for polygon tweets.
    return joined_gdf.drop(columns=['index_right', 'lat', 'lon', 'text'], errors='ignore')

### Polygon-based tweets

In [67]:
# Despite the "max" in its name, this is the 99th percentile of the Shape_Area
# column of the mobility areas, not the maximum.
shape_area_values = ma_gdf['Shape_Area']
max_mob_area_shape = shape_area_values.quantile(q=0.99)
max_mob_area_shape

2239553275.937497

In [68]:
def map_centroid_tweets_with_target_areas_fn(gdf_t, target_areas_gdf):
    """Assign centroid-geometry tweets to the target areas they intersect.

    Runs an inner spatial join between ``gdf_t`` and ``target_areas_gdf`` and
    returns the joined frame without the ``index_right`` helper column and,
    when present, without the ``text`` column.
    """
    joined_gdf = gpd.sjoin(gdf_t, target_areas_gdf, how="inner", op='intersects')

    # 'text' is optional in the input, so it is only dropped when it exists.
    has_text = 'text' in joined_gdf.columns
    unwanted_columns = 'index_right text'.split() if has_text else 'index_right'.split()
    return joined_gdf.drop(columns=unwanted_columns)

In [69]:
def read_polygon_tweet_file_fn(file_path):
    """Read a polygon-tweet GeoJSON file and reduce each geometry to its centroid.

    Parameters
    ----------
    file_path : str
        Path of the GeoJSON file to read.

    Returns
    -------
    geopandas.GeoDataFrame
        The file contents reprojected to EPSG:25830, with the ``geometry``
        column replaced by the centroid of each original geometry.
    """
    # NOTE(review): `parse_dates` is forwarded by read_file to the underlying
    # reader; confirm that it actually affects the 'timestamp' column.
    gdf_ = gpd.read_file(file_path, parse_dates=['timestamp'], driver='GeoJSON', encoding='utf-8').to_crs({'init': 'epsg:25830'})

    # Centroids are computed after reprojection, i.e. in projected coordinates.
    # Overwriting the geometry column directly avoids the drop/rename round trip
    # (and the positional `axis` argument of DataFrame.drop, removed in pandas 2.0).
    gdf_['geometry'] = gdf_['geometry'].centroid

    return gdf_

In [70]:
def calculate_twt_stats(days=None, dataset_dir=None):
    """Count the tweets and the distinct users found in the daily tweet files.

    For every day, the files ``point_tweets_<dd-mm-YYYY>.geojson`` and
    ``poly_tweets_<dd-mm-YYYY>.geojson`` are read when they exist; missing
    files are skipped silently.

    Parameters
    ----------
    days : iterable of datetime.datetime, optional
        Days to scan. Defaults to the notebook-level ``target_days``.
    dataset_dir : str, optional
        Folder containing the daily files. Defaults to
        ``os.path.join(data_path, 'Spain-TWT-dataset_march_july_2020')``.

    Returns
    -------
    tuple (int, set)
        Total number of rows read across all files, and the set of distinct
        ``user_id`` values seen in them.
    """
    if days is None:
        days = target_days
    if dataset_dir is None:
        dataset_dir = os.path.join(data_path, 'Spain-TWT-dataset_march_july_2020')

    global_n_tweets = 0
    global_n_users = set()

    for d in tqdm_notebook(days):
        day_tag = d.strftime('%d-%m-%Y')

        # Point-based tweets first, then tweets with polygon geometry.
        for file_prefix in ('point_tweets', 'poly_tweets'):
            file_path = os.path.join(dataset_dir, '{}_{}.geojson'.format(file_prefix, day_tag))
            if not os.path.exists(file_path):
                continue

            # Only the row count and the user ids are needed, so the frame is
            # not reprojected here.
            gdf = gpd.read_file(file_path, parse_dates=['timestamp'], driver='GeoJSON', encoding='utf-8')

            global_n_tweets += gdf.shape[0]
            global_n_users.update(gdf['user_id'].unique().tolist())

    return global_n_tweets, global_n_users

In [71]:
# Scan the daily tweet files (up to two GeoJSON files per target day) and keep
# the total tweet count and the set of distinct user ids.
global_n_tweets, global_n_users= calculate_twt_stats()

HBox(children=(IntProgress(value=0, max=93), HTML(value='')))

Total number of tweets

In [72]:
# Total number of tweet rows read from all daily files that were found.
global_n_tweets

8210773

Total number of unique users

In [73]:
# global_n_users is a set of user ids, so its length is the number of distinct users.
len(global_n_users)

190100

--------------------------------

# Perform the mapping process

In [57]:
# Map each day's tweets onto the mobility areas (ma_gdf).
#
# The tweet/user totals are intentionally NOT updated here: they are computed by
# calculate_twt_stats(), and adding to global_n_tweets again in this loop would
# double-count every tweet.
tweets_dir = os.path.join(data_path, 'Spain-TWT-dataset_march_july_2020')

for d in tqdm_notebook(target_days):
    day_tag = d.strftime('%d-%m-%Y')
    mapped_frames = []

    # Point-based tweets: reproject to EPSG:25830 and join against the areas.
    file_path = os.path.join(tweets_dir, 'point_tweets_{}.geojson'.format(day_tag))
    if os.path.exists(file_path):
        gdf = gpd.read_file(file_path, parse_dates=['timestamp'], driver='GeoJSON', encoding='utf-8').to_crs({'init': 'epsg:25830'})
        mapped_frames.append(map_point_tweets_with_target_areas_fn(gdf, ma_gdf))

    # Tweets with polygon geometry: represented by their centroid before the join.
    file_path = os.path.join(tweets_dir, 'poly_tweets_{}.geojson'.format(day_tag))
    if os.path.exists(file_path):
        gdf = read_polygon_tweet_file_fn(file_path)
        mapped_frames.append(map_centroid_tweets_with_target_areas_fn(gdf, ma_gdf))

    # Nothing to merge when neither file exists for this day.
    if not mapped_frames:
        continue

    # Merge whatever was found for the day and order it chronologically.
    ordered_gdf = pd.concat(mapped_frames, axis=0).sort_values(by='timestamp', ascending=True)

    # Writing the daily result to disk is left disabled, as in the original cell;
    # uncomment the next line to persist one GeoJSON file per day.
    #ordered_gdf.to_file(os.path.join('data','tw_in_ma_{}.geojson'.format(d.strftime('%Y_%m_%d'))), driver="GeoJSON")

HBox(children=(IntProgress(value=0, max=93), HTML(value='')))

KeyboardInterrupt: 

In [None]:
# Visual marker that the notebook ran through to its last cell.
print("That's all folks")