In [1]:
import pandas as pd
import geopandas as gpd
from geopandas.tools import sjoin
import numpy as np
from shapely.geometry import Point, LineString
from tqdm import tqdm_notebook
import math
from geoalchemy2 import Geometry, WKTElement
from sqlalchemy import *
import psycopg2

  """)


In [2]:
# Folder with the crawler's raw streaming dumps. The trailing slash is kept so
# that file names can be appended to it directly.
tweets_path = "/home/fterroso/projects/twitter-crawler/streaming_tweets/"
# Local base data directory.
data_path = "/home/fterroso/data/"

In [3]:
import datetime
# Date window to process; both ends are included by the day loop further down.
# Use datetime.datetime.now() for e_date instead to run up to the current moment.
i_date = datetime.datetime(2020, 7, 22)
e_date = datetime.datetime(2020, 8, 1)

In [4]:
import json

def read_tweets_from_file_fn(f):
    """Read a line-delimited JSON tweet file and keep the geolocated tweets of interest.

    Every non-blank line of ``f`` is decoded with ``json.loads``. A record is
    then classified as follows:

    * If it carries point ``coordinates``, it is kept only when the point falls
      inside the fixed latitude/longitude window defined below. A record with
      point coordinates is never considered for the polygon branch, even when
      the point is outside the window.
    * Otherwise, if it has a ``place`` whose ``country_code`` is ``'ES'``, its
      descriptive fields and the place ``bounding_box`` are kept.

    Records with neither usable ``coordinates`` nor a ``place`` (including
    records that lack those keys altogether) are skipped instead of raising.

    Parameters
    ----------
    f : str
        Path of the file to read; one JSON document per line.

    Returns
    -------
    tuple of (list, list, list)
        ``geo_point_tweets``: tuples ``(id, user id, text, created_at, lon, lat)``.
        ``geo_poly_tweets_info``: tuples
        ``(id, user id, text, created_at, place full_name, place place_type)``.
        ``geo_poly_tweets_geom``: the ``bounding_box`` value of each place, in
        the same order as ``geo_poly_tweets_info``.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a retained record lacks one of the fields listed above.
    """
    # Latitude/longitude window applied to point tweets.
    # NOTE(review): looks like a rough box around mainland Spain - confirm.
    min_lat, max_lat = 35.86, 43.74
    min_lon, max_lon = -9.57, 4.39

    geo_point_tweets = []
    geo_poly_tweets_info = []
    geo_poly_tweets_geom = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(f, encoding='utf-8') as fp:
        for line in fp:
            # Blank separator lines are not JSON documents; skip them.
            if not line.strip():
                continue
            tw = json.loads(line)

            # point-based coordinates
            coordinates = tw.get('coordinates')
            if coordinates:
                lon = coordinates['coordinates'][0]
                lat = coordinates['coordinates'][1]
                if min_lat < lat < max_lat and min_lon < lon < max_lon:
                    geo_point_tweets.append(
                        (tw['id'], tw['user']['id'], tw['text'], tw['created_at'], lon, lat)
                    )
                continue

            # polygon-based coordinates; ``place`` may be null or absent
            place = tw.get('place')
            if place and place.get('country_code') == 'ES':
                geo_poly_tweets_info.append((tw['id'], tw['user']['id'],
                                             tw['text'],
                                             tw['created_at'],
                                             place['full_name'],
                                             place['place_type']))
                geo_poly_tweets_geom.append(place['bounding_box'])

    return geo_point_tweets, geo_poly_tweets_info, geo_poly_tweets_geom

In [5]:
import os
def search_for_files(listOfFactorNames, path):
    """Lazily yield the entries of ``path`` whose name contains any given substring.

    Parameters
    ----------
    listOfFactorNames : iterable of str
        Substrings to look for in each entry name.
    path : str
        Directory whose entries (as returned by ``os.listdir``) are scanned.

    Yields
    ------
    str
        Each matching entry name (not the full path), at most once per entry
        and in ``os.listdir`` order.
    """
    for entry in os.listdir(path):
        # any() stops at the first matching substring, so an entry is yielded once.
        if any(token in entry for token in listOfFactorNames):
            yield entry

In [6]:
def convert_point_tweets_fn(p_tweets):
    """Build a point GeoDataFrame from point-tweet records.

    Parameters
    ----------
    p_tweets : list of tuple
        Records of the form ``(tw_id, user_id, text, timestamp, lon, lat)``.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per record with the six input columns plus a point geometry
        built from ``lon`` (x) and ``lat`` (y), tagged as EPSG:4326.
    """
    columns = ['tw_id', 'user_id', 'text', 'timestamp', 'lon', 'lat']
    df = pd.DataFrame.from_records(p_tweets, columns=columns)
    # 'EPSG:4326' replaces the deprecated {'init': 'epsg:4326'} spelling, which
    # triggers a FutureWarning with pyproj 2+.
    gdf = gpd.GeoDataFrame(df, crs='EPSG:4326', geometry=gpd.points_from_xy(df.lon, df.lat))

    return gdf

In [7]:
from shapely.geometry import Polygon

def convert_polygon_tweets_fn(poly_tweets_info, poly_tweets_geom):
    """Build a polygon GeoDataFrame from place-tagged tweet records.

    Parameters
    ----------
    poly_tweets_info : list of tuple
        Records of the form
        ``(tw_id, user_id, text, timestamp, place_name, place_type)``.
    poly_tweets_geom : list of dict
        One bounding box per record, in the same order. Each is indexed as
        ``bbox['coordinates'][0]`` to obtain the ring of coordinate pairs.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per record with the six input columns plus a polygon geometry
        built from the matching bounding box, tagged as EPSG:4326.
    """
    # convert bounding boxes to shapely polygons (first ring of each box)
    poly_geom = [
        Polygon([tuple(c) for c in bbox['coordinates'][0]])
        for bbox in poly_tweets_geom
    ]

    columns = ['tw_id', 'user_id', 'text', 'timestamp', 'place_name', 'place_type']
    df = pd.DataFrame.from_records(poly_tweets_info, columns=columns)
    # 'EPSG:4326' replaces the deprecated {'init': 'epsg:4326'} spelling, which
    # triggers a FutureWarning with pyproj 2+.
    gdf = gpd.GeoDataFrame(df, crs='EPSG:4326', geometry=poly_geom)

    return gdf

In [8]:
from datetime import date, timedelta


def convert_tweets_to_geodf_fn():
    """Convert the raw tweet files of each configured day into per-day GeoJSON files.

    For every day between the module-level ``i_date`` and ``e_date`` (both
    included), the files in ``tweets_path`` whose name contains the day
    formatted as ``dd-mm-YYYY`` are read with ``read_tweets_from_file_fn``.
    Their point and polygon tweets are merged and written to
    ``data/point_tweets_<day>.geojson`` and ``data/poly_tweets_<day>.geojson``.
    Days without any matching file are skipped. The name of each processed day
    is printed as progress output.

    Returns
    -------
    None
    """
    n_days = (e_date - i_date).days + 1

    print("Processing days:")
    for offset in range(n_days):
        day_str = (i_date + timedelta(days=offset)).strftime('%d-%m-%Y')
        files = list(search_for_files([day_str], tweets_path))
        if not files:
            continue

        print(day_str, end=', ')
        point_tweets = []
        poly_tweets_info = []
        poly_tweets_bbox = []
        for f in files:
            # os.path.join works whether or not tweets_path ends with a separator.
            point_tweets_f, poly_tweets_info_f, poly_tweets_bbox_f = read_tweets_from_file_fn(
                os.path.join(tweets_path, f)
            )
            # extend() grows the lists in place instead of copying them per file.
            point_tweets.extend(point_tweets_f)
            poly_tweets_info.extend(poly_tweets_info_f)
            poly_tweets_bbox.extend(poly_tweets_bbox_f)

        poly_tweets_gdf = convert_polygon_tweets_fn(poly_tweets_info, poly_tweets_bbox)
        point_tweets_gdf = convert_point_tweets_fn(point_tweets)

        # Make sure the output folder exists before writing into it.
        os.makedirs("data", exist_ok=True)
        poly_tweets_gdf.to_file("data/poly_tweets_{}.geojson".format(day_str), driver='GeoJSON', encoding='utf-8')
        point_tweets_gdf.to_file("data/point_tweets_{}.geojson".format(day_str), driver='GeoJSON', encoding='utf-8')

In [9]:
# Run the conversion for the date window configured above; the name of each
# processed day is printed as progress.
convert_tweets_to_geodf_fn()

Processing days:
22-07-2020, 23-07-2020, 24-07-2020, 25-07-2020, 26-07-2020, 27-07-2020, 28-07-2020, 29-07-2020, 30-07-2020, 31-07-2020, 01-08-2020, 