In [1]:
import pandas as pd
import geopandas as gpd
from geopandas.tools import sjoin
import numpy as np
from tqdm import tqdm_notebook

In [2]:
import os
tweets_path = os.path.join('data','TWT','raw')

In [3]:
tweets_path

'data/TWT/raw'

In [4]:
import datetime
e_date = datetime.datetime.strptime('2022-03-15', '%Y-%m-%d')
i_date = datetime.datetime.strptime('2022-03-01', '%Y-%m-%d')

In [5]:
import json

def read_tweets_from_file_fn(f):
    geo_point_tweets = []
    geo_poly_tweets_info = []
    geo_poly_tweets_geom = []

    #for file in tqdm_notebook(filenames, desc="Reading files"):
    #    filepath= dataPath + file
    with open(f, encoding='ascii') as fp:
        #print(f, end=" ")
        line = fp.readline()
        while line:
            try:
                tw = json.loads(line)
            except:
                print("Corrupted tweet")
                line = fp.readline()
                continue

            
            #point-based coordinates
            if tw['coordinates']:
                lon = tw['coordinates']['coordinates'][0]
                lat = tw['coordinates']['coordinates'][1]
                if (lat > 35.86) & (lat < 42.99) & (lon > -9.57) &  (lon < 4.39):
                    geo_point_tweets.append((tw['id'], tw['user']['id'],tw['text'], tw['created_at'], lon, lat))
                    #geo_tag_tweets.append(tweet)
                    
            #polygon-based coordinates        
            elif tw['place']['country_code']=='ES':
                
                geo_poly_tweets_info.append((tw['id'], tw['user']['id'],
                                        tw['text'], 
                                        tw['created_at'], 
                                        tw['place']['full_name'], 
                                        tw['place']['place_type']))
                
                geo_poly_tweets_geom.append(tw['place']['bounding_box'])
            
            line = fp.readline()
    
    return geo_point_tweets, geo_poly_tweets_info, geo_poly_tweets_geom

In [6]:
import os
def search_for_files(listOfFactorNames, path):
    for f in os.listdir(path):
        for factor in listOfFactorNames:
            if factor in f:
                yield f
                break

In [7]:
def convert_point_tweets_fn(p_tweets):
    df = pd.DataFrame.from_records(p_tweets, columns='tw_id user_id text timestamp lon lat'.split())
    gdf = gpd.GeoDataFrame(df, crs={'init': 'epsg:4326'}, geometry=gpd.points_from_xy(df.lon, df.lat))
    
    return gdf

In [8]:
from shapely.geometry import Polygon

def convert_polygon_tweets_fn(poly_tweets_info, poly_tweets_geom):
    poly_geom = []
    
    #convert bounding box to shapely polygons
    for bbox in poly_tweets_geom:

        coords_list = bbox['coordinates'][0]
        coords_tuple = [tuple(c) for c in coords_list]
        polygon = Polygon(coords_tuple)
        #print(coords_list)
        poly_geom.append(polygon)

    df = pd.DataFrame.from_records(poly_tweets_info, columns='tw_id user_id text timestamp place_name place_type'.split())
    gdf = gpd.GeoDataFrame(df, crs={'init': 'epsg:4326'}, geometry=poly_geom)

    return gdf

In [9]:
from datetime import date, timedelta


def convert_tweets_to_geodf_fn():

    delta = e_date - i_date       # as timedelta

    print("Processing days:")
    target_days = []
    for i in tqdm_notebook(range(delta.days + 1)):
        day = i_date + timedelta(days=i)
        day_str= day.strftime('%d-%m-%Y')
        files = list(search_for_files([day_str], tweets_path))
        if len(files)>0:
            print(day_str, end=', ')
            point_tweets = []
            poly_tweets_info= []
            poly_tweets_bbox= []
            for f in files:
                point_tweets_f, poly_tweets_info_f, poly_tweets_bbox_f = read_tweets_from_file_fn(os.path.join(tweets_path, f))
                point_tweets = point_tweets + point_tweets_f
                poly_tweets_info = poly_tweets_info + poly_tweets_info_f
                poly_tweets_bbox = poly_tweets_bbox + poly_tweets_bbox_f

            poly_tweets_gdf = convert_polygon_tweets_fn(poly_tweets_info, poly_tweets_bbox)
            point_tweets_gdf = convert_point_tweets_fn(point_tweets)

            day_for_file= day.strftime('%Y-%m-%d')

            poly_tweets_gdf.to_file("data/TWT/processed/poly_tweets_{}.geojson".format(day_for_file), driver='GeoJSON', encoding='utf-8')
            point_tweets_gdf.to_file("data/TWT/processed/point_tweets_{}.geojson".format(day_for_file), driver='GeoJSON', encoding='utf-8')

In [10]:
convert_tweets_to_geodf_fn()

Processing days:


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(delta.days + 1)):


  0%|          | 0/15 [00:00<?, ?it/s]

01-03-2022, 

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


02-03-2022, 

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


03-03-2022, 

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


04-03-2022, 

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


06-03-2022, 

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


07-03-2022, 

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


08-03-2022, 

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


09-03-2022, 

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


10-03-2022, 

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


11-03-2022, 

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


13-03-2022, 

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


14-03-2022, 

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


15-03-2022, 

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
