## set up environment

In [None]:
pip install geopandas pygeos watermark

In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pygeos
from tqdm import tqdm

  shapely_geos_version, geos_capi_version_string


In [3]:
# Load the watermark IPython extension (reloading it if it is already loaded).
%reload_ext watermark
# Record provenance for this run: author (-a), last-updated date (-u, -d),
# Python/IPython versions (-v) and the versions of imported packages (-iv).
%watermark -iv -a 'Dan Levine' -u -d -v

Author: Dan Levine

Last updated: 2022-08-22

Python implementation: CPython
Python version       : 3.7.13
IPython version      : 7.9.0

pandas   : 1.3.5
numpy    : 1.21.6
pygeos   : 0.12.0
geopandas: 0.10.2



## read in tract geometries

In [None]:
# Load the clipped tract geometries, skipping attribute fields that the
# processing below does not need.
ignored_tract_fields = [
    'STATEFP', 'COUNTYFP', 'TRACTCE', 'AFFGEOID',
    'NAME', 'LSAD', 'ALAND', 'AWATER',
]
tracts_geom = gpd.read_file(
    'raw data/tracts_clipped.zip',
    ignore_fields=ignored_tract_fields,
)

In [None]:
# Reproject the tracts to EPSG:4326, the CRS the tile geometries are
# declared with before the spatial join.
tracts_geom = tracts_geom.to_crs(epsg=4326)

In [None]:
# Store GEOID as a number rather than text.
# NOTE(review): if GEOID values carry leading zeros they are lost by this
# conversion -- confirm downstream joins expect the numeric form.
tracts_geom = tracts_geom.assign(GEOID=pd.to_numeric(tracts_geom['GEOID']))

## get, join, and process Ookla data by year

In [None]:
# Calendar years to process, and the four quarters loaded for each of them.
years = list(range(2019, 2022))
quarters = list(range(1, 5))

In [None]:
from pathlib import Path


def _weighted_tract_averages(tracts_tiles):
  """Collapse tile-level rows into one test-weighted row per tract.

  Parameters
  ----------
  tracts_tiles : DataFrame (or GeoDataFrame)
      One row per (tract, tile) match. Must contain the columns 'GEOID',
      'tests', 'avg_d_kbps', 'avg_u_kbps' and 'avg_lat_ms'.

  Returns
  -------
  DataFrame indexed by GEOID with columns 'weighted_avg_d_kbps',
  'weighted_avg_u_kbps', 'weighted_avg_lat_ms' (each the sum over the
  tract's rows of metric * tests / total tests in the tract) and
  'total_tract_tests' (the sum of 'tests' over the tract's rows).
  """
  metric_columns = ['avg_d_kbps', 'avg_u_kbps', 'avg_lat_ms']

  # Work on a plain, freshly indexed copy: the joined frame repeats its index
  # once per matched tile, and the geometry is not needed for the arithmetic.
  tiles = tracts_tiles[['GEOID', 'tests'] + metric_columns].reset_index(drop=True)

  # transform('sum') returns the tract total on every row of that tract, so no
  # index alignment against a duplicated GEOID index is required.
  tiles['total_tract_tests'] = tiles.groupby('GEOID')['tests'].transform('sum')
  tiles['weighting_by_tests'] = tiles['tests'] / tiles['total_tract_tests']

  weighted_columns = []
  for metric in metric_columns:
    weighted_name = f'weighted_{metric}'
    tiles[weighted_name] = tiles[metric] * tiles['weighting_by_tests']
    weighted_columns.append(weighted_name)

  by_tract = tiles.groupby('GEOID')
  results = by_tract[weighted_columns].sum()
  results['total_tract_tests'] = by_tract['tests'].sum()
  return results


# Create the output folder up front so a missing directory fails immediately,
# not after minutes of loading and joining.
output_dir = Path('processed data')
output_dir.mkdir(parents=True, exist_ok=True)

for year in years:

  # Load and stack the four quarterly tile files for this year.
  year_data_list = []

  for quarter in tqdm(quarters, desc=str(year)):

    ## to fetch directly from source, uncomment below and comment out local path
    #month = str((1 + (quarter-1)*3)).zfill(2)
    #quarter_data_path = f'https://ookla-open-data.s3.amazonaws.com/parquet/performance/type=fixed/year={year}/quarter={quarter}/{year}-{month}-01_performance_fixed_tiles.parquet'

    ## to load from local directory
    quarter_data_path = f'raw data/ookla/{year}-Q{quarter}.parquet'

    quarter_data = pd.read_parquet(quarter_data_path)

    # filter to North America: keep tiles whose quadkey starts with 00, 01, 02 or 03
    quarter_data = quarter_data[quarter_data['quadkey'].str.slice(stop=2).isin(['00','01','02','03'])]

    year_data_list.append(quarter_data)

  year_data = pd.concat(year_data_list, ignore_index=True)

  # Parse the WKT in the 'tile' column into geometries so tiles can be
  # spatially joined to the tracts (both in EPSG:4326).
  year_data_geo = gpd.GeoDataFrame(
    data = year_data[['avg_d_kbps', 'avg_u_kbps', 'avg_lat_ms', 'tests','devices']],
    geometry = gpd.GeoSeries.from_wkt(
        year_data['tile'],
        crs = 'EPSG:4326'
    )
  )

  # One row per (tract, tile) pair matched by the spatial join; a tile that
  # matches several tracts contributes to each of them.
  year_tracts_tiles = tracts_geom.sjoin(year_data_geo)

  year_weighted_results = _weighted_tract_averages(year_tracts_tiles)

  year_weighted_results.to_csv(output_dir / f'{year}__ookla_by_tract.csv')
  print(f'___ saved: {year} ___')

100%|██████████| 4/4 [01:18<00:00, 19.60s/it]


___ saved: 2019 ___


100%|██████████| 4/4 [01:13<00:00, 18.33s/it]
