In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bokeh.plotting import figure, output_file, show, output_notebook
from bokeh.models import Span
from shapely.geometry import Point
import geopandas as gpd
import glob
from datetime import datetime
import bokeh
from bokeh.layouts import Row, column, gridplot
from bokeh.models import Title, Legend, TapTool, Range1d, TabPanel, Tabs
import matplotlib as mpl

from bokeh.core.validation.warnings import EMPTY_LAYOUT, MISSING_RENDERERS
bokeh.core.validation.silence(EMPTY_LAYOUT, True)
bokeh.core.validation.silence(MISSING_RENDERERS, True)

import dask.dataframe as dd
from sklearn.preprocessing import StandardScaler
import dask_geopandas as dg




In [2]:
color_palette = [  '#4E79A7',  # Blue
    '#F28E2B',  # Orange
    '#E15759',  # Red
    '#76B7B2',  # Teal
    '#59A14F',  # Green
    '#EDC948',  # Yellow
    '#B07AA1',  # Purple
    '#FF9DA7',  # Pink
    '#9C755F',  # Brown
    '#BAB0AC',  # Gray
    '#7C7C7C',  # Dark gray
    '#6B4C9A',  # Violet
    '#D55E00',  # Orange-red
    '#CC61B0',  # Magenta
    '#0072B2',  # Bright blue
    '#329262',  # Peacock green
    '#9E5B5A',  # Brick red
    '#636363',  # Medium gray
    '#CD9C00',  # Gold
    '#5D69B1',  # Medium blue
]

# Ookla's Number of Speedtest Intelligence Users

Ookla Speedtests are a way for users to find out the download speed, upload speed and latency of their internet connection. This is a dataset of the number of users who took the speedtest. To account for change, we take the baseline number of users from November 2022 and show the percent change in daily users and change in weekly users. 

In [3]:
# extract all speedtest intelligence files
ios = pd.DataFrame()
stdesktop = pd.DataFrame()
android = pd.DataFrame()
stnet = pd.DataFrame()


android = dd.read_csv(glob.glob(f'../../data/ookla-speedtest/raw/android_*.csv'), low_memory=False, usecols = ['test_id', 'test_date', 'client_latitude', 'client_longitude', 'client_country'])
android['type'] = 'android'
ios = dd.read_csv(glob.glob(f'../../data/ookla-speedtest/raw/ios_*.csv'), low_memory=False, usecols = ['test_id', 'test_date', 'client_latitude', 'client_longitude', 'client_country'])
ios['type'] = 'ios'
stnet = dd.read_csv(glob.glob(f'../../data/ookla-speedtest/raw/stnet_*.csv'), low_memory=False, usecols = ['test_id', 'test_date', 'client_latitude', 'client_longitude', 'client_country'])
stnet['type'] = 'stnet'
stdesktop = dd.read_csv(glob.glob(f'../../data/ookla-speedtest/raw/stdesktop_*.csv'), low_memory=False, usecols = ['test_id', 'test_date', 'client_latitude', 'client_longitude', 'client_country'])
stdesktop['type'] = 'stdesktop'

In [5]:
# choose only the records where the records are from November 2022 till date and are taken in Turkey
ddf = dd.concat([android, ios, stnet, stdesktop])
ddf = ddf[ddf['client_country']=='Syria']
ddf['test_date'] = dd.to_datetime(ddf['test_date'])
meta = ('test_date', 'datetime64[ns]')
#df['test_date'] = df['test_date'].apply(lambda x: x + timedelta(hours=3), meta=meta)
ddf = ddf[(ddf['test_date']>'2022-11-01')&(ddf['test_date']<'2023-03-16')]

# create a column for the number of 
#ddf['user_count'] = 1
ddf['date'] = ddf['test_date'].dt.date

In [6]:
# count the number of test_ids that were taken on a given day
ddf = ddf[['test_id', 'date', 'client_latitude', 'client_longitude', 'type']].groupby(['client_latitude','client_longitude', 'date', 'type']).count()[['test_id']].reset_index()

In [26]:
gddf = dg.from_dask_dataframe(
    ddf,
    geometry=dg.points_from_xy(ddf, "client_longitude", "client_latitude"),
).set_crs("EPSG:4326")

In [13]:
# read turkey shapefile and areas of interest
syria_adm3 = gpd.read_file('../../data/shapefiles/syr_pplp_adm4_unocha_20210113/syr_admbnda_adm3_uncs_unocha_20201217.json')
aoi = gpd.read_file('../../data/SYRTUR_tessellation.geojson')
# affected_adm1 = list(syria_adm3[syria_adm3['adm1'].isin(aoi['ADM1_PCODE'].unique())]['adm1_en'].unique())
affected_adm2 = list(syria_adm3[syria_adm3['ADM2_PCODE'].isin(aoi['ADM2_PCODE'].unique())]['ADM2_EN'].unique())

In [22]:
syria_adm3 = syria_adm3.drop(columns= ['Shape_Leng', 'Shape_Area', 'ADM3_AR',
       'ADM3_REF', 'ADM3ALT1EN', 'ADM3ALT2EN', 'ADM3ALT1AR', 'ADM3ALT2AR', 'ADM2_AR', 'ADM1_AR',
       'ADM0_EN', 'ADM0_AR', 'ADM0_PCODE', 'date', 'validOn', 'validTo'])

In [27]:
gddf = gddf.sjoin(syria_adm3)
gddf = gddf.drop(['client_latitude', 'client_longitude'], axis=1)

# add up all the test ids that were taken for that admin 2 level
gddf = gddf.groupby(['ADM2_EN', 'ADM1_EN', 'date', 'ADM3_EN']).sum().reset_index()



In [28]:
ooklaUsers = gddf.compute()
ooklaUsers['date'] = pd.to_datetime(ooklaUsers['date'])

In [30]:
ooklaUsers.rename(columns = {'test_id': 'n_users'}, inplace=True)
ooklaUsers = ooklaUsers[['ADM1_EN', 'ADM2_EN', 'ADM3_EN','date', 'n_users']]

In [31]:
ooklaUsers['date'] = pd.to_datetime(ooklaUsers['date'])
#ooklaUsers = ooklaUsers.groupby(['adm2_en', 'date']).sum('n_users').reset_index()
ooklaUsers = ooklaUsers[(ooklaUsers['date']>'2023-01-03')&(ooklaUsers['date']<'2023-03-13')]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ooklaUsers['date'] = pd.to_datetime(ooklaUsers['date'])


In [32]:
baseline = ooklaUsers[ooklaUsers["date"].between("2022-11-01", "2023-11-30")]

In [34]:
scalers = {}

for adm3 in baseline["ADM3_EN"].unique():
    scaler = StandardScaler()
    scaler.fit(baseline[baseline["ADM3_EN"] == adm3][["n_users"]])

    scalers[adm3] = scaler

In [35]:
baseline = baseline.groupby(["ADM3_EN"]).agg({"n_users": ["mean", "std"]})
baseline.columns = baseline.columns.map(".".join)

In [36]:
ooklaUsersChange = ooklaUsers.merge(baseline, on = ['ADM3_EN'], how = 'left')

In [37]:
ooklaUsersChange['n_baseline'] = ooklaUsersChange['n_users.mean']
ooklaUsersChange["n_difference"] = ooklaUsersChange["n_users"] - ooklaUsersChange["n_baseline"]
ooklaUsersChange["percent_change"] = 100 * (ooklaUsersChange["n_users"] / (ooklaUsersChange["n_baseline"]) - 1)

In [38]:
for adm2, scaler in scalers.items():
    try:
        predicate = ooklaUsersChange["ADM3_EN"] == adm2
        activity = scaler.transform(ooklaUsersChange[predicate][["n_users"]])
        ooklaUsersChange.loc[predicate, "z_score"] = activity
    except:
        pass

In [39]:
#week['week_date'] = df['date'] - pd.to_timedelta(7, unit='d')

# #calculate sum of values, grouped by week
week = ooklaUsersChange.groupby([pd.Grouper(key='date', freq='W-MON'), 'ADM1_EN', 'ADM2_EN', 'ADM3_EN']).mean().reset_index()

#week['user_count'] = week['user_count'].apply(lambda x: round(x,0))

In [41]:
bokeh.core.validation.silence(EMPTY_LAYOUT, True)

def get_line_plot(ooklaUsers,title, source, earthquakes=False, subtitle=None, measure = 'percent_change'):

    p2 = figure(x_axis_type = 'datetime', width = 800, height = 400, toolbar_location='above')
    p2.add_layout(Legend(), "right")

    for id, adm2 in enumerate(ooklaUsers['ADM3_EN'].unique()):
        df = ooklaUsers[ooklaUsers['ADM3_EN']==adm2][['date', measure]].reset_index(drop=True)
        p2.line(df['date'], df[measure], line_width=2, line_color = color_palette[id], legend_label=adm2)

    p2.legend.click_policy='hide'
    if subtitle is not None:
        p2.title = subtitle


    title_fig = figure(title=title, toolbar_location=None,width=800, height=40, )
    title_fig.title.align = "left"
    title_fig.title.text_font_size = "20pt"
    title_fig.border_fill_alpha = 0
    title_fig.outline_line_width=0

    #with silence(MISSING_RENDERERS):
    sub_title = figure(title=source, toolbar_location=None,width=800, height=40, )
    sub_title.title.align = "left"
    sub_title.title.text_font_size = "10pt"
    sub_title.title.text_font_style="normal"
    sub_title.border_fill_alpha = 0
    sub_title.outline_line_width=0

    layout = column(title_fig, p2, sub_title)

    if earthquakes:
        p2.renderers.extend([
        Span(
            location=datetime(2023, 2, 6),
            dimension="height",
            line_color='#7C7C7C',
            line_width=2,
            line_dash=(4,4)
      ),
        Span(
            location=datetime(2023, 2, 20),
            dimension="height",
            line_color='#7C7C7C',
            line_width=2,
            line_dash=(4,4)
        ),
    ]
)

    return layout

In [42]:
output_notebook()

tabs = []

for adm in affected_adm2:
    df = ooklaUsersChange[ooklaUsersChange['ADM2_EN']==adm] 

    tabs.append(
        TabPanel(
        child=get_line_plot(df, f"Number of daily users taking speedtest", "Source: Ookla Data for Good", earthquakes=True, subtitle = '% change compared to a 3 month prior baseline'),
                    title=adm.capitalize(),
                )
                )

tabs = Tabs(tabs=tabs, sizing_mode="scale_both")
show(tabs, warn_on_missing_glyphs=False)

In [44]:
output_notebook()


tabs = []

for adm in affected_adm2:
    df = week[week['ADM2_EN']==adm] 

    tabs.append(
        TabPanel(
        child=get_line_plot(df, f"Weekly average users taking speedtest", "Source: Ookla Data for Good", earthquakes=True, subtitle = '% change compared to a 3 month prior baseline'),
                    title=adm.capitalize(),
                )
                )

tabs = Tabs(tabs=tabs, sizing_mode="scale_both")
show(tabs, warn_on_missing_glyphs=False)