In [315]:
import pandas as pd
import numpy as np

In [316]:
# %load ../helpers.py
import datetime
import json

def dated_filename(fn, ext='.csv'):
    """Build a filename of the form ``<fn>-<today><ext>``.

    Args:
        fn: Base name (may include a directory prefix) placed before the date.
        ext: Suffix appended after the date; defaults to ``'.csv'``.

    Returns:
        The combined string, where ``<today>`` is ``datetime.date.today()``
        rendered as text (``YYYY-MM-DD``).
    """
    stamp = datetime.date.today()
    return '%s-%s%s' % (fn, stamp, ext)

def load_json(fn):
    """Read the file at path ``fn`` and return its parsed JSON content.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Args:
        fn: Path of the JSON file to read.

    Returns:
        The decoded Python object (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not valid JSON (``json.JSONDecodeError``).
    """
    with open(fn, 'r', encoding='utf-8') as json_file:
        # Parse straight from the file handle; no intermediate string needed.
        return json.load(json_file)


In [317]:
# Load the dated export of OONI observations into a DataFrame.
ooni_observations = pd.read_csv(
    'data/ooni-observations-2019-06-20.csv'
)

In [352]:
# For every country code, count the non-null values in each column.
counts = (
    ooni_observations
    .groupby('probe_cc')
    .count()
)
# Peek at a single country's row ('GF').
counts.loc[counts.index == 'GF']

Unnamed: 0_level_0,Unnamed: 0,download_url,index,probe_asn,test_name,test_start_time
probe_cc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GF,1,1,1,1,1,1


NOTE: Not every country will appear in every reading, and that's okay: the countries we *do* see are still roughly comparable to each other. By widening our "net" (the scrape size), we can increase our statistical power.


In [353]:
# Load the per-country internet-user figures and preview the first rows.
users_by_country = pd.read_csv(
    '../shared/analysis/internet-users-by-country/internet-users-country-2019-06-20.csv'
)
users_by_country.head(5)

Unnamed: 0.1,Unnamed: 0,Country Code,Country Name,people-online
0,0,ABW,Aruba,10228500.0
1,1,AFG,Afghanistan,406737300.0
2,2,AGO,Angola,427077900.0
3,3,ALB,Albania,206449400.0
4,4,AND,Andorra,7609640.0


In [320]:
# Lookup table used to translate between alpha-2 and alpha-3 country codes.
country_codes = pd.read_csv(
    '../shared/data/country-codes/countries_codes_and_coordinates.csv'
)

In [354]:
def alpha2_from_alpha3 (alpha3):
    """Translate an alpha-3 country code into its alpha-2 code.

    Looks up the first row of the module-level ``country_codes`` table whose
    'Alpha-3 code' cell contains ``alpha3`` and returns that row's alpha-2 code.

    Args:
        alpha3: Alpha-3 country code to look up, e.g. ``'ABW'``.

    Returns:
        The alpha-2 code as a plain string (surrounding quotes stripped).

    Raises:
        IndexError: If no row of ``country_codes`` matches ``alpha3``.
    """
    # Match the code literally (not as a regular expression) and treat missing
    # cells as non-matches so the boolean mask never contains NaN.
    matches = country_codes['Alpha-3 code'].str.contains(alpha3, regex=False, na=False)
    code = country_codes.loc[matches, 'Alpha-2 code']
    # The stored cell carries the code between double quotes; keep only the
    # text between the first pair of quotes.
    return code.values[0].split('"')[1]

def alpha3_from_alpha2 (alpha2):
    """Translate an alpha-2 country code into its alpha-3 code.

    Looks up the first row of the module-level ``country_codes`` table whose
    'Alpha-2 code' cell contains ``alpha2`` and returns that row's alpha-3 code.

    Args:
        alpha2: Alpha-2 country code to look up, e.g. ``'GF'``.

    Returns:
        The alpha-3 code as a plain string (surrounding quotes stripped).

    Raises:
        IndexError: If no row of ``country_codes`` matches ``alpha2``.
    """
    # Match the code literally (not as a regular expression) and treat missing
    # cells as non-matches so the boolean mask never contains NaN.
    matches = country_codes['Alpha-2 code'].str.contains(alpha2, regex=False, na=False)
    code = country_codes.loc[matches, 'Alpha-3 code']
    # The stored cell carries the code between double quotes; keep only the
    # text between the first pair of quotes.
    return code.values[0].split('"')[1]

# Spot check of the lookup helpers. The reverse direction can be tried with:
# alpha2_from_alpha3('ABW')
alpha3_from_alpha2('GF')

'GUF'

In [380]:
# Select the internet-users row(s) for 'GUF' and show which cells are null.
# An empty result means the table has no entry for that code at all.
users_by_country.loc[users_by_country['Country Code'] == 'GUF'].isnull()

Unnamed: 0.1,Unnamed: 0,Country Code,Country Name,people-online


In [404]:
# Codes that do not identify a known country; readings from them are ignored.
unknown_country_codes = {'ZZ', 'EU'}
# Countries for which we have no population (people-online) info.
no_population_info = {'GF', 'GP', 'MQ', 'RE', 'TW'}

# Build one record per country: incident count divided by people online.
records = []
for alpha2, count in counts.iterrows():
    # `counts` comes from groupby(...).count(); take the first column's value
    # by position. `.iloc` is explicit, whereas `count[0]` relies on the
    # deprecated positional fallback of `[]` on a label-indexed Series.
    num_incidents = count.iloc[0]
    if pd.isna(alpha2) or alpha2 in unknown_country_codes or alpha2 in no_population_info:
        continue
    alpha3 = alpha3_from_alpha2(alpha2)
    reading = users_by_country[users_by_country['Country Code']==alpha3]
    people_online = reading['people-online'].values[0]
    interference_incidents_weighted = num_incidents/people_online
    records.append({
        'country': reading['Country Name'].values[0],
        'alpha-2': alpha2,
        'interference-incidents-weighted': interference_incidents_weighted,
    })

results = pd.DataFrame(records)

In [405]:
# Write the weighted results to a CSV whose name is stamped with today's date.
results.to_csv(
    dated_filename('analysis/interference-incidents-weighted-by-num-internet-users')
)

In [406]:
# Display the countries ordered from lowest to highest weighted incident rate.
results.sort_values('interference-incidents-weighted', ascending=True)

Unnamed: 0,alpha-2,country,interference-incidents-weighted
156,VN,Vietnam,1.266926e-09
32,CN,China,1.487754e-09
127,RS,Serbia,2.025182e-09
160,ZM,Zambia,2.100330e-09
96,MM,Myanmar,3.664540e-09
129,RW,Rwanda,3.762962e-09
131,SD,Sudan,3.995921e-09
161,ZW,Zimbabwe,4.472026e-09
75,JP,Japan,4.686926e-09
67,IE,Ireland,4.918468e-09
