In [1]:
import pandas as pd
import numpy as np

In [2]:
# %load ../helpers.py
import datetime
import json

def dated_filename (fn, ext='.csv'):
    """Build a filename of the form '<fn>-<today's date><ext>'.

    fn  -- base name (may include a directory prefix); used as-is.
    ext -- suffix appended after the date; defaults to '.csv'.

    The date comes from `datetime.date.today()` and is rendered with its
    default string form (ISO `YYYY-MM-DD`).
    """
    stamp = datetime.date.today()
    return '{name}-{stamp}{suffix}'.format(name=fn, stamp=stamp, suffix=ext)

def load_json (fn):
    """Read the JSON document stored at path `fn` and return the decoded object.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's default text encoding, so non-ASCII content loads the same
    way on every machine.

    Raises OSError if the file cannot be opened and json.JSONDecodeError
    (a ValueError subclass) if the content is not valid JSON.
    """
    with open(fn, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


# Load OONI observations

Count the number of incidents by country

In [3]:
# Load the scraped OONI observations (one row per observation; the country is in `probe_cc`).
# NOTE(review): pandas' default NA parsing reads the literal string 'NA' as NaN, which
# would make Namibia's alpha-2 code disappear from `probe_cc` — confirm whether
# `keep_default_na=False` is needed here.
ooni_observations = pd.read_csv('data/ooni-observations-2019-06-20.csv')

In [4]:
# Group observations by country code. `count()` yields, for each country, the
# number of non-null entries in every remaining column (one count per column).
counts = ooni_observations.groupby('probe_cc').count()
counts.head()

Unnamed: 0_level_0,Unnamed: 0,download_url,index,probe_asn,test_name,test_start_time
probe_cc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AD,1,1,1,1,1,1
AE,99,99,99,99,99,99
AF,7,7,7,7,7,7
AG,1,1,1,1,1,1
AL,28,28,28,28,28,28


NOTE: We may not see every country on every reading. That's okay. The countries we *do* see will still be more or less similar to each other. By increasing our "net" (scrape size), we can increase our statistical power.

# Incidents per online users

We'll find the number of interference incidents per estimated users online in a country.

First, let's find how many users are online in each country.

In [5]:
# Estimated number of people online per country; rows are identified by an
# alpha-3 'Country Code' and carry the estimate in 'people-online'.
users_by_country = pd.read_csv('../shared/analysis/internet-users-by-country/internet-users-country-2019-06-20.csv')
users_by_country.head()

Unnamed: 0.1,Unnamed: 0,Country Code,Country Name,people-online
0,0,ABW,Aruba,10228500.0
1,1,AFG,Afghanistan,406737300.0
2,2,AGO,Angola,427077900.0
3,3,ALB,Albania,206449400.0
4,4,AND,Andorra,7609640.0


## Convert from alpha2 to alpha3

OONI uses the alpha2 standard, and our population dataset uses the alpha3 standard. We'll convert between them to make comparisons.

In [6]:
# Lookup table with 'Alpha-2 code' and 'Alpha-3 code' columns, used to convert
# between the two country-code standards.
country_codes = pd.read_csv('../shared/data/country-codes/countries_codes_and_coordinates.csv')

In [9]:
def _lookup_country_code (code, source_column, target_column):
    """Return the `target_column` code of the first `country_codes` row whose
    `source_column` code equals `code` exactly.

    The stored values carry surrounding double quotes (and possibly
    whitespace), so both are stripped before comparing and before returning.
    Matching is an exact, case-sensitive comparison: a partial code such as
    'A' or a regex metacharacter such as '.' no longer matches arbitrary rows.

    Raises IndexError if no row matches (the same exception type the previous
    `.values[0]` lookup produced, now with an explanatory message).
    """
    cleaned_source = country_codes[source_column].astype(str).str.strip(' "')
    matches = country_codes.loc[cleaned_source == code, target_column]
    if matches.empty:
        raise IndexError('{!r} not found in country_codes[{!r}]'.format(code, source_column))
    return str(matches.iloc[0]).strip(' "')

def alpha2_from_alpha3 (alpha3):
    """Convert an alpha-3 country code (e.g. 'ABW') to its alpha-2 code (e.g. 'AW').

    Raises IndexError if `alpha3` is not present in `country_codes`.
    """
    return _lookup_country_code(alpha3, 'Alpha-3 code', 'Alpha-2 code')

def alpha3_from_alpha2 (alpha2):
    """Convert an alpha-2 country code (e.g. 'AW') to its alpha-3 code (e.g. 'ABW').

    Raises IndexError if `alpha2` is not present in `country_codes`.
    """
    return _lookup_country_code(alpha2, 'Alpha-2 code', 'Alpha-3 code')

# Sanity check: Aruba's alpha-3 code should convert to its alpha-2 code.
alpha2_from_alpha3('ABW')

'AW'

## Find incidents per person online

Finally, we'll go through our counts of interference incidents per country, and divide that count by the approximate number of people online in that country.

In [12]:
# Alpha-2 codes that are left out of the analysis. Per the original notes,
# 'ZZ' and 'EU' are readings from unknown countries, and for the remaining
# codes we have no population info.
EXCLUDED_ALPHA2 = {'ZZ', 'EU', 'GF', 'GP', 'MQ', 'RE', 'TW', 'SS'}

results = []
for alpha2, count in counts.iterrows():
    # Each row of `counts` holds one count per column; take the first column
    # by position. `.iloc` makes the positional access explicit, because plain
    # `count[0]` on a label-indexed Series is deprecated in recent pandas.
    num_incidents = count.iloc[0]
    # Skip missing country codes and the excluded codes listed above.
    if pd.isna(alpha2) or alpha2 in EXCLUDED_ALPHA2:
        continue
    alpha3 = alpha3_from_alpha2(alpha2)
    reading = users_by_country[users_by_country['Country Code']==alpha3]
    people_online = reading['people-online'].values[0]
    # Weight the incident count by the estimated number of people online.
    interference_incidents_weighted = num_incidents/people_online
    results.append({
        'country': reading['Country Name'].values[0],
        'alpha-2': alpha2,
        'interference-incidents-weighted': interference_incidents_weighted,
    })

results = pd.DataFrame(results)

In [13]:
# Persist the weighted results under a filename stamped with today's date.
# NOTE(review): `to_csv` writes the row index by default, which shows up as an
# 'Unnamed: 0' column when the file is read back — confirm whether downstream
# readers rely on that column before passing `index=False`.
results.to_csv(dated_filename('analysis/interference-incidents-weighted-by-num-internet-users'))

In [14]:
# Display countries in ascending order of incidents per person online
# (lowest weighted interference first); `results` itself is not modified.
results.sort_values(by='interference-incidents-weighted')

Unnamed: 0,alpha-2,country,interference-incidents-weighted
36,CN,China,3.692819e-09
17,BH,Bahrain,6.987819e-09
105,MM,Myanmar,8.550593e-09
167,VN,Vietnam,8.657330e-09
172,ZW,Zimbabwe,8.944052e-09
145,SL,Sierra Leone,9.996572e-09
136,RS,Serbia,1.012591e-08
42,CY,Cyprus,1.049971e-08
171,ZM,Zambia,1.050165e-08
82,JP,Japan,1.197770e-08
