In [374]:
import pandas as pd
import json
import pycountry # Will probably require pip install pycountry
import datetime
import dateutil.parser as dparser
import collections

In [None]:
# Getting a list of countries
def get_countries():
    """Return a list with the ``name`` of every entry in ``pycountry.countries``."""
    return [country.name for country in pycountry.countries]
# Build the list of country names and show it as the cell output.
countries = get_countries()
countries

In [163]:
# Open the JSON file and load its contents into the variable `data`.
# The encoding is passed explicitly: without it open() decodes with the
# platform's locale encoding, whereas JSON text is UTF-8, so non-ASCII
# characters could be mis-decoded on systems whose default is not UTF-8.
json_path = 'weekly_surveillance_stats.json'
with open(json_path, encoding='utf-8') as json_file:
    data = json.load(json_file)

PREPROCESSING

In [412]:
def convert_var_dict(lst_counts_variants):
    '''
    Collapse a list of per-variant dictionaries into one dictionary.

    Every item of lst_counts_variants is read through its 'value' and
    'count' keys, and the result maps value -> count:
    {
        VARIANT : COUNT
        B.1 : 2,
        B.1.7 : 5
    }
    If the same 'value' appears more than once, the last count is kept.
    '''
    return {item['value']: item['count'] for item in lst_counts_variants}

def create_entry(date, lst_counts_variants):
    '''
    Start a new entry for the country-sorted table.

    An entry is broken down by week, and each week is further broken
    down into variant strain and count (built by convert_var_dict):
    {
        WEEK_STAMP : VARIANT_DATA
        "01-2020" : variant_dict,
        "02-2020" : variant_dict,
        "03-2020" : variant_dict,
    }
    The returned dictionary is new and holds only the given date.
    '''
    return {date: convert_var_dict(lst_counts_variants)}

# Record the variant info of one more week in an existing entry
def add_entry(entry, date, lst_counts_variants):
    '''
    Store the variant dictionary for date in entry and return entry.

    entry is modified in place; a value already stored under date is
    replaced.
    '''
    entry[date] = convert_var_dict(lst_counts_variants)
    return entry


# Record the running variant totals of one more week in an existing entry
def add_entry_cummulative(entry, prev_date, curr_date, lst_cnts_vars, country):
    '''
    Store running variant totals under curr_date in entry and return entry.

    The totals are the counts stored under prev_date plus the counts in
    lst_cnts_vars (converted with convert_var_dict). When entry has no
    prev_date key the sum starts from an empty dictionary, so only the
    counts of lst_cnts_vars are stored.

    The two dictionaries are combined with collections.Counter addition,
    which keeps only the variants whose summed count is positive.

    entry is modified in place. country is accepted but not used here.
    '''
    try:
        previous_totals = entry[prev_date]
    except KeyError:
        # Nothing recorded for the previous week: start from zero
        previous_totals = {}

    week_counts = convert_var_dict(lst_cnts_vars)

    # Counter "+" adds the counts key by key
    summed = collections.Counter(previous_totals) + collections.Counter(week_counts)
    entry[curr_date] = dict(summed)
    return entry


In [413]:
def convert_to_country_idx(data):
    '''
    Converts data to be indexed by country

    Example:

    {
        COUNTRY: {DATE : {VARIANT : COUNT}}
        Iceland :
            {
                01-2020 :
                    {
                        b.1 : 3
                        b.1.7 : 4
                    }
                02-2020 :
                    {
                        b.1 : 3
                    }
            },
        Spain : ...,
        ...
    }
    '''
    # data['created'] holds the date the data set was created; its ISO week
    # number is the (exclusive) upper bound of the weeks read for 2021.
    created_on = dparser.parse(data['created']).date()
    last_week = created_on.isocalendar()[1]
    weeks_by_year = {
        2020: range(10, 53),        # weeks 10 to 52 (end of year)
        2021: range(0, last_week),  # week 0 up to one week before the creation week
    }

    by_country = {}
    # Walk the data year by year, then week by week
    for year, weeks in weeks_by_year.items():
        for week in weeks:
            # Key of this week inside data['stats'], e.g. "12-2020"
            date = "%02d-%d" % (week, year)
            # One row per country. To see what the frame looks like, run in a
            # separate cell:
            #   pd.DataFrame.from_dict(data['stats']['12-2020'], orient="index",
            #       columns=['submissions_per_lineage', 'submissions'])
            weekly = pd.DataFrame.from_dict(
                data['stats'][date],
                orient="index",
                columns=['submissions_per_lineage', 'submissions'],
            )
            # Move every country's row of this week into the new layout
            for country, row in weekly.iterrows():
                lineage_counts = row["submissions_per_lineage"]
                if country not in by_country:
                    by_country[country] = create_entry(date, lineage_counts)
                else:
                    by_country[country] = add_entry(by_country[country], date, lineage_counts)
    return by_country

In [414]:
def convert_to_country_idx_cummalitive(data):
    '''
    Index the data by country, storing running totals per week.

    The layout matches convert_to_country_idx
    ({COUNTRY: {DATE : {VARIANT : COUNT}}}), but for a country that is
    already present each week is stored through add_entry_cummulative,
    i.e. the previous week's stored counts plus this week's counts.
    A country's first week is stored as-is through create_entry.
    The previous-week key is reset to '' at the start of each year.
    '''
    # data['created'] holds the date the data set was created; its ISO week
    # number is the (exclusive) upper bound of the weeks read for 2021.
    created_on = dparser.parse(data['created']).date()
    last_week = created_on.isocalendar()[1]
    weeks_by_year = {
        2020: range(10, 53),        # weeks 10 to 52 (end of year)
        2021: range(0, last_week),  # week 0 up to one week before the creation week
    }

    by_country = {}
    # Walk the data year by year, then week by week
    for year, weeks in weeks_by_year.items():
        prev_date = ''
        for week in weeks:
            # Key of this week inside data['stats'], e.g. "12-2020"
            date = "%02d-%d" % (week, year)
            weekly = pd.DataFrame.from_dict(
                data['stats'][date],
                orient="index",
                columns=['submissions_per_lineage', 'submissions'],
            )
            # Move every country's row of this week into the new layout
            for country, row in weekly.iterrows():
                lineage_counts = row["submissions_per_lineage"]
                if country not in by_country:
                    by_country[country] = create_entry(date, lineage_counts)
                else:
                    by_country[country] = add_entry_cummulative(
                        by_country[country], prev_date, date, lineage_counts, country
                    )
            prev_date = date
    return by_country

In [415]:
# Per-week variant counts, indexed by country.
final = convert_to_country_idx(data)

In [416]:
# Running (cumulative) variant counts, indexed by country.
final2 = convert_to_country_idx_cummalitive(data)

In [419]:
# Write both country-indexed tables to disk as JSON, one file each.
for out_path, payload in (
    ('gisaid_variant_data.json', final),
    ('cummulative_ggisaid_variant_data.json', final2),
):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file)