# COVID Forecasting Review Data Analysis

In [1]:
import datetime as dt
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Matplotlib 3.6 renamed its bundled "seaborn" style to "seaborn-v0_8" and the
# old alias was removed in later releases. Prefer the new name when the
# installed Matplotlib offers it, and fall back to the old one otherwise, so
# the notebook runs on a fresh kernel with either version.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
# Default figure size (width, height) in inches for every plot in the notebook.
plt.rcParams["figure.figsize"] = (20, 8)

## Load and clean data

In [3]:
# Per-paper coding data. keep_default_na=False stops pandas from turning
# strings such as "NA" or empty cells into NaN, so they stay available as text
# (compute_results later filters the literal "NA" explicitly).
data = pd.read_csv("../data/finaldata.csv", keep_default_na=False)

In [4]:
# Paper-level metadata. Default NaN parsing is kept here, and date_published
# is parsed as datetimes because compute_results takes its min()/max() dates.
metadata = pd.read_csv("../data/metadata.csv", keep_default_na=True, parse_dates=["date_published"])

In [5]:
# Journal-name aliases used when building the journal -> subjects lookup.
# Key: an extra journal name to register in the lookup.
# Value: the title under which that journal already exists in the lookup
# (i.e. a `title` from journal_metadata.csv); its subjects are copied to the key.
manual_journal_fixes = {
    "Chaos, Solitons & Fractals": "Chaos Solitons & Fractals",
    "PLOS Biology": "PLoS Biology",
    "PLOS ONE": "PLoS ONE",
    "PLOS Computational Biology": "PLoS Computational Biology",
    "Proceedings of the Royal Society A: Mathematical, Physical and Engineering Sciences": "Proceedings of the Royal Society A Mathematical Physical and Engineering Sciences",
    "Journal of Big Data": "Journal Of Big Data",
    "Physica A: Statistical Mechanics and its Applications": "Physica A Statistical Mechanics and its Applications",
    "Science of The Total Environment": "The Science of The Total Environment",
}

In [6]:
journal_metadata = pd.read_csv("../data/journal_metadata.csv")

# Map every journal title to its subjects string.
journal_subjects_lookup = dict(zip(journal_metadata["title"], journal_metadata["subjects"]))

# Register the manually maintained aliases under the subjects of their
# canonical title (raises KeyError if a canonical title is not in the lookup).
for alias, canonical_title in manual_journal_fixes.items():
    journal_subjects_lookup[alias] = journal_subjects_lookup[canonical_title]

# Journals without an entry in the lookup get an empty subjects string.
metadata["journal_subjects"] = metadata["journal"].map(
    lambda journal_name: journal_subjects_lookup.get(journal_name, "")
)

# Everything that is not a journal article is labelled "preprint" instead.
not_journal_article = metadata["paper_type"] != "journal-article"
metadata.loc[not_journal_article, "journal_subjects"] = "preprint"

## Helper Functions

In [7]:
def category_occurances(col, remove=None, corrections=None, sep=", "):
    """Count how often each category appears in a column of delimited strings.

    Every entry of ``col`` is converted with ``str``, split on ``sep`` and each
    piece is stripped of surrounding whitespace. Pieces are then passed through
    ``corrections`` and counted.

    Parameters
    ----------
    col : iterable
        Column entries, each holding one or more categories joined by ``sep``.
    remove : collection, optional
        Category values (after corrections) to leave out of the result.
    corrections : dict, optional
        Maps a raw category to its replacement. The replacement may be a single
        value or a list of values; a list is expanded into several categories.
    sep : str
        Separator between categories inside one entry.

    Returns
    -------
    pandas.DataFrame
        Columns ``value``, ``occurances`` and ``pct``, where ``pct`` is
        ``occurances`` divided by the number of entries in ``col`` (not by the
        number of categories, so the column can sum to more than 1).
        Rows are sorted by ``occurances`` descending; ties keep the order in
        which the categories first appear, so the output is reproducible.
        If nothing is counted, an empty frame with the same columns is returned.
    """
    # None sentinels instead of mutable default arguments.
    remove = () if remove is None else remove
    corrections = {} if corrections is None else corrections

    entries = list(col)
    nrows = len(entries)

    counts = Counter()
    for entry in entries:
        for token in str(entry).split(sep):
            token = token.strip()
            replacement = corrections.get(token, token)
            # A correction may fan one raw category out into several.
            counts.update(replacement if isinstance(replacement, list) else [replacement])

    # most_common() sorts by count (descending) with a stable sort, so ties
    # stay in first-appearance order.
    ranked = [(value, n) for value, n in counts.most_common() if value not in remove]

    # Explicit columns keep the frame well-formed even when `ranked` is empty.
    col_occ = pd.DataFrame(ranked, columns=["value", "occurances"])
    col_occ["occurances"] = col_occ["occurances"].astype("int64")
    col_occ["pct"] = col_occ["occurances"] / nrows
    return col_occ

In [8]:
def category_occurances_flat(col):
    """Count how often each distinct entry appears in ``col``.

    Unlike ``category_occurances``, entries are counted as they are: they are
    neither converted to strings nor split on a separator.

    Parameters
    ----------
    col : iterable
        Hashable column entries.

    Returns
    -------
    pandas.DataFrame
        Columns ``value``, ``occurances`` and ``pct`` (``occurances`` divided
        by the number of entries). Rows are sorted by ``occurances``
        descending; ties keep first-appearance order, so the output is
        reproducible. An empty ``col`` yields an empty frame with the same
        columns.
    """
    entries = list(col)

    # Single pass count; most_common() sorts by count with a stable sort.
    ranked = Counter(entries).most_common()

    # Explicit columns keep the frame well-formed even when `ranked` is empty.
    dist_df = pd.DataFrame(ranked, columns=["value", "occurances"])
    dist_df["occurances"] = dist_df["occurances"].astype("int64")
    dist_df["pct"] = dist_df["occurances"] / len(entries)
    return dist_df

In [9]:
def numcategory_occurances(col, ignore=None, sep=", "):
    """Tabulate how many categories each entry of ``col`` contains.

    Every entry is converted with ``str`` and split on ``sep`` (the same
    tokenisation ``category_occurances`` uses); pieces whose stripped text is
    listed in ``ignore`` are not counted.

    Note that an empty string still splits into one (empty) piece, so it
    counts as one category unless ``""`` is listed in ``ignore``.

    Parameters
    ----------
    col : iterable
        Column entries, each holding one or more categories joined by ``sep``.
    ignore : collection, optional
        Category values that do not count towards an entry's total.
    sep : str
        Separator between categories inside one entry.

    Returns
    -------
    pandas.DataFrame
        One row for every count from 0 up to the largest count observed, with
        columns ``value`` (the number of categories), ``occurances`` (how many
        entries have that many) and ``pct`` (``occurances`` divided by the
        number of entries). An empty ``col`` yields an empty frame with the
        same columns.
    """
    # None sentinel instead of a mutable default argument.
    ignore = () if ignore is None else ignore

    entries = list(col)
    nrows = len(entries)

    # Number of non-ignored categories per entry.
    lens = [
        sum(1 for token in str(entry).split(sep) if token.strip() not in ignore)
        for entry in entries
    ]
    length_counts = Counter(lens)

    # default=-1 makes the range below empty when there are no entries.
    maxlen = max(lens, default=-1)
    rows = [(i, length_counts[i]) for i in range(maxlen + 1)]

    # Explicit columns and dtypes keep the frame well-formed when `rows` is empty.
    dist_df = pd.DataFrame(rows, columns=["value", "occurances"]).astype("int64")
    dist_df["pct"] = dist_df["occurances"] / nrows
    return dist_df

## Analysis

In [10]:
def compute_results(d, m, save=None):
    """Build every summary table of the review and optionally export them.

    Parameters
    ----------
    d : pandas.DataFrame
        Coding data. The columns read here are ``data_cat``, ``method_cat``,
        ``objective_cat``, ``region_level``, ``target``, ``performance_eval``,
        ``performance_eval_sub``, ``metrics``, ``uncertainty``,
        ``uncertainty_sub`` and ``limitations``.
    m : pandas.DataFrame
        Paper metadata. The columns read here are ``paper_type``,
        ``date_published``, ``journal`` and ``journal_subjects``.
    save : optional
        When not ``None``, it is passed to ``pd.ExcelWriter`` and every table
        is written to its own sheet, named after its key in the result.

    Returns
    -------
    dict
        Maps a table name to a ``pandas.DataFrame``.
    """
    # Headline numbers about the set of papers.
    publication_dates = m["date_published"]
    headline = [
        ("total papers", len(m)),
        ("journal articles", sum(m["paper_type"] == "journal-article")),
        ("preprints", sum(m["paper_type"] != "journal-article")),
        ("start date", publication_dates.min().date()),
        ("end date", publication_dates.max().date()),
    ]
    summary = pd.DataFrame([{"metric": label, "value": number} for label, number in headline])

    # Subsets used for the evaluation-specific tables.
    d_evaluable = d[d["performance_eval"] != "not evaluable"]
    d_evaluated = d[d["performance_eval"] == "metric-based"]

    # Every sub-national level at or below a county is folded into one bucket.
    small_region_fixes = dict.fromkeys(["county", "city", "town", "zipcode"], "county or smaller")

    results = {
        "summary": summary,
        "data_cats": category_occurances(d["data_cat"], corrections={"deaths?": "deaths"}),
        "data_cat_counts": numcategory_occurances(d["data_cat"], ignore=[]),
        "method_cats": category_occurances(d["method_cat"]),
        "goal_cats": category_occurances(d["objective_cat"]),
        "region_level": category_occurances(d["region_level"], corrections=small_region_fixes),
        "region_level_full": category_occurances(d["region_level"]),
        "target_cats": category_occurances(d["target"]),
        "eval_cats": category_occurances(d["performance_eval"]),
        "eval_cats_evaluable": category_occurances(d_evaluable["performance_eval_sub"]),
        "eval_cats_evaluated": category_occurances(d_evaluated["performance_eval_sub"]),
        "eval_subcats": category_occurances(d["performance_eval_sub"]),
        "eval_metrics": category_occurances(d_evaluated["metrics"]),
        "eval_metric_counts": numcategory_occurances(d_evaluated["metrics"], ignore=["NA"]),
        "uncertainty_cats": category_occurances(d["uncertainty"], corrections={"CIs/Pis": "CIs/PIs"}),
        "uncertainty_subcats": category_occurances(d["uncertainty_sub"]),
        "limitation_cats": category_occurances(d["limitations"]),
        "limitation_cat_counts": numcategory_occurances(d["limitations"], ignore=["none"]),
        "journals": category_occurances_flat(m["journal"]),
        "journal_subjects": category_occurances(m["journal_subjects"], sep="; "),
    }

    if save is None:
        return results

    # One sheet per table, in the order the tables were defined above.
    with pd.ExcelWriter(save) as workbook:
        for table_name, table in results.items():
            table.to_excel(workbook, sheet_name=table_name, index=False)

    return results

In [11]:
# Compute all tables for the full dataset and write them to one workbook;
# the trailing ";" suppresses the notebook's display of the returned dict.
compute_results(data, metadata, save="../results/results-main.xlsx");