In [None]:
pip install -r ./requirements.txt

In [2]:
import datetime
import json
import os
import pandas as pd
import shutil

In [3]:
# Age buckets keyed by the label written to the output. Each value is a
# [lower, upper] pair; count_data_age treats the upper bound as exclusive.
AGES = {
    "0-5 years": [0, 6],
    "6-14 years": [6, 15],
    "15-24 years": [15, 25],
    "25-65 years": [25, 66],
    "65+ years": [66, 999],
}

# Locations of the extracted and merged CSV files.
CSV_PATH = "../csv"
CSV_EXTRACTED_PATH = CSV_PATH + "/extracted"
CSV_MERGED_PATH = CSV_PATH + "/merged"

# Location of the generated JSON output.
DATA_PATH = "../data"
DATA_JSON_PATH = DATA_PATH + "/data.json"

# Month labels in calendar order; position + 1 is the month number.
MONTHS = "Jan Feb Mar Apr May June July Aug Sept Oct Nov Dec".split()

# Output label -> value of the SEX column.
SEXES = dict(female="F", male="M")

# For each CSV record type: logical field name -> column name in the CSV.
TYPES = {
    "DATA": dict(
        id="VAERS_ID",
        date="RECVDATE",
        state="STATE",
        age="AGE_YRS",
        sex="SEX",
        died="DIED",
        hospital="HOSPITAL",
        disable="DISABLE",
    ),
    "SYMPTOMS": dict(id="VAERS_ID"),
    "VAX": dict(id="VAERS_ID", type="VAX_TYPE", manufacturer="VAX_MANU"),
}

# Years from the current one back to 1990, newest first, as strings.
YEARS = [str(year) for year in range(datetime.date.today().year, 1989, -1)]

In [4]:
# Remove the output of a previous run. The directories do not exist on a fresh
# checkout, so only delete the ones that are actually there instead of letting
# shutil.rmtree raise FileNotFoundError.
if os.path.isdir(CSV_EXTRACTED_PATH):
    shutil.rmtree(CSV_EXTRACTED_PATH)
if os.path.isdir(CSV_MERGED_PATH):
    shutil.rmtree(CSV_MERGED_PATH)

In [None]:
# Run the project's unzip script through bash. Presumably it extracts the
# downloaded archives into CSV_EXTRACTED_PATH — TODO confirm against
# ../scripts/unzip.sh.
!bash ../scripts/unzip.sh

In [6]:
# Create the directory for the merged CSV files. makedirs with exist_ok=True
# keeps the cell re-runnable and also creates missing parent directories.
os.makedirs(CSV_MERGED_PATH, exist_ok=True)

In [7]:
# Merge the extracted CSV files into one CSV per record type, keeping only the
# columns listed for that type in TYPES.
#
# The loop variables are deliberately not called `type`/`file`: at notebook
# scope they stay defined after the cell and would shadow the builtins.
for record_type, column_map in TYPES.items():
    columns = list(column_map.values())
    frames = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the merged file reproducible.
    for file_name in sorted(os.listdir(CSV_EXTRACTED_PATH)):
        if not file_name.endswith(f"{record_type}.csv"):
            continue

        frame = pd.read_csv(
            f"{CSV_EXTRACTED_PATH}/{file_name}",
            encoding="latin1",
            engine="python",
        )[columns]

        # Empty extracts contribute no rows; skipping them keeps pd.concat
        # from having to reconcile dtypes with all-empty inputs.
        if not frame.empty:
            frames.append(frame)

    # Concatenate once instead of growing a frame inside the loop, which
    # copies all accumulated rows on every iteration.
    if frames:
        merged = pd.concat(frames, ignore_index=True)
    else:
        # No rows at all: still write a header so the file can be read back.
        merged = pd.DataFrame(columns=columns)

    merged.to_csv(
        f"{CSV_MERGED_PATH}/{record_type}.csv",
        encoding="latin1",
        index=False,
        sep=",",
    )

In [8]:
# Read the three merged CSV files back, one frame per record type. All three
# are read with identical parser options; only the file name differs.
df_data, df_symptoms, df_vax = (
    pd.read_csv(
        f"{CSV_MERGED_PATH}/{merged_name}.csv",
        encoding="latin1",
        engine="python",
        sep=",",
    )
    for merged_name in ("DATA", "SYMPTOMS", "VAX")
)

In [9]:
data_id_column = TYPES["DATA"]["id"]


def filter_data_vaccine(df_data, df_vax, vaccine):
    """Return the rows of ``df_data`` whose report has a given vaccine type.

    Args:
        df_data: frame of DATA records.
        df_vax: frame of VAX records.
        vaccine: value of the VAX type column to select.

    Returns:
        The subset of ``df_data`` whose id appears among the VAX rows of
        that vaccine type.
    """
    vax_rows = df_vax[df_vax[TYPES["VAX"]["type"]].eq(vaccine)]
    # Read the ids through the VAX frame's own id column rather than the DATA
    # one, so the lookup stays correct if the two column names ever differ.
    vaccine_report_ids = vax_rows[TYPES["VAX"]["id"]]
    # isin accepts the Series directly; no intermediate Python list is needed.
    return df_data[df_data[data_id_column].isin(vaccine_report_ids)]

In [10]:
def filter_data_year(df, year):
    """Return the rows of ``df`` whose date column contains ``year``.

    Args:
        df: frame of DATA records.
        year: year as a string, matched as a literal substring of the date.

    Returns:
        The matching subset of ``df``. Rows with a missing date never match.
    """
    dates = df[TYPES["DATA"]["date"]]
    # regex=False: the year is a literal, not a pattern.
    # na=False: a missing date would otherwise put NA into the mask, and
    # boolean indexing with such a mask raises ValueError.
    return df[dates.str.contains(year, regex=False, na=False)]

In [11]:
vax_id_column = TYPES["VAX"]["id"]


def filter_vax_data(df_data, df_vax):
    """Return the rows of ``df_vax`` that belong to a report in ``df_data``.

    Args:
        df_data: frame of DATA records (already filtered by the caller).
        df_vax: frame of VAX records.

    Returns:
        The subset of ``df_vax`` whose id appears in ``df_data``.
    """
    # Each frame is indexed through its own id column: the VAX column for
    # df_vax and the DATA column for df_data.
    report_ids = df_data[TYPES["DATA"]["id"]]
    return df_vax[df_vax[vax_id_column].isin(report_ids)]

In [12]:
data_age_column = TYPES["DATA"]["age"]


def count_data_age(df, age_range):
    """Count the rows of ``df`` whose age falls into ``age_range``.

    Args:
        df: frame of DATA records.
        age_range: ``[lower, upper]`` pair, lower bound inclusive and upper
            bound exclusive, or ``None`` to count rows with a missing age.

    Returns:
        The number of matching rows as an ``int``.
    """
    ages = df[data_age_column]
    if age_range is None:
        matches = ages.isna()
    else:
        lower, upper = age_range[0], age_range[1]
        matches = ages.between(lower, upper, inclusive="left")
    # int() keeps the result a plain Python int, as len() returned before.
    return int(matches.sum())

In [13]:
def count_data_died(df, died):
    """Count the rows of ``df`` whose died column equals ``died``.

    Args:
        df: frame of DATA records.
        died: value to count.

    Returns:
        The number of matching rows as an ``int``; 0 when the value does not
        occur. Missing values never match.

    Raises:
        KeyError: if the died column is absent from ``df``.
    """
    # Comparing and summing yields 0 for an absent value on its own, so no
    # blanket ``except`` is needed and unrelated errors are no longer hidden.
    return int(df[TYPES["DATA"]["died"]].eq(died).sum())

In [14]:
def count_data_disable(df, disable):
    """Count the rows of ``df`` whose disable column equals ``disable``.

    Args:
        df: frame of DATA records.
        disable: value to count.

    Returns:
        The number of matching rows as an ``int``; 0 when the value does not
        occur. Missing values never match.

    Raises:
        KeyError: if the disable column is absent from ``df``.
    """
    # Comparing and summing yields 0 for an absent value on its own, so no
    # blanket ``except`` is needed and unrelated errors are no longer hidden.
    return int(df[TYPES["DATA"]["disable"]].eq(disable).sum())

In [15]:
def count_data_hospital(df, hospital):
    """Count the rows of ``df`` whose hospital column equals ``hospital``.

    Args:
        df: frame of DATA records.
        hospital: value to count.

    Returns:
        The number of matching rows as an ``int``; 0 when the value does not
        occur. Missing values never match.

    Raises:
        KeyError: if the hospital column is absent from ``df``.
    """
    # Comparing and summing yields 0 for an absent value on its own, so no
    # blanket ``except`` is needed and unrelated errors are no longer hidden.
    return int(df[TYPES["DATA"]["hospital"]].eq(hospital).sum())

In [16]:
def count_data_sex(df, sex):
    """Count the rows of ``df`` whose sex column equals ``sex``.

    Args:
        df: frame of DATA records.
        sex: value to count.

    Returns:
        The number of matching rows as an ``int``; 0 when the value does not
        occur. Missing values never match.

    Raises:
        KeyError: if the sex column is absent from ``df``.
    """
    # Comparing and summing yields 0 for an absent value on its own, so no
    # blanket ``except`` is needed and unrelated errors are no longer hidden.
    return int(df[TYPES["DATA"]["sex"]].eq(sex).sum())

In [17]:
def count_data_total_monthly(df, month):
    """Count the rows of ``df`` whose date starts with the given month.

    Args:
        df: frame of DATA records.
        month: month number; values up to 9 are left-padded with one zero
            before being compared with the first two characters of the date.

    Returns:
        The number of matching rows.
    """
    if month > 9:
        month_prefix = str(month)
    else:
        month_prefix = f"0{month}"

    date_prefixes = df[TYPES["DATA"]["date"]].str[:2]
    in_month = date_prefixes.eq(month_prefix)
    return len(df[in_month])

In [18]:
def count_data_total_yearly(df, year):
    """Count the rows of ``df`` whose date column contains ``year``.

    Args:
        df: frame of DATA records.
        year: year as a string, matched as a literal substring of the date.

    Returns:
        The number of matching rows as an ``int``. Rows with a missing date
        are not counted.
    """
    dates = df[TYPES["DATA"]["date"]]
    # regex=False: the year is a literal, not a pattern.
    # na=False: a missing date would otherwise put NA into the mask, and
    # boolean indexing with such a mask raises ValueError.
    return int(dates.str.contains(year, regex=False, na=False).sum())

In [19]:
def count_total(df):
    """Return the number of rows in ``df``."""
    row_count = len(df)
    return row_count

In [20]:
def group_vax_manufacturer(df):
    """Count the rows of ``df`` per manufacturer.

    Args:
        df: frame of VAX records.

    Returns:
        A single-use iterator of ``(manufacturer, row_count)`` pairs in the
        order of the grouped index.
    """
    manufacturer_column = TYPES["VAX"]["manufacturer"]
    counts = df.groupby([manufacturer_column]).size()
    return counts.items()

In [21]:
def group_vax_vaccine(df):
    """Count the rows of ``df`` per vaccine type.

    Args:
        df: frame of VAX records.

    Returns:
        A single-use iterator of ``(vaccine_type, row_count)`` pairs in the
        order of the grouped index.
    """
    type_column = TYPES["VAX"]["type"]
    counts = df.groupby([type_column]).size()
    return counts.items()

In [22]:
def analyze_data_ages(df):
    """Summarise the reports in ``df`` per age bucket of ``AGES``.

    Args:
        df: frame of DATA records.

    Returns:
        A dict with ``"unknown"`` (rows without an age) and ``"data"``, a list
        with one entry per bucket holding its label, its percentage among the
        rows with a known age, and its row count.
    """
    report_count = count_total(df)
    missing_age_count = count_data_age(df, None)
    known_age_count = report_count - missing_age_count
    has_known_ages = report_count > 0 and known_age_count > 0

    rows = []
    for label, bounds in AGES.items():
        bucket_count = count_data_age(df, bounds)
        if has_known_ages:
            share = (bucket_count / known_age_count) * 100
        else:
            share = 0
        rows.append({"label": label, "percentage": share, "total": bucket_count})

    return {"unknown": missing_age_count, "data": rows}

In [23]:
def analyze_data_died(df):
    """Summarise how many reports in ``df`` are flagged as died.

    Args:
        df: frame of DATA records.

    Returns:
        A dict with ``"chart"`` (one entry with both counts) and ``"data"``
        (one entry each for "Died" and "Other" with percentage and total).
    """
    report_count = count_total(df)
    deaths = count_data_died(df, "Y")
    remainder = report_count - deaths

    def share(part):
        # Percentage of all reports; 0 when there are no reports at all.
        if report_count > 0:
            return (part / report_count) * 100
        return 0

    return {
        "chart": [{"label": "Total", "died": deaths, "other": remainder}],
        "data": [
            {"label": "Died", "percentage": share(deaths), "total": deaths},
            {"label": "Other", "percentage": share(remainder), "total": remainder},
        ],
    }

In [24]:
def analyze_data_disabled(df):
    """Summarise how many reports in ``df`` are flagged as disabled.

    Args:
        df: frame of DATA records.

    Returns:
        A dict with ``"chart"`` (one entry with both counts) and the
        per-label rows for "Disabled" and "Other" (percentage and total).
        The rows are available under ``"data"``, the key every other
        ``analyze_data_*`` function uses, and under the legacy ``"Data"`` key.
    """
    total = count_total(df)

    disabled_total = count_data_disable(df, "Y")
    other_total = total - disabled_total

    rows = [
        {
            "label": "Disabled",
            "percentage": (disabled_total / total) * 100 if total > 0 else 0,
            "total": disabled_total,
        },
        {
            "label": "Other",
            "percentage": (other_total / total) * 100 if total > 0 else 0,
            "total": other_total,
        },
    ]

    return {
        "chart": [
            {"label": "Total", "disabled": disabled_total, "other": other_total}
        ],
        # Consistent with the sibling analyses, which all use "data".
        "data": rows,
        # NOTE(review): kept because existing consumers of the JSON may read
        # the capitalised key; drop it once they are confirmed to use "data".
        "Data": rows,
    }

In [25]:
def analyze_data_hospital(df):
    """Summarise how many reports in ``df`` are flagged as hospital.

    Args:
        df: frame of DATA records.

    Returns:
        A dict with ``"chart"`` (one entry with both counts) and ``"data"``
        (one entry each for "Hospital" and "Other" with percentage and total).
    """
    report_count = count_total(df)
    hospitalised = count_data_hospital(df, "Y")
    remainder = report_count - hospitalised

    def share(part):
        # Percentage of all reports; 0 when there are no reports at all.
        if report_count > 0:
            return (part / report_count) * 100
        return 0

    return {
        "chart": [{"label": "Total", "hospital": hospitalised, "other": remainder}],
        "data": [
            {
                "label": "Hospital",
                "percentage": share(hospitalised),
                "total": hospitalised,
            },
            {"label": "Other", "percentage": share(remainder), "total": remainder},
        ],
    }

In [26]:
def analyze_data_sexes(df):
    """Summarise the reports in ``df`` per sex listed in ``SEXES``.

    Args:
        df: frame of DATA records.

    Returns:
        A dict with ``"unknown"`` (rows whose sex is "U"), ``"chart"`` (one
        entry with the count per sex label) and ``"data"`` (one entry per sex
        with its percentage among the non-"U" rows and its total).
    """
    report_count = count_total(df)
    unknown_count = count_data_sex(df, "U")
    known_count = report_count - unknown_count
    can_compute_share = report_count > 0 and known_count > 0

    chart_entry = {"label": "Total"}
    rows = []
    for label, code in SEXES.items():
        sex_count = count_data_sex(df, code)
        chart_entry[label] = sex_count

        if can_compute_share:
            share = (sex_count / known_count) * 100
        else:
            share = 0
        rows.append({"label": label, "percentage": share, "total": sex_count})

    return {"unknown": unknown_count, "chart": [chart_entry], "data": rows}

In [27]:
def analyze_data_totals(df, type):
    """Break the reports in ``df`` down per year or per month.

    Args:
        df: frame of DATA records.
        type: ``"years"`` for one entry per item of ``YEARS``; any other
            value gives one entry per item of ``MONTHS``.

    Returns:
        A dict with ``"data"``: a list of entries (label, percentage of all
        rows, total). Years are listed in the reverse of ``YEARS`` order,
        months in ``MONTHS`` order.
    """
    report_count = count_total(df)

    if type == "years":
        # YEARS is stored newest-first; the output lists the oldest first.
        buckets = [
            (year, count_data_total_yearly(df, year)) for year in reversed(YEARS)
        ]
    else:
        buckets = [
            (month, count_data_total_monthly(df, number))
            for number, month in enumerate(MONTHS, start=1)
        ]

    rows = []
    for label, bucket_count in buckets:
        if report_count > 0:
            share = (bucket_count / report_count) * 100
        else:
            share = 0
        rows.append({"label": label, "percentage": share, "total": bucket_count})

    return {"data": rows}

In [28]:
def analyze_total(df):
    """Return the number of reports in ``df`` (delegates to ``count_total``)."""
    report_count = count_total(df)
    return report_count

In [29]:
def analyze_vax_manufacturers(df_data, df_vax):
    """Summarise the VAX rows per manufacturer.

    Args:
        df_data: frame of DATA records; its row count is the denominator of
            the percentages.
        df_vax: frame of VAX records to group.

    Returns:
        A dict with ``"data"``: one entry per manufacturer, each with a single
        "Reports" child carrying the percentage and the row count.
    """
    report_count = count_total(df_data)

    def as_entry(manufacturer, manufacturer_count):
        # Percentage relative to the number of reports; 0 without reports.
        if report_count > 0:
            share = (manufacturer_count / report_count) * 100
        else:
            share = 0
        return {
            "name": manufacturer,
            "children": [
                {
                    "label": manufacturer,
                    "name": "Reports",
                    "percentage": share,
                    "total": manufacturer_count,
                }
            ],
        }

    return {
        "data": [
            as_entry(manufacturer, manufacturer_count)
            for manufacturer, manufacturer_count in group_vax_manufacturer(df_vax)
        ]
    }

In [30]:
def analyze_vax_vaccines(df_data, df_vax):
    """Summarise the VAX rows per vaccine type.

    Args:
        df_data: frame of DATA records; its row count is the denominator of
            the percentages.
        df_vax: frame of VAX records to group.

    Returns:
        A dict with ``"data"``: one entry per vaccine type, each with a single
        "Reports" child carrying the percentage and the row count.
    """
    report_count = count_total(df_data)

    def as_entry(vaccine_type, vaccine_count):
        # Percentage relative to the number of reports; 0 without reports.
        if report_count > 0:
            share = (vaccine_count / report_count) * 100
        else:
            share = 0
        return {
            "name": vaccine_type,
            "children": [
                {
                    "label": vaccine_type,
                    "name": "Reports",
                    "percentage": share,
                    "total": vaccine_count,
                }
            ],
        }

    return {
        "data": [
            as_entry(vaccine_type, vaccine_count)
            for vaccine_type, vaccine_count in group_vax_vaccine(df_vax)
        ]
    }

In [31]:
vax_type_column = TYPES["VAX"]["type"]


def list_vax_vaccines(df):
    """Return the distinct vaccine types in ``df``, sorted.

    Args:
        df: frame of VAX records.

    Returns:
        A sorted list of the distinct non-missing values of the type column.
    """
    # Drop missing values first: a NaN next to strings makes sorted() raise
    # TypeError, and a missing type is not a vaccine to report on.
    return sorted(df[vax_type_column].dropna().drop_duplicates().to_list())

In [40]:
def analysis(vaccine, year):
    """Run every analysis for one vaccine/year selection.

    Reads the module-level ``df_data`` and ``df_vax`` frames.

    Args:
        vaccine: vaccine type to restrict to, or a falsy value for all.
        year: year (string) to restrict to, or a falsy value for all years.

    Returns:
        A dict with ``"total"``, ``"data"`` (ages, died, disabled, hospital,
        sexes, totals), ``"vax"`` (manufacturers, vaccines) and
        ``"vaccines"`` (the vaccine types present in the selection).
    """
    reports = filter_data_year(df_data, year) if year else df_data
    if vaccine:
        reports = filter_data_vaccine(reports, df_vax, vaccine)

    vax_rows = filter_vax_data(reports, df_vax)

    # A single year is broken down by month, the full range by year.
    totals_axis = "months" if year else "years"

    return {
        "total": analyze_total(reports),
        "data": {
            "ages": analyze_data_ages(reports),
            "died": analyze_data_died(reports),
            "disabled": analyze_data_disabled(reports),
            "hospital": analyze_data_hospital(reports),
            "sexes": analyze_data_sexes(reports),
            "totals": analyze_data_totals(reports, totals_axis),
        },
        "vax": {
            "manufacturers": analyze_vax_manufacturers(reports, vax_rows),
            "vaccines": analyze_vax_vaccines(reports, vax_rows),
        },
        "vaccines": list_vax_vaccines(vax_rows),
    }

In [41]:
# Load the results written by a previous run. The file is opened in a `with`
# block so the handle is closed, and a missing file (first run) yields an
# empty dict instead of an exception.
if os.path.exists(DATA_JSON_PATH):
    with open(DATA_JSON_PATH) as previous_results_file:
        data = json.load(previous_results_file)
else:
    data = {}

In [None]:
# Start from the previously saved results: years that are skipped below keep
# their cached entries instead of being dropped when data.json is rewritten.
results = dict(data)

results["years"] = YEARS

results["all"] = analysis(None, None)

# Loop invariants: the full vaccine list and the two most recent years, which
# are always recomputed.
all_vaccines = list_vax_vaccines(df_vax)
latest_years = YEARS[:2]

for year in YEARS:
    if year in data and year not in latest_years:
        # Already present in the saved results; reuse the cached entries.
        continue

    results[year] = analysis(None, year)
    print(year)

    for vaccine in all_vaccines:
        results[f"{vaccine}{year}"] = analysis(vaccine, year)
        print(f"{vaccine}{year}")

In [43]:
# Serialise the collected results straight into data.json.
with open(DATA_JSON_PATH, "w") as output_file:
    json.dump(results, output_file)

In [44]:
# Write the full-dataset analysis to its own file. It was already computed
# into results["all"], so reuse it rather than running the analysis again.
with open(f"{DATA_PATH}/all.json", "w") as all_output_file:
    json.dump({"all": results["all"]}, all_output_file)