# Příprava dat VAERS.
Načtení základních dat VAERS, jejich prvotní zpracování a uložení do mezisouborů k dalšímu zpracování.

In [8]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import os
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [9]:
# Basic locations: the input CSV files are read from the 'data' folder
# located under the current working directory.
current_path = os.getcwd()
path = os.path.join(current_path, 'data')
files = os.listdir(path)

# Split the directory listing by file-name suffix: vaccine files end with
# 'VAX.csv', report files end with 'DATA.csv'.
vaxfiles = [name for name in files if name.endswith('VAX.csv')]
datafiles = [name for name in files if name.endswith('DATA.csv')]

In [10]:
# Read every VAX file and combine them into one large DataFrame.
# Each file's rows get a 'fileyear' column holding the integer parsed from
# the first four characters of the file name.
vax_frames = []

for f in vaxfiles:
    fname = os.path.join(current_path, 'data', f)
    data = pd.read_csv(fname, encoding='cp850')
    data['fileyear'] = int(f[:4])
    vax_frames.append(data)

# Concatenate once at the end instead of growing the frame inside the loop:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and each
# call copied all rows accumulated so far.
# The per-file row labels are kept (no ignore_index), exactly as append did.
# pd.concat raises on an empty list, so fall back to an empty frame when no
# VAX file was found.
vaxes = pd.concat(vax_frames) if vax_frames else pd.DataFrame()

In [11]:
# Read every DATA file and combine them into one large DataFrame.
# Each file's rows get a 'fileyear' column holding the integer parsed from
# the first four characters of the file name.
data_frames = []

for f in datafiles:
    fname = os.path.join(current_path, 'data', f)
    # low_memory=False makes pandas read the file in one pass instead of in
    # chunks, which avoids mixed-type columns.
    data = pd.read_csv(fname, encoding='cp850', low_memory=False)
    data['fileyear'] = int(f[:4])
    # Parse both date columns into datetimes.
    data[['RECVDATE', 'VAX_DATE']] = data[['RECVDATE', 'VAX_DATE']].apply(pd.to_datetime)
    # Convert the whole column at once; Series.apply(pd.to_numeric) would call
    # the converter separately for every single value.
    data['AGE_YRS'] = pd.to_numeric(data['AGE_YRS'])
    data_frames.append(data)

# Concatenate once at the end instead of growing the frame inside the loop:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and each
# call copied all rows accumulated so far.
# The per-file row labels are kept (no ignore_index), exactly as append did.
# pd.concat raises on an empty list, so fall back to an empty frame when no
# DATA file was found.
datas = pd.concat(data_frames) if data_frames else pd.DataFrame()

In [12]:
# Join both frames into one so that every vaccine row carries the data of
# the report (VAERS_ID) it belongs to. The left join keeps every row of vaxes.
keep_columns = ['VAERS_ID', 'RECVDATE', 'AGE_YRS', 'SEX', 'DIED', 'VAX_DATE', 'NUMDAYS',
                'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES', 'VAX_NAME', 'fileyear_y']
bigdata = vaxes.merge(datas, on='VAERS_ID', how='left')[keep_columns]
# 'fileyear' is present in both inputs, so the merge suffixes the copy coming
# from datas with '_y'; that copy is kept and exposed as FILEYEAR.
bigdata = bigdata.rename(columns={'fileyear_y': 'FILEYEAR'})
# Year component of the vaccination date.
bigdata['VAXYEAR'] = pd.DatetimeIndex(bigdata['VAX_DATE']).year

In [13]:
# Load the grouping table that assigns vaccines to diseases and reshape it
# into one row per (VAX_TYPE, DISEASE) pair.
vaxd = pd.read_excel('vaxgroups_with_disease.xlsx')

# A DISEASE cell may list several diseases separated by ', '. Split each cell
# into a list and let explode() emit one row per list element, repeating the
# VAX_TYPE of the source row. This replaces the former
# apply(pd.Series) / stack / reset_index / merge round trip, which built one
# Series per input row and several throwaway intermediate frames.
# NOTE: a missing DISEASE cell now yields a row with a missing DISEASE value
# instead of raising AttributeError on .split.
disease_by_vax = (
    vaxd[['VAX_TYPE', 'DISEASE']]
    .assign(DISEASE=vaxd['DISEASE'].str.split(', '))
    .explode('DISEASE')
    # explode() repeats the source row label; restore a plain 0..n-1 index,
    # which is what the previous merge-based result had.
    .reset_index(drop=True)
)

In [14]:
# Export the prepared frames to intermediate feather files for later processing.
for target, frame in (('bigdata.feather', bigdata),
                      ('disease_by_vax.feather', disease_by_vax)):
    frame.to_feather(target)

# reset_index() moves the current row labels of vaxes into an ordinary column
# and gives the frame a fresh 0..n-1 index before it is written.
# NOTE(review): presumably needed because the labels repeat across the
# concatenated input files - confirm.
vaxes.reset_index().to_feather('vaxes.feather')