In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import pandas as pd
import tensorflow as tf


# Import and clean df

In [2]:
# Import cleaned VAERS dataset
vaers_df = pd.read_csv('VAERSDATA.csv', low_memory=False)
vaers_df.head()

Unnamed: 0.1,Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,...,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,0,916600,01/01/2021,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,...,,,,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,1,916601,01/01/2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,...,,,,COVID19,MODERNA,025L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
2,2,916602,01/01/2021,WA,23.0,23.0,,F,,"About 15 minutes after receiving the vaccine, ...",...,23.1,,,COVID19,PFIZER\BIONTECH,EL1284,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,3,916603,01/01/2021,WA,58.0,58.0,,F,,"extreme fatigue, dizziness,. could not lift my...",...,,,,COVID19,MODERNA,unknown,UNK,,,COVID19 (COVID19 (MODERNA))
4,4,916604,01/01/2021,TX,47.0,47.0,,F,,"Injection site swelling, redness, warm to the ...",...,23.1,,,COVID19,MODERNA,,1,IM,LA,COVID19 (COVID19 (MODERNA))


In [3]:
# Check vaers_df column types
vaers_df.dtypes

Unnamed: 0           int64
VAERS_ID             int64
RECVDATE            object
STATE               object
AGE_YRS            float64
CAGE_YR            float64
CAGE_MO            float64
SEX                 object
RPT_DATE            object
SYMPTOM_TEXT        object
DIED                object
DATEDIED            object
L_THREAT            object
ER_VISIT            object
HOSPITAL            object
HOSPDAYS           float64
X_STAY              object
DISABLE             object
RECOVD              object
VAX_DATE            object
ONSET_DATE          object
NUMDAYS            float64
LAB_DATA            object
V_ADMINBY           object
V_FUNDBY            object
OTHER_MEDS          object
CUR_ILL             object
HISTORY             object
PRIOR_VAX           object
SPLTTYPE            object
FORM_VERS            int64
TODAYS_DATE         object
BIRTH_DEFECT        object
OFC_VISIT           object
ER_ED_VISIT         object
ALLERGIES           object
SYMPTOM1            object
S

In [4]:
# change dates(objects)columns to datetime
#vaers_df['RECVDATE'] = pd.to_datetime(vaers_df['RECVDATE'])
#vaers_df['RPT_DATE'] = pd.to_datetime(vaers_df['RPT_DATE'])
#vaers_df['DATEDIED'] = pd.to_datetime(vaers_df['DATEDIED'])
#vaers_df['VAX_DATE'] = pd.to_datetime(vaers_df['VAX_DATE'])
#vaers_df['ONSET_DATE'] = pd.to_datetime(vaers_df['ONSET_DATE'])
#vaers_df['TODAYS_DATE'] = pd.to_datetime(vaers_df['TODAYS_DATE'])

#vaers_df.dtypes

In [5]:
# Categorical variable list
vaers_cat = vaers_df.dtypes[vaers_df.dtypes == 'object'].index.tolist()
vaers_cat

['RECVDATE',
 'STATE',
 'SEX',
 'RPT_DATE',
 'SYMPTOM_TEXT',
 'DIED',
 'DATEDIED',
 'L_THREAT',
 'ER_VISIT',
 'HOSPITAL',
 'X_STAY',
 'DISABLE',
 'RECOVD',
 'VAX_DATE',
 'ONSET_DATE',
 'LAB_DATA',
 'V_ADMINBY',
 'V_FUNDBY',
 'OTHER_MEDS',
 'CUR_ILL',
 'HISTORY',
 'PRIOR_VAX',
 'SPLTTYPE',
 'TODAYS_DATE',
 'BIRTH_DEFECT',
 'OFC_VISIT',
 'ER_ED_VISIT',
 'ALLERGIES',
 'SYMPTOM1',
 'SYMPTOM2',
 'SYMPTOM3',
 'SYMPTOM4',
 'SYMPTOM5',
 'VAX_TYPE',
 'VAX_MANU',
 'VAX_LOT',
 'VAX_DOSE_SERIES',
 'VAX_ROUTE',
 'VAX_SITE',
 'VAX_NAME']

In [6]:
# Unique values in each column
vaers_df[vaers_cat].nunique()

RECVDATE              70
STATE                 60
SEX                    3
RPT_DATE              20
SYMPTOM_TEXT       27195
DIED                   1
DATEDIED              95
L_THREAT               1
ER_VISIT               1
HOSPITAL               1
X_STAY                 1
DISABLE                1
RECOVD                 3
VAX_DATE             278
ONSET_DATE           206
LAB_DATA            7726
V_ADMINBY              9
V_FUNDBY               4
OTHER_MEDS         11816
CUR_ILL             3518
HISTORY             9515
PRIOR_VAX           1033
SPLTTYPE            5895
TODAYS_DATE          104
BIRTH_DEFECT           1
OFC_VISIT              1
ER_ED_VISIT            1
ALLERGIES           6245
SYMPTOM1            2301
SYMPTOM2            2220
SYMPTOM3            2017
SYMPTOM4            1833
SYMPTOM5            1617
VAX_TYPE              40
VAX_MANU              14
VAX_LOT             1887
VAX_DOSE_SERIES        8
VAX_ROUTE              7
VAX_SITE               9
VAX_NAME              63


In [7]:
# DIED value counts
counts = vaers_df.DIED.value_counts()
counts

Y    2575
Name: DIED, dtype: int64

In [8]:
# LAB DATA value counts
counts = vaers_df.LAB_DATA.value_counts()
counts

None                                                                                                      2684
none                                                                                                      1557
no                                                                                                         222
NONE                                                                                                       192
None.                                                                                                      171
                                                                                                          ... 
observation and VS                                                                                           1
Test Date: 20201230; Test Name: COVID; Test Result: Positive                                                 1
01/04/2021 positive COVID-19 test                                                                            1
N

In [9]:
# OTHER MEDS value counts
counts = vaers_df.OTHER_MEDS.value_counts()
counts

None                                                             1866
none                                                              875
unknown                                                           360
Unknown                                                           359
No                                                                131
                                                                 ... 
birth control pills vit C                                           1
Bupropion, methylphenidate, oral contraceptives                     1
Symbicort, Xanax, Albuterol, Flomax, Polyethylene glycol            1
Citirizin (allergy medication), Probiotic, pre-natal vitamins       1
Vyvanse.                                                            1
Name: OTHER_MEDS, Length: 11816, dtype: int64

In [10]:
# CUR_ILL value counts
counts = vaers_df.CUR_ILL.value_counts()
counts

None                                                                       6322
none                                                                       3455
No                                                                          654
no                                                                          522
NONE                                                                        365
                                                                           ... 
C-diff                                                                        1
DM, HTN, anemia, gout, BPH, atrial fib, heart failure, CAD, CKD,              1
- covid-19                                                                    1
hypertension, diabetes mellitus type 2, hyperlipidemia, thyroid disease       1
OA of the knee                                                                1
Name: CUR_ILL, Length: 3518, dtype: int64

In [11]:
# HISTORY value counts
counts = vaers_df.HISTORY.value_counts()
counts

None                                                                                                                                                                                               3153
none                                                                                                                                                                                               1525
Comments: List of non-encoded Patient Relevant History: Patient Other Relevant History 1: None                                                                                                      676
Medical History/Concurrent Conditions: No adverse event (No reported medical history)                                                                                                               375
No                                                                                                                                                                                                  339


In [12]:
# PRIOR_VAX value counts
counts = vaers_df.PRIOR_VAX.value_counts()
counts

Flu                                                                                                                                 18
unknown                                                                                                                             13
Flu Vaccine                                                                                                                         10
COVID19 (Moderna) on 1/26/2021                                                                                                       9
yes                                                                                                                                  8
                                                                                                                                    ..
reports not feeling well after getting flu shot earlier this year                                                                    1
2020 Flu shot, same symptoms and resolve time, without 

In [13]:
#SPLTTYPE value counts
counts = vaers_df.SPLTTYPE.value_counts()
counts

USMODERNATX, INC.MOD20210    1291
vsafe                         514
VSAFE                          73
USMODERNATX, INC.MOD20200      49
USGLAXOSMITHKLINEUS2020GS      29
                             ... 
USPFIZER INC2021034604          1
USPFIZER INC2021145598          1
USPFIZER INC2021145204          1
USPFIZER INC2020521806          1
USPFIZER INC2021147928          1
Name: SPLTTYPE, Length: 5895, dtype: int64

In [14]:
#ALLERGIES value counts
counts = vaers_df.ALLERGIES.value_counts()
counts

None                          3369
none                          1619
NKA                            804
NKDA                           769
No                             401
                              ... 
Shellfish and hayfever           1
sulfa kiwi fiorcet banana        1
Wellbutrin , tree nuts           1
Bees, coconut, latex             1
Latex Peach Banana Avocado       1
Name: ALLERGIES, Length: 6245, dtype: int64

In [15]:
#SYMPTOM1 value counts
counts = vaers_df.SYMPTOM1.value_counts()
counts

Chills                     2358
Arthralgia                 1516
Headache                   1197
Injection site erythema    1176
Dizziness                  1175
                           ... 
Giant cell arteritis          1
Peripheral nerve injury       1
Base excess increased         1
Internal haemorrhage          1
IVth nerve paralysis          1
Name: SYMPTOM1, Length: 2301, dtype: int64

In [16]:
#SYMPTOM2 value counts
counts = vaers_df.SYMPTOM2.value_counts()
counts

Headache                           1683
Fatigue                            1301
Chills                             1118
Dizziness                           835
Pyrexia                             825
                                   ... 
Dissociative disorder                 1
Blood uric acid                       1
Duodenal ulcer                        1
Glucose tolerance test abnormal       1
Rheumatoid factor                     1
Name: SYMPTOM2, Length: 2220, dtype: int64

In [17]:
#VAX_LOT value counts
counts = vaers_df.VAX_LOT.value_counts()
counts

039K20A     1375
011J20A     1260
EK9231      1132
EK5730      1072
EH9899      1047
            ... 
039k0a         1
041L201A       1
DA9R2          1
039K209        1
EK4736         1
Name: VAX_LOT, Length: 1887, dtype: int64

In [18]:
# Drop LAB_DATA, OTHER MEDS, CUR_ILL, HISTORY, PRIOR_VAX, ALLERGIES - unless we use NLP, those columns can't be hot encoded
vaers_clean = vaers_df.drop(columns = ['LAB_DATA', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'ALLERGIES'])

In [19]:
# re-count Unique values in each column in new cleaned df
vaers_clean.nunique()

Unnamed: 0         40261
VAERS_ID           27955
RECVDATE              70
STATE                 60
AGE_YRS              118
CAGE_YR              105
CAGE_MO                7
SEX                    3
RPT_DATE              20
SYMPTOM_TEXT       27195
DIED                   1
DATEDIED              95
L_THREAT               1
ER_VISIT               1
HOSPITAL               1
HOSPDAYS              33
X_STAY                 1
DISABLE                1
RECOVD                 3
VAX_DATE             278
ONSET_DATE           206
NUMDAYS              115
V_ADMINBY              9
V_FUNDBY               4
SPLTTYPE            5895
FORM_VERS              2
TODAYS_DATE          104
BIRTH_DEFECT           1
OFC_VISIT              1
ER_ED_VISIT            1
SYMPTOM1            2301
SYMPTOMVERSION1        2
SYMPTOM2            2220
SYMPTOMVERSION2        2
SYMPTOM3            2017
SYMPTOMVERSION3        2
SYMPTOM4            1833
SYMPTOMVERSION4        2
SYMPTOM5            1617
SYMPTOMVERSION5        2


# One Hot Encoder

In [28]:
# replace missing data with Simple Inputer
imp = SimpleImputer(strategy='constant')

vaers_imp = pd.DataFrame(imp.fit_transform(vaers_clean))
vaers_imp.columns = vaers_clean.columns
vaers_imp

Unnamed: 0.1,Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,...,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,0,916600,01/01/2021,TX,33.0,33.0,missing_value,F,missing_value,Right side of epiglottis swelled up and hinder...,...,missing_value,missing_value,missing_value,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,1,916601,01/01/2021,CA,73.0,73.0,missing_value,F,missing_value,Approximately 30 min post vaccination administ...,...,missing_value,missing_value,missing_value,COVID19,MODERNA,025L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
2,2,916602,01/01/2021,WA,23.0,23.0,missing_value,F,missing_value,"About 15 minutes after receiving the vaccine, ...",...,23.1,missing_value,missing_value,COVID19,PFIZER\BIONTECH,EL1284,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,3,916603,01/01/2021,WA,58.0,58.0,missing_value,F,missing_value,"extreme fatigue, dizziness,. could not lift my...",...,missing_value,missing_value,missing_value,COVID19,MODERNA,unknown,UNK,missing_value,missing_value,COVID19 (COVID19 (MODERNA))
4,4,916604,01/01/2021,TX,47.0,47.0,missing_value,F,missing_value,"Injection site swelling, redness, warm to the ...",...,23.1,missing_value,missing_value,COVID19,MODERNA,missing_value,1,IM,LA,COVID19 (COVID19 (MODERNA))
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40256,40256,1092595,03/11/2021,TX,70.0,70.0,missing_value,F,missing_value,A high risk employee received 1st vaccine dose...,...,missing_value,missing_value,missing_value,COVID19,MODERNA,025J20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
40257,40257,1092651,03/11/2021,CA,77.0,77.0,missing_value,M,missing_value,DEATH,...,missing_value,missing_value,missing_value,COVID19,MODERNA,0124214,1,IM,LA,COVID19 (COVID19 (MODERNA))
40258,40258,1092653,03/11/2021,CA,88.0,88.0,missing_value,F,missing_value,Patient received the vaccine on the evening of...,...,missing_value,missing_value,missing_value,COVID19,MODERNA,unknown,2,SYR,AR,COVID19 (COVID19 (MODERNA))
40259,40259,1092737,03/11/2021,ME,69.0,69.0,missing_value,M,missing_value,"Sudden death. Alone at home, found on floor 4 ...",...,missing_value,missing_value,missing_value,COVID19,MODERNA,missing_value,1,UN,AR,COVID19 (COVID19 (MODERNA))


In [29]:
# Recriate the vares_clean categorical list
vaers_cat = vaers_imp.dtypes[vaers_imp.dtypes == 'object'].index.tolist()
vaers_cat

['Unnamed: 0',
 'VAERS_ID',
 'RECVDATE',
 'STATE',
 'AGE_YRS',
 'CAGE_YR',
 'CAGE_MO',
 'SEX',
 'RPT_DATE',
 'SYMPTOM_TEXT',
 'DIED',
 'DATEDIED',
 'L_THREAT',
 'ER_VISIT',
 'HOSPITAL',
 'HOSPDAYS',
 'X_STAY',
 'DISABLE',
 'RECOVD',
 'VAX_DATE',
 'ONSET_DATE',
 'NUMDAYS',
 'V_ADMINBY',
 'V_FUNDBY',
 'SPLTTYPE',
 'FORM_VERS',
 'TODAYS_DATE',
 'BIRTH_DEFECT',
 'OFC_VISIT',
 'ER_ED_VISIT',
 'SYMPTOM1',
 'SYMPTOMVERSION1',
 'SYMPTOM2',
 'SYMPTOMVERSION2',
 'SYMPTOM3',
 'SYMPTOMVERSION3',
 'SYMPTOM4',
 'SYMPTOMVERSION4',
 'SYMPTOM5',
 'SYMPTOMVERSION5',
 'VAX_TYPE',
 'VAX_MANU',
 'VAX_LOT',
 'VAX_DOSE_SERIES',
 'VAX_ROUTE',
 'VAX_SITE',
 'VAX_NAME']

In [22]:
# OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

#Fit and transform the OneHotEncoder
vaers_enc_df = pd.DataFrame(enc.fit_transform(vaers_clean[vaers_cat]))

#Add the encoded variable names to the DF
vaers_enc_df.columns = enc.get_feature_names(vaers_cat)
vaers_enc_df.head()

ValueError: Input contains NaN

In [None]:
# Merge with vaers_clean and drop original cat. list
vaers_df = vaers_clean.merge(vaers_enc_df, left_index=True, right_index=True)
vaers_df = vaers_df.drop(vaers_cat,1)


In [None]:
# clean vaers_df dropping Unamed first columns (= index) and DIED_None
vaers_df = vaers_df.drop(columns = 'Unnamed: 0')


# Split Data

In [None]:
# Split data into features and targets
y = vaers_df['DIED_Y']
X = vaers_df.drop(columns = 'DIED_None')

In [None]:
X

In [None]:
# Split into test and training
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)