In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin
import pandas as pd
import tensorflow as tf


# Import and clean df

In [2]:
# Load the pre-cleaned VAERS extract from the working directory.
# low_memory=False makes pandas parse the file in one pass, avoiding mixed-type inference.
vaers_df = pd.read_csv(filepath_or_buffer='VAERSDATA.csv', low_memory=False)
vaers_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,...,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,0,916600,01/01/2021,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,...,,,,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,1,916601,01/01/2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,...,,,,COVID19,MODERNA,025L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
2,2,916602,01/01/2021,WA,23.0,23.0,,F,,"About 15 minutes after receiving the vaccine, ...",...,23.1,,,COVID19,PFIZER\BIONTECH,EL1284,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,3,916603,01/01/2021,WA,58.0,58.0,,F,,"extreme fatigue, dizziness,. could not lift my...",...,,,,COVID19,MODERNA,unknown,UNK,,,COVID19 (COVID19 (MODERNA))
4,4,916604,01/01/2021,TX,47.0,47.0,,F,,"Injection site swelling, redness, warm to the ...",...,23.1,,,COVID19,MODERNA,,1,IM,LA,COVID19 (COVID19 (MODERNA))


In [3]:
# Inspect the dtype pandas inferred for every column of vaers_df
vaers_df.dtypes

Unnamed: 0           int64
VAERS_ID             int64
RECVDATE            object
STATE               object
AGE_YRS            float64
CAGE_YR            float64
CAGE_MO            float64
SEX                 object
RPT_DATE            object
SYMPTOM_TEXT        object
DIED                object
DATEDIED            object
L_THREAT            object
ER_VISIT            object
HOSPITAL            object
HOSPDAYS           float64
X_STAY              object
DISABLE             object
RECOVD              object
VAX_DATE            object
ONSET_DATE          object
NUMDAYS            float64
LAB_DATA            object
V_ADMINBY           object
V_FUNDBY            object
OTHER_MEDS          object
CUR_ILL             object
HISTORY             object
PRIOR_VAX           object
SPLTTYPE            object
FORM_VERS            int64
TODAYS_DATE         object
BIRTH_DEFECT        object
OFC_VISIT           object
ER_ED_VISIT         object
ALLERGIES           object
SYMPTOM1            object
S

In [4]:
# Convert the date columns (read from the CSV as object dtype) to datetime.
# The dtype listing shown for this cell and the date values shown by later cells
# rely on this conversion, so it must actually run on a fresh kernel.
date_cols = ['RECVDATE', 'RPT_DATE', 'DATEDIED', 'VAX_DATE', 'ONSET_DATE', 'TODAYS_DATE']
for date_col in date_cols:
    # pd.to_datetime leaves an already-converted column unchanged, so re-running is safe
    vaers_df[date_col] = pd.to_datetime(vaers_df[date_col])

vaers_df.dtypes

Unnamed: 0                  int64
VAERS_ID                    int64
RECVDATE           datetime64[ns]
STATE                      object
AGE_YRS                   float64
CAGE_YR                   float64
CAGE_MO                   float64
SEX                        object
RPT_DATE           datetime64[ns]
SYMPTOM_TEXT               object
DIED                       object
DATEDIED           datetime64[ns]
L_THREAT                   object
ER_VISIT                   object
HOSPITAL                   object
HOSPDAYS                  float64
X_STAY                     object
DISABLE                    object
RECOVD                     object
VAX_DATE           datetime64[ns]
ONSET_DATE         datetime64[ns]
NUMDAYS                   float64
LAB_DATA                   object
V_ADMINBY                  object
V_FUNDBY                   object
OTHER_MEDS                 object
CUR_ILL                    object
HISTORY                    object
PRIOR_VAX                  object
SPLTTYPE      

In [5]:
# Names of all object-dtype columns, treated as the categorical variables
vaers_cat = vaers_df.select_dtypes(include='object').columns.tolist()
vaers_cat

['STATE',
 'SEX',
 'SYMPTOM_TEXT',
 'DIED',
 'L_THREAT',
 'ER_VISIT',
 'HOSPITAL',
 'X_STAY',
 'DISABLE',
 'RECOVD',
 'LAB_DATA',
 'V_ADMINBY',
 'V_FUNDBY',
 'OTHER_MEDS',
 'CUR_ILL',
 'HISTORY',
 'PRIOR_VAX',
 'SPLTTYPE',
 'BIRTH_DEFECT',
 'OFC_VISIT',
 'ER_ED_VISIT',
 'ALLERGIES',
 'SYMPTOM1',
 'SYMPTOM2',
 'SYMPTOM3',
 'SYMPTOM4',
 'SYMPTOM5',
 'VAX_TYPE',
 'VAX_MANU',
 'VAX_LOT',
 'VAX_DOSE_SERIES',
 'VAX_ROUTE',
 'VAX_SITE',
 'VAX_NAME']

In [6]:
# Number of distinct values in each categorical column
vaers_df.loc[:, vaers_cat].nunique()

STATE                 60
SEX                    3
SYMPTOM_TEXT       27195
DIED                   1
L_THREAT               1
ER_VISIT               1
HOSPITAL               1
X_STAY                 1
DISABLE                1
RECOVD                 3
LAB_DATA            7726
V_ADMINBY              9
V_FUNDBY               4
OTHER_MEDS         11816
CUR_ILL             3518
HISTORY             9515
PRIOR_VAX           1033
SPLTTYPE            5895
BIRTH_DEFECT           1
OFC_VISIT              1
ER_ED_VISIT            1
ALLERGIES           6245
SYMPTOM1            2301
SYMPTOM2            2220
SYMPTOM3            2017
SYMPTOM4            1833
SYMPTOM5            1617
VAX_TYPE              40
VAX_MANU              14
VAX_LOT             1887
VAX_DOSE_SERIES        8
VAX_ROUTE              7
VAX_SITE               9
VAX_NAME              63
dtype: int64

In [7]:
# Frequency of each distinct value in the DIED column
counts = vaers_df['DIED'].value_counts()
counts

Y    2575
Name: DIED, dtype: int64

In [8]:
# Frequency of each distinct value in the LAB_DATA column
counts = vaers_df['LAB_DATA'].value_counts()
counts

None                                                                                                                                                                                                                                                                                                                         2684
none                                                                                                                                                                                                                                                                                                                         1557
no                                                                                                                                                                                                                                                                                                                            222
NONE                              

In [9]:
# Frequency of each distinct value in the OTHER_MEDS column
counts = vaers_df['OTHER_MEDS'].value_counts()
counts

None                                                                                                                                                                                                                                                1866
none                                                                                                                                                                                                                                                 875
unknown                                                                                                                                                                                                                                              360
Unknown                                                                                                                                                                                                                                              359
No  

In [10]:
# Frequency of each distinct value in the CUR_ILL column
counts = vaers_df['CUR_ILL'].value_counts()
counts

None                                                       6322
none                                                       3455
No                                                          654
no                                                          522
NONE                                                        365
                                                           ... 
Parkinson?s, diabetes                                         1
Vaginal labor and delivery 2/5/21                             1
Antiphospholipid syndrome; Raynaud's disease; Sjogren's       1
Blood pressure high; Diabetes; Rheumatoid arthritis           1
Covid November 8th Flu shot December 4th                      1
Name: CUR_ILL, Length: 3518, dtype: int64

In [11]:
# Frequency of each distinct value in the HISTORY column
counts = vaers_df['HISTORY'].value_counts()
counts

None                                                                                                                                                                  3153
none                                                                                                                                                                  1525
Comments: List of non-encoded Patient Relevant History: Patient Other Relevant History 1: None                                                                         676
Medical History/Concurrent Conditions: No adverse event (No reported medical history)                                                                                  375
No                                                                                                                                                                     339
                                                                                                                                                 

In [12]:
# Frequency of each distinct value in the PRIOR_VAX column
counts = vaers_df['PRIOR_VAX'].value_counts()
counts

Flu                                                                                      18
unknown                                                                                  13
Flu Vaccine                                                                              10
COVID19 (Moderna) on 1/26/2021                                                            9
Dizziness, mild chest pain, mid spine pain, back of neck pain, headaches, body aches,     8
                                                                                         ..
Sore arm for 2-3 days after other vaccines (flu, shingles, hepatitis)                     1
Moderate injection site soreness with 1st COVID vaccine                                   1
reports flu like  symptoms p flu vaccine                                                  1
First COVID 19 vaccine had similar symptoms                                               1
Sore arm from seasonal flu vaccine, similar to this response                    

In [13]:
# Frequency of each distinct value in the SPLTTYPE column
counts = vaers_df['SPLTTYPE'].value_counts()
counts

USMODERNATX, INC.MOD20210    1291
vsafe                         514
VSAFE                          73
USMODERNATX, INC.MOD20200      49
USGLAXOSMITHKLINEUS2020GS      29
                             ... 
USPFIZER INC2021081408          1
USPFIZER INC2021050989          1
USPFIZER INC2020516266          1
USPFIZER INC2021149307          1
USPFIZER INC2021133618          1
Name: SPLTTYPE, Length: 5895, dtype: int64

In [14]:
# Frequency of each distinct value in the ALLERGIES column
counts = vaers_df['ALLERGIES'].value_counts()
counts

None                                                                                                                                                                                                                                                             3369
none                                                                                                                                                                                                                                                             1619
NKA                                                                                                                                                                                                                                                               804
NKDA                                                                                                                                                                                                                  

In [15]:
# Frequency of each distinct value in the SYMPTOM1 column
counts = vaers_df['SYMPTOM1'].value_counts()
counts

Chills                            2358
Arthralgia                        1516
Headache                          1197
Injection site erythema           1176
Dizziness                         1175
                                  ... 
Herpes virus test                    1
Instillation site paraesthesia       1
Uterine haemorrhage                  1
Injection site erosion               1
Jaw fracture                         1
Name: SYMPTOM1, Length: 2301, dtype: int64

In [16]:
# Frequency of each distinct value in the SYMPTOM2 column
counts = vaers_df['SYMPTOM2'].value_counts()
counts

Headache                           1683
Fatigue                            1301
Chills                             1118
Dizziness                           835
Pyrexia                             825
                                   ... 
Hydronephrosis                        1
Renal cyst                            1
Embolism                              1
Vascular imaging                      1
Glucose tolerance test abnormal       1
Name: SYMPTOM2, Length: 2220, dtype: int64

In [17]:
# Frequency of each distinct value in the VAX_LOT column
counts = vaers_df['VAX_LOT'].value_counts()
counts

039K20A    1375
011J20A    1260
EK9231     1132
EK5730     1072
EH9899     1047
           ... 
2SM24         1
030M30A       1
013LZOA       1
O25J20A       1
EL3925        1
Name: VAX_LOT, Length: 1887, dtype: int64

In [18]:
# Free-text columns: unless we use NLP they can't be one-hot encoded, so remove them
free_text_cols = ['LAB_DATA', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'ALLERGIES']
vaers_drop = vaers_df.drop(columns=free_text_cols)

In [19]:
# 'Unnamed: 0' is the same as the index, so drop it
vaers_clean = vaers_drop.drop(columns=['Unnamed: 0'])

In [20]:
# Re-count the unique values per column in the cleaned DataFrame (vaers_clean)
vaers_clean.nunique()

VAERS_ID           27955
RECVDATE              70
STATE                 60
AGE_YRS              118
CAGE_YR              105
CAGE_MO                7
SEX                    3
RPT_DATE              20
SYMPTOM_TEXT       27195
DIED                   1
DATEDIED              95
L_THREAT               1
ER_VISIT               1
HOSPITAL               1
HOSPDAYS              33
X_STAY                 1
DISABLE                1
RECOVD                 3
VAX_DATE             278
ONSET_DATE           206
NUMDAYS              115
V_ADMINBY              9
V_FUNDBY               4
SPLTTYPE            5895
FORM_VERS              2
TODAYS_DATE          104
BIRTH_DEFECT           1
OFC_VISIT              1
ER_ED_VISIT            1
SYMPTOM1            2301
SYMPTOMVERSION1        2
SYMPTOM2            2220
SYMPTOMVERSION2        2
SYMPTOM3            2017
SYMPTOMVERSION3        2
SYMPTOM4            1833
SYMPTOMVERSION4        2
SYMPTOM5            1617
SYMPTOMVERSION5        2
VAX_TYPE              40


In [23]:
# Locate the candidate target (y) columns: every column whose name contains 'DIED'
died_cols = list(filter(lambda name: 'DIED' in name, vaers_df.columns))
died_cols

['DIED', 'DATEDIED']

In [24]:
# Prepare target column (y): the only recorded value of DIED is 'Y', so a missing
# entry is treated as 'N'.
# The result is assigned back rather than calling fillna(inplace=True) on the selected
# column: that chained in-place form can act on a temporary object and leave
# vaers_clean unchanged (pandas warns about it and Copy-on-Write makes it a no-op).
vaers_clean['DIED'] = vaers_clean['DIED'].fillna('N')
vaers_clean

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,916600,2021-01-01,TX,33.0,33.0,,F,NaT,Right side of epiglottis swelled up and hinder...,N,...,,,,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,916601,2021-01-01,CA,73.0,73.0,,F,NaT,Approximately 30 min post vaccination administ...,N,...,,,,COVID19,MODERNA,025L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
2,916602,2021-01-01,WA,23.0,23.0,,F,NaT,"About 15 minutes after receiving the vaccine, ...",N,...,23.1,,,COVID19,PFIZER\BIONTECH,EL1284,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,916603,2021-01-01,WA,58.0,58.0,,F,NaT,"extreme fatigue, dizziness,. could not lift my...",N,...,,,,COVID19,MODERNA,unknown,UNK,,,COVID19 (COVID19 (MODERNA))
4,916604,2021-01-01,TX,47.0,47.0,,F,NaT,"Injection site swelling, redness, warm to the ...",N,...,23.1,,,COVID19,MODERNA,,1,IM,LA,COVID19 (COVID19 (MODERNA))
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40256,1092595,2021-03-11,TX,70.0,70.0,,F,NaT,A high risk employee received 1st vaccine dose...,Y,...,,,,COVID19,MODERNA,025J20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
40257,1092651,2021-03-11,CA,77.0,77.0,,M,NaT,DEATH,Y,...,,,,COVID19,MODERNA,0124214,1,IM,LA,COVID19 (COVID19 (MODERNA))
40258,1092653,2021-03-11,CA,88.0,88.0,,F,NaT,Patient received the vaccine on the evening of...,Y,...,,,,COVID19,MODERNA,unknown,2,SYR,AR,COVID19 (COVID19 (MODERNA))
40259,1092737,2021-03-11,ME,69.0,69.0,,M,NaT,"Sudden death. Alone at home, found on floor 4 ...",Y,...,,,,COVID19,MODERNA,,1,UN,AR,COVID19 (COVID19 (MODERNA))


# One Hot Encoder

In [25]:
# Replace missing values with the placeholder category 'None', but only in the
# object (string) columns. Filling every column would turn the numeric and datetime
# columns into object dtype holding the string 'None', which cannot be scaled.
# NOTE(review): numeric columns keep their NaN values and still need imputing
# (e.g. with SimpleImputer) before a model is fitted on them.
object_cols = vaers_clean.select_dtypes(include='object').columns
vaers_clean[object_cols] = vaers_clean[object_cols].fillna('None')
vaers_clean


Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,916600,2021-01-01,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,N,...,,,,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,916601,2021-01-01,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,N,...,,,,COVID19,MODERNA,025L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
2,916602,2021-01-01,WA,23.0,23.0,,F,,"About 15 minutes after receiving the vaccine, ...",N,...,23.1,,,COVID19,PFIZER\BIONTECH,EL1284,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,916603,2021-01-01,WA,58.0,58.0,,F,,"extreme fatigue, dizziness,. could not lift my...",N,...,,,,COVID19,MODERNA,unknown,UNK,,,COVID19 (COVID19 (MODERNA))
4,916604,2021-01-01,TX,47.0,47.0,,F,,"Injection site swelling, redness, warm to the ...",N,...,23.1,,,COVID19,MODERNA,,1,IM,LA,COVID19 (COVID19 (MODERNA))
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40256,1092595,2021-03-11,TX,70.0,70.0,,F,,A high risk employee received 1st vaccine dose...,Y,...,,,,COVID19,MODERNA,025J20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
40257,1092651,2021-03-11,CA,77.0,77.0,,M,,DEATH,Y,...,,,,COVID19,MODERNA,0124214,1,IM,LA,COVID19 (COVID19 (MODERNA))
40258,1092653,2021-03-11,CA,88.0,88.0,,F,,Patient received the vaccine on the evening of...,Y,...,,,,COVID19,MODERNA,unknown,2,SYR,AR,COVID19 (COVID19 (MODERNA))
40259,1092737,2021-03-11,ME,69.0,69.0,,M,,"Sudden death. Alone at home, found on floor 4 ...",Y,...,,,,COVID19,MODERNA,,1,UN,AR,COVID19 (COVID19 (MODERNA))


In [30]:
# Recreate the categorical column list from vaers_clean itself.
# A mask taken from vaers_df.dtypes depends on implicit alignment between two different
# frames, and vaers_df is rebound to the merged frame later in the notebook, so a cell
# built on it cannot be re-run.
# A column counts as categorical when it has object dtype and holds nothing but strings;
# this skips numeric/date columns that only became object dtype through a 'None' placeholder.
vaers_cat = [
    col for col in vaers_clean.select_dtypes(include='object').columns
    if vaers_clean[col].map(lambda value: isinstance(value, str)).all()
]
vaers_cat

['STATE',
 'SEX',
 'SYMPTOM_TEXT',
 'DIED',
 'L_THREAT',
 'ER_VISIT',
 'HOSPITAL',
 'X_STAY',
 'DISABLE',
 'RECOVD',
 'V_ADMINBY',
 'V_FUNDBY',
 'SPLTTYPE',
 'BIRTH_DEFECT',
 'OFC_VISIT',
 'ER_ED_VISIT',
 'SYMPTOM1',
 'SYMPTOM2',
 'SYMPTOM3',
 'SYMPTOM4',
 'SYMPTOM5',
 'VAX_TYPE',
 'VAX_MANU',
 'VAX_LOT',
 'VAX_DOSE_SERIES',
 'VAX_ROUTE',
 'VAX_SITE',
 'VAX_NAME']

In [31]:
# OneHotEncoder instance. The default (sparse) output is densified explicitly below,
# because the keyword that disables sparsity was renamed between scikit-learn releases
# (`sparse` -> `sparse_output`) and the old spelling is no longer accepted.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder, then convert the sparse result to a dense array
encoded = enc.fit_transform(vaers_clean[vaers_cat]).toarray()

# Encoded variable names: prefer get_feature_names_out and fall back to the older
# get_feature_names on releases that do not provide it yet
get_names = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names

# Reuse vaers_clean's index so the index-based merge in the next cell pairs the right rows
vaers_enc_df = pd.DataFrame(encoded, columns=get_names(vaers_cat), index=vaers_clean.index)
vaers_enc_df.head()

Unnamed: 0,STATE_AK,STATE_AL,STATE_AR,STATE_AS,STATE_AZ,STATE_CA,STATE_CO,STATE_CT,STATE_Ca,STATE_DC,...,VAX_NAME_TDAP (NO BRAND NAME),VAX_NAME_TETANUS TOXOID (NO BRAND NAME),VAX_NAME_TYPHOID LIVE ORAL TY21A (VIVOTIF),VAX_NAME_VACCINE NOT SPECIFIED (NO BRAND NAME),VAX_NAME_VACCINE NOT SPECIFIED (OTHER),VAX_NAME_VARICELLA (VARIVAX),VAX_NAME_YELLOW FEVER (YF-VAX),VAX_NAME_ZOSTER (NO BRAND NAME),VAX_NAME_ZOSTER (SHINGRIX),VAX_NAME_ZOSTER LIVE (ZOSTAVAX)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Merge the encoded columns onto vaers_clean (row index to row index), then drop the
# original categorical columns. `columns=` is used because DataFrame.drop no longer
# accepts the axis as a positional argument in pandas 2.x.
vaers_df = vaers_clean.merge(vaers_enc_df, left_index=True, right_index=True)
vaers_df = vaers_df.drop(columns=vaers_cat)

# Split and Scale Data

In [None]:
# Split data into features and targets
y = vaers_df['DIED_Y']

# Remove every column that gives the target away, not only DIED_Y:
#   * DIED_N is the one-hot complement of DIED_Y
#   * DATEDIED (raw, or one-hot encoded as DATEDIED_*) is presumably only filled in
#     for patients who died -- NOTE(review): confirm against the VAERS data guide
# NOTE(review): X may still hold non-numeric columns (dates, identifiers) that
# StandardScaler cannot handle; verify with X.dtypes before scaling.
leak_cols = [col for col in vaers_df.columns if col.startswith(('DIED_', 'DATEDIED'))]
X = vaers_df.drop(columns=leak_cols)

In [None]:
# Count how many rows fall into each class of the target y
y.value_counts()

In [None]:
# Partition features and target into training and test subsets (fixed seed for repeatability)
split_parts = train_test_split(X, y, random_state=78)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Create the scaler and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both the training and the test features
X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))

# Logistic Regression

## Resampling the data

In [None]:
# Counting values in y_train (class balance of the training target)
y_train.value_counts()


In [None]:
# Build the logistic regression model from an explicit parameter set
logreg_params = {'solver': 'lbfgs', 'max_iter': 200, 'random_state': 78}
classifier = LogisticRegression(**logreg_params)
classifier

In [None]:
# Train the data
classifier.fit(X_train, y_train)

In [None]:
# Evaluate
y_pred = classifier.predict(X_test)
print(f' Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}')