In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin
import pandas as pd
import tensorflow as tf


# Import and clean df

In [2]:
# Import cleaned VAERS dataset
vaers_df = pd.read_csv('VAERSDATA.csv', low_memory=False)
vaers_df.head()

Unnamed: 0.1,Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,...,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,0,916600,01/01/2021,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,...,,,,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,1,916601,01/01/2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,...,,,,COVID19,MODERNA,025L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
2,2,916602,01/01/2021,WA,23.0,23.0,,F,,"About 15 minutes after receiving the vaccine, ...",...,23.1,,,COVID19,PFIZER\BIONTECH,EL1284,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,3,916603,01/01/2021,WA,58.0,58.0,,F,,"extreme fatigue, dizziness,. could not lift my...",...,,,,COVID19,MODERNA,unknown,UNK,,,COVID19 (COVID19 (MODERNA))
4,4,916604,01/01/2021,TX,47.0,47.0,,F,,"Injection site swelling, redness, warm to the ...",...,23.1,,,COVID19,MODERNA,,1,IM,LA,COVID19 (COVID19 (MODERNA))


In [3]:
# Check vaers_df column types
vaers_df.dtypes

Unnamed: 0           int64
VAERS_ID             int64
RECVDATE            object
STATE               object
AGE_YRS            float64
CAGE_YR            float64
CAGE_MO            float64
SEX                 object
RPT_DATE            object
SYMPTOM_TEXT        object
DIED                object
DATEDIED            object
L_THREAT            object
ER_VISIT            object
HOSPITAL            object
HOSPDAYS           float64
X_STAY              object
DISABLE             object
RECOVD              object
VAX_DATE            object
ONSET_DATE          object
NUMDAYS            float64
LAB_DATA            object
V_ADMINBY           object
V_FUNDBY            object
OTHER_MEDS          object
CUR_ILL             object
HISTORY             object
PRIOR_VAX           object
SPLTTYPE            object
FORM_VERS            int64
TODAYS_DATE         object
BIRTH_DEFECT        object
OFC_VISIT           object
ER_ED_VISIT         object
ALLERGIES           object
SYMPTOM1            object
S

In [4]:
# change dates(objects)columns to datetime
#vaers_df['RECVDATE'] = pd.to_datetime(vaers_df['RECVDATE'])
#vaers_df['RPT_DATE'] = pd.to_datetime(vaers_df['RPT_DATE'])
#vaers_df['DATEDIED'] = pd.to_datetime(vaers_df['DATEDIED'])
#vaers_df['VAX_DATE'] = pd.to_datetime(vaers_df['VAX_DATE'])
#vaers_df['ONSET_DATE'] = pd.to_datetime(vaers_df['ONSET_DATE'])
#vaers_df['TODAYS_DATE'] = pd.to_datetime(vaers_df['TODAYS_DATE'])

#vaers_df.dtypes

In [5]:
# Categorical variable list
vaers_cat = vaers_df.dtypes[vaers_df.dtypes == 'object'].index.tolist()
vaers_cat

['RECVDATE',
 'STATE',
 'SEX',
 'RPT_DATE',
 'SYMPTOM_TEXT',
 'DIED',
 'DATEDIED',
 'L_THREAT',
 'ER_VISIT',
 'HOSPITAL',
 'X_STAY',
 'DISABLE',
 'RECOVD',
 'VAX_DATE',
 'ONSET_DATE',
 'LAB_DATA',
 'V_ADMINBY',
 'V_FUNDBY',
 'OTHER_MEDS',
 'CUR_ILL',
 'HISTORY',
 'PRIOR_VAX',
 'SPLTTYPE',
 'TODAYS_DATE',
 'BIRTH_DEFECT',
 'OFC_VISIT',
 'ER_ED_VISIT',
 'ALLERGIES',
 'SYMPTOM1',
 'SYMPTOM2',
 'SYMPTOM3',
 'SYMPTOM4',
 'SYMPTOM5',
 'VAX_TYPE',
 'VAX_MANU',
 'VAX_LOT',
 'VAX_DOSE_SERIES',
 'VAX_ROUTE',
 'VAX_SITE',
 'VAX_NAME']

In [6]:
# Unique values in each column
vaers_df[vaers_cat].nunique()

RECVDATE              70
STATE                 60
SEX                    3
RPT_DATE              20
SYMPTOM_TEXT       27195
DIED                   1
DATEDIED              95
L_THREAT               1
ER_VISIT               1
HOSPITAL               1
X_STAY                 1
DISABLE                1
RECOVD                 3
VAX_DATE             278
ONSET_DATE           206
LAB_DATA            7726
V_ADMINBY              9
V_FUNDBY               4
OTHER_MEDS         11816
CUR_ILL             3518
HISTORY             9515
PRIOR_VAX           1033
SPLTTYPE            5895
TODAYS_DATE          104
BIRTH_DEFECT           1
OFC_VISIT              1
ER_ED_VISIT            1
ALLERGIES           6245
SYMPTOM1            2301
SYMPTOM2            2220
SYMPTOM3            2017
SYMPTOM4            1833
SYMPTOM5            1617
VAX_TYPE              40
VAX_MANU              14
VAX_LOT             1887
VAX_DOSE_SERIES        8
VAX_ROUTE              7
VAX_SITE               9
VAX_NAME              63


In [7]:
# DIED value counts
counts = vaers_df.DIED.value_counts()
counts

Y    2575
Name: DIED, dtype: int64

In [8]:
# LAB DATA value counts
counts = vaers_df.LAB_DATA.value_counts()
counts

None                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                2684
none                                                                                                                                                                                                                                                                                                                                                                                                           

In [9]:
# OTHER MEDS value counts
counts = vaers_df.OTHER_MEDS.value_counts()
counts

None                                                                                                                                                                                                    1866
none                                                                                                                                                                                                     875
unknown                                                                                                                                                                                                  360
Unknown                                                                                                                                                                                                  359
No                                                                                                                                                                                  

In [10]:
# CUR_ILL value counts
counts = vaers_df.CUR_ILL.value_counts()
counts

None                                                                                                                                                                                              6322
none                                                                                                                                                                                              3455
No                                                                                                                                                                                                 654
no                                                                                                                                                                                                 522
NONE                                                                                                                                                                                               365
     

In [11]:
# HISTORY value counts
counts = vaers_df.HISTORY.value_counts()
counts

None                                                                                                                                                                                                                                                                                              3153
none                                                                                                                                                                                                                                                                                              1525
Comments: List of non-encoded Patient Relevant History: Patient Other Relevant History 1: None                                                                                                                                                                                                     676
Medical History/Concurrent Conditions: No adverse event (No reported medical history)                              

In [12]:
# PRIOR_VAX value counts
counts = vaers_df.PRIOR_VAX.value_counts()
counts

Flu                                                                                                                          18
unknown                                                                                                                      13
Flu Vaccine                                                                                                                  10
COVID19 (Moderna) on 1/26/2021                                                                                                9
Dizziness, mild chest pain, mid spine pain, back of neck pain, headaches, body aches,                                         8
                                                                                                                             ..
trumenba, 2017, age 23 - severe chills, fatigue, sweats following vaccine in left arm; soreness. resolved after 1-2 days.     1
vagal response similar to today                                                                         

In [13]:
#SPLTTYPE value counts
counts = vaers_df.SPLTTYPE.value_counts()
counts

USMODERNATX, INC.MOD20210    1291
vsafe                         514
VSAFE                          73
USMODERNATX, INC.MOD20200      49
USGLAXOSMITHKLINEUS2020GS      29
                             ... 
USPFIZER INC2020517208          1
USPFIZER INC2021150658          1
USPFIZER INC2021018164          1
USPFIZER INC2021111509          1
USPFIZER INC2021114966          1
Name: SPLTTYPE, Length: 5895, dtype: int64

In [14]:
#ALLERGIES value counts
counts = vaers_df.ALLERGIES.value_counts()
counts

None                                                                                 3369
none                                                                                 1619
NKA                                                                                   804
NKDA                                                                                  769
No                                                                                    401
                                                                                     ... 
Codeine intolerance, demerol lupron chichen , baker's yeast, blue cheese, peanuts       1
Sulfa-hypotension Percocet-itching                                                      1
PCN, Omnicef,  Bees = anaphylaxis Vicodin=vomit Neosporin=Hives                         1
PCN, Sulfas, erythromycin, ionic contrast dye                                           1
Indocyn                                                                                 1
Name: ALLE

In [15]:
#SYMPTOM1 value counts
counts = vaers_df.SYMPTOM1.value_counts()
counts

Chills                         2358
Arthralgia                     1516
Headache                       1197
Injection site erythema        1176
Dizziness                      1175
                               ... 
Cranial nerve paralysis           1
Respiratory tract infection       1
Cardio-respiratory distress       1
Adjustment disorder               1
Eye inflammation                  1
Name: SYMPTOM1, Length: 2301, dtype: int64

In [16]:
#SYMPTOM2 value counts
counts = vaers_df.SYMPTOM2.value_counts()
counts

Headache                         1683
Fatigue                          1301
Chills                           1118
Dizziness                         835
Pyrexia                           825
                                 ... 
Allergic respiratory symptom        1
Cutaneous symptom                   1
Postictal state                     1
Blood immunoglobulin E normal       1
Thoracic vertebral fracture         1
Name: SYMPTOM2, Length: 2220, dtype: int64

In [17]:
#VAX_LOT value counts
counts = vaers_df.VAX_LOT.value_counts()
counts

039K20A    1375
011J20A    1260
EK9231     1132
EK5730     1072
EH9899     1047
           ... 
elo142        1
Es1685        1
E19261        1
EX5730        1
EL9207        1
Name: VAX_LOT, Length: 1887, dtype: int64

In [18]:
# Drop LAB_DATA, OTHER MEDS, CUR_ILL, HISTORY, PRIOR_VAX, ALLERGIES - unless we use NLP, those columns can't be hot encoded
vaers_drop = vaers_df.drop(columns = ['LAB_DATA', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'ALLERGIES'])

In [19]:
# Drop Unamed: ) because it is the same as the index
vaers_clean = vaers_drop.drop(columns = 'Unnamed: 0')

In [20]:
# re-count Unique values in each column in new cleaned df
vaers_clean.nunique()

VAERS_ID           27955
RECVDATE              70
STATE                 60
AGE_YRS              118
CAGE_YR              105
CAGE_MO                7
SEX                    3
RPT_DATE              20
SYMPTOM_TEXT       27195
DIED                   1
DATEDIED              95
L_THREAT               1
ER_VISIT               1
HOSPITAL               1
HOSPDAYS              33
X_STAY                 1
DISABLE                1
RECOVD                 3
VAX_DATE             278
ONSET_DATE           206
NUMDAYS              115
V_ADMINBY              9
V_FUNDBY               4
SPLTTYPE            5895
FORM_VERS              2
TODAYS_DATE          104
BIRTH_DEFECT           1
OFC_VISIT              1
ER_ED_VISIT            1
SYMPTOM1            2301
SYMPTOMVERSION1        2
SYMPTOM2            2220
SYMPTOMVERSION2        2
SYMPTOM3            2017
SYMPTOMVERSION3        2
SYMPTOM4            1833
SYMPTOMVERSION4        2
SYMPTOM5            1617
SYMPTOMVERSION5        2
VAX_TYPE              40


In [21]:
vaers_clean.dtypes

VAERS_ID             int64
RECVDATE            object
STATE               object
AGE_YRS            float64
CAGE_YR            float64
CAGE_MO            float64
SEX                 object
RPT_DATE            object
SYMPTOM_TEXT        object
DIED                object
DATEDIED            object
L_THREAT            object
ER_VISIT            object
HOSPITAL            object
HOSPDAYS           float64
X_STAY              object
DISABLE             object
RECOVD              object
VAX_DATE            object
ONSET_DATE          object
NUMDAYS            float64
V_ADMINBY           object
V_FUNDBY            object
SPLTTYPE            object
FORM_VERS            int64
TODAYS_DATE         object
BIRTH_DEFECT        object
OFC_VISIT           object
ER_ED_VISIT         object
SYMPTOM1            object
SYMPTOMVERSION1    float64
SYMPTOM2            object
SYMPTOMVERSION2    float64
SYMPTOM3            object
SYMPTOMVERSION3    float64
SYMPTOM4            object
SYMPTOMVERSION4    float64
S

In [22]:
vaers_clean

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,916600,01/01/2021,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,,...,,,,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,916601,01/01/2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,,...,,,,COVID19,MODERNA,025L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
2,916602,01/01/2021,WA,23.0,23.0,,F,,"About 15 minutes after receiving the vaccine, ...",,...,23.1,,,COVID19,PFIZER\BIONTECH,EL1284,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,916603,01/01/2021,WA,58.0,58.0,,F,,"extreme fatigue, dizziness,. could not lift my...",,...,,,,COVID19,MODERNA,unknown,UNK,,,COVID19 (COVID19 (MODERNA))
4,916604,01/01/2021,TX,47.0,47.0,,F,,"Injection site swelling, redness, warm to the ...",,...,23.1,,,COVID19,MODERNA,,1,IM,LA,COVID19 (COVID19 (MODERNA))
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40256,1092595,03/11/2021,TX,70.0,70.0,,F,,A high risk employee received 1st vaccine dose...,Y,...,,,,COVID19,MODERNA,025J20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
40257,1092651,03/11/2021,CA,77.0,77.0,,M,,DEATH,Y,...,,,,COVID19,MODERNA,0124214,1,IM,LA,COVID19 (COVID19 (MODERNA))
40258,1092653,03/11/2021,CA,88.0,88.0,,F,,Patient received the vaccine on the evening of...,Y,...,,,,COVID19,MODERNA,unknown,2,SYR,AR,COVID19 (COVID19 (MODERNA))
40259,1092737,03/11/2021,ME,69.0,69.0,,M,,"Sudden death. Alone at home, found on floor 4 ...",Y,...,,,,COVID19,MODERNA,,1,UN,AR,COVID19 (COVID19 (MODERNA))


In [23]:
#Search for columns with DIED in name to find the target column (y)
died_cols = [col for col in vaers_df.columns if 'DIED' in col]
died_cols

['DIED', 'DATEDIED']

# Logistic Regression

# Split and Scale Data

In [31]:
vaers_clean['DIED'].fillna(value=0, inplace=True)
vaers_clean['DIED'].replace(to_replace='Y', value=1)

0        0
1        0
2        0
3        0
4        0
        ..
40256    1
40257    1
40258    1
40259    1
40260    1
Name: DIED, Length: 40261, dtype: int64

# Logistic Regression

In [32]:
# Split data into features and targets
y = vaers_clean['DIED']
X = vaers_clean.drop(columns = 'DIED')

In [33]:
# Counting values on y
y.value_counts()

0    37686
Y     2575
Name: DIED, dtype: int64

In [None]:
# Split into test and training
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Resampling the data

In [None]:
# Counting values in Y_train


In [None]:
#Create a Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=78)
classifier

In [None]:
# Train the data
classifier.fit(X_train, y_train)

In [None]:
# Evaluate
y_pred = classifier.predict(X_test)
print(f' Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}')