In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
import tensorflow as tf


# Import and clean df

In [2]:
# Import cleaned VAERS dataset
vaers_df = pd.read_csv('VAERSDATA.csv', low_memory=False)
vaers_df.head()

Unnamed: 0.1,Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,...,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,0,916600,01/01/2021,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,...,,,,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,1,916601,01/01/2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,...,,,,COVID19,MODERNA,025L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
2,2,916602,01/01/2021,WA,23.0,23.0,,F,,"About 15 minutes after receiving the vaccine, ...",...,23.1,,,COVID19,PFIZER\BIONTECH,EL1284,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,3,916603,01/01/2021,WA,58.0,58.0,,F,,"extreme fatigue, dizziness,. could not lift my...",...,,,,COVID19,MODERNA,unknown,UNK,,,COVID19 (COVID19 (MODERNA))
4,4,916604,01/01/2021,TX,47.0,47.0,,F,,"Injection site swelling, redness, warm to the ...",...,23.1,,,COVID19,MODERNA,,1,IM,LA,COVID19 (COVID19 (MODERNA))


In [3]:
# Check vaers_df column types
vaers_df.dtypes

Unnamed: 0           int64
VAERS_ID             int64
RECVDATE            object
STATE               object
AGE_YRS            float64
CAGE_YR            float64
CAGE_MO            float64
SEX                 object
RPT_DATE            object
SYMPTOM_TEXT        object
DIED                object
DATEDIED            object
L_THREAT            object
ER_VISIT            object
HOSPITAL            object
HOSPDAYS           float64
X_STAY              object
DISABLE             object
RECOVD              object
VAX_DATE            object
ONSET_DATE          object
NUMDAYS            float64
LAB_DATA            object
V_ADMINBY           object
V_FUNDBY            object
OTHER_MEDS          object
CUR_ILL             object
HISTORY             object
PRIOR_VAX           object
SPLTTYPE            object
FORM_VERS            int64
TODAYS_DATE         object
BIRTH_DEFECT        object
OFC_VISIT           object
ER_ED_VISIT         object
ALLERGIES           object
SYMPTOM1            object
S

In [4]:
# change dates(objects)columns to datetime
#vaers_df['RECVDATE'] = pd.to_datetime(vaers_df['RECVDATE'])
#vaers_df['RPT_DATE'] = pd.to_datetime(vaers_df['RPT_DATE'])
#vaers_df['DATEDIED'] = pd.to_datetime(vaers_df['DATEDIED'])
#vaers_df['VAX_DATE'] = pd.to_datetime(vaers_df['VAX_DATE'])
#vaers_df['ONSET_DATE'] = pd.to_datetime(vaers_df['ONSET_DATE'])
#vaers_df['TODAYS_DATE'] = pd.to_datetime(vaers_df['TODAYS_DATE'])

#vaers_df.dtypes

In [5]:
# Categorical variable list
vaers_cat = vaers_df.dtypes[vaers_df.dtypes == 'object'].index.tolist()
vaers_cat

['RECVDATE',
 'STATE',
 'SEX',
 'RPT_DATE',
 'SYMPTOM_TEXT',
 'DIED',
 'DATEDIED',
 'L_THREAT',
 'ER_VISIT',
 'HOSPITAL',
 'X_STAY',
 'DISABLE',
 'RECOVD',
 'VAX_DATE',
 'ONSET_DATE',
 'LAB_DATA',
 'V_ADMINBY',
 'V_FUNDBY',
 'OTHER_MEDS',
 'CUR_ILL',
 'HISTORY',
 'PRIOR_VAX',
 'SPLTTYPE',
 'TODAYS_DATE',
 'BIRTH_DEFECT',
 'OFC_VISIT',
 'ER_ED_VISIT',
 'ALLERGIES',
 'SYMPTOM1',
 'SYMPTOM2',
 'SYMPTOM3',
 'SYMPTOM4',
 'SYMPTOM5',
 'VAX_TYPE',
 'VAX_MANU',
 'VAX_LOT',
 'VAX_DOSE_SERIES',
 'VAX_ROUTE',
 'VAX_SITE',
 'VAX_NAME']

In [6]:
# Unique values in each column
vaers_df[vaers_cat].nunique()

RECVDATE              70
STATE                 60
SEX                    3
RPT_DATE              20
SYMPTOM_TEXT       27195
DIED                   1
DATEDIED              95
L_THREAT               1
ER_VISIT               1
HOSPITAL               1
X_STAY                 1
DISABLE                1
RECOVD                 3
VAX_DATE             278
ONSET_DATE           206
LAB_DATA            7726
V_ADMINBY              9
V_FUNDBY               4
OTHER_MEDS         11816
CUR_ILL             3518
HISTORY             9515
PRIOR_VAX           1033
SPLTTYPE            5895
TODAYS_DATE          104
BIRTH_DEFECT           1
OFC_VISIT              1
ER_ED_VISIT            1
ALLERGIES           6245
SYMPTOM1            2301
SYMPTOM2            2220
SYMPTOM3            2017
SYMPTOM4            1833
SYMPTOM5            1617
VAX_TYPE              40
VAX_MANU              14
VAX_LOT             1887
VAX_DOSE_SERIES        8
VAX_ROUTE              7
VAX_SITE               9
VAX_NAME              63


In [7]:
# DIED value counts
counts = vaers_df.DIED.value_counts()
counts

Y    2575
Name: DIED, dtype: int64

In [8]:
# LAB DATA value counts
counts = vaers_df.LAB_DATA.value_counts()
counts

None                                                                                                                                                          2684
none                                                                                                                                                          1557
no                                                                                                                                                             222
NONE                                                                                                                                                           192
None.                                                                                                                                                          171
                                                                                                                                                              ... 
CBC, CMP, COVID-19 PCR

In [9]:
# OTHER MEDS value counts
counts = vaers_df.OTHER_MEDS.value_counts()
counts

None                                                                                1866
none                                                                                 875
unknown                                                                              360
Unknown                                                                              359
No                                                                                   131
                                                                                    ... 
Thyroxine Tabs - 100mcg: Lipitor - 10 mg Triamterine/HCTZ-50/25mg                      1
Colace, Miralax, Ocuvite                                                               1
SODIUM CHLORIDE                                                                        1
Singulair 10mg                                                                         1
Aspirin 81mg daily Levothyroxine 137mcg daily Tylenol 1000mg as needed for fever       1
Name: OTHER_MEDS, Len

In [10]:
# CUR_ILL value counts
counts = vaers_df.CUR_ILL.value_counts()
counts

None                                                6322
none                                                3455
No                                                   654
no                                                   522
NONE                                                 365
                                                    ... 
Mental Health Issue                                    1
High blood pressure/ Celiac Disease/Glaucoma OS/       1
weakness, protein-calroie malnutrition                 1
Pt denies                                              1
No other illness reported.                             1
Name: CUR_ILL, Length: 3518, dtype: int64

In [11]:
# HISTORY value counts
counts = vaers_df.HISTORY.value_counts()
counts

None                                                                                              3153
none                                                                                              1525
Comments: List of non-encoded Patient Relevant History: Patient Other Relevant History 1: None     676
Medical History/Concurrent Conditions: No adverse event (No reported medical history)              375
No                                                                                                 339
                                                                                                  ... 
Reactive Airway Disease, anxiety                                                                     1
Medical History/Concurrent Conditions: Stroke (Hospitalized)                                         1
asthma, CHF, Hypokalemia, hypothyroidism                                                             1
asthma, vasomotor symptoms and left lower lobe respiratory issues that ar

In [12]:
# PRIOR_VAX value counts
counts = vaers_df.PRIOR_VAX.value_counts()
counts

Flu                                                                                                  18
unknown                                                                                              13
Flu Vaccine                                                                                          10
COVID19 (Moderna) on 1/26/2021                                                                        9
Dizziness, mild chest pain, mid spine pain, back of neck pain, headaches, body aches,                 8
                                                                                                     ..
24 hour flu like symptoms after some yearly flu shots                                                 1
Influenza vaccination, 11/02/2020, sanofi, 33y, erythema, swelling, warmth                            1
Yes reddness and swelling at injection site,16 years,tetanus vaccine                                  1
FLU, SORE ARM 1-2 WEEK AFTER                                    

In [13]:
#SPLTTYPE value counts
counts = vaers_df.SPLTTYPE.value_counts()
counts

USMODERNATX, INC.MOD20210    1291
vsafe                         514
VSAFE                          73
USMODERNATX, INC.MOD20200      49
USGLAXOSMITHKLINEUS2020GS      29
                             ... 
USPFIZER INC2021229450          1
MS0012021                       1
USPFIZER INC2021130153          1
USPFIZER INC2021042223          1
USPFIZER INC2021145090          1
Name: SPLTTYPE, Length: 5895, dtype: int64

In [14]:
#ALLERGIES value counts
counts = vaers_df.ALLERGIES.value_counts()
counts

None                              3369
none                              1619
NKA                                804
NKDA                               769
No                                 401
                                  ... 
Penicilline, eggs and seafood        1
Pseudoephedrine Tetracycline         1
Bee Stings, Statins, Flecinide       1
Naldecon 1980s                       1
Ceftiaxone (Rocephin)                1
Name: ALLERGIES, Length: 6245, dtype: int64

In [15]:
#SYMPTOM1 value counts
counts = vaers_df.SYMPTOM1.value_counts()
counts

Chills                            2358
Arthralgia                        1516
Headache                          1197
Injection site erythema           1176
Dizziness                         1175
                                  ... 
Electrocardiogram QT prolonged       1
Adjusted calcium increased           1
Bladder injury                       1
Skin disorder                        1
Incarcerated hernia                  1
Name: SYMPTOM1, Length: 2301, dtype: int64

In [16]:
#SYMPTOM2 value counts
counts = vaers_df.SYMPTOM2.value_counts()
counts

Headache                        1683
Fatigue                         1301
Chills                          1118
Dizziness                        835
Pyrexia                          825
                                ... 
Coronary artery bypass             1
Carotid arteriosclerosis           1
Small intestinal obstruction       1
Embolism                           1
Bradykinesia                       1
Name: SYMPTOM2, Length: 2220, dtype: int64

In [17]:
#VAX_LOT value counts
counts = vaers_df.VAX_LOT.value_counts()
counts

039K20A            1375
011J20A            1260
EK9231             1132
EK5730             1072
EH9899             1047
                   ... 
O29K20A               1
#027L20A              1
037k2oa               1
026L20A EXP 6/2       1
039C20-2A             1
Name: VAX_LOT, Length: 1887, dtype: int64

In [18]:
# Drop LAB_DATA, OTHER MEDS, CUR_ILL, HISTORY, PRIOR_VAX, ALLERGIES - unless we use NLP, those columns can't be hot encoded
vaers_drop = vaers_df.drop(columns = ['LAB_DATA', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'ALLERGIES'])

In [19]:
# Drop Unamed: ) because it is the same as the index and VAERS_ID
vaers_clean = vaers_drop.drop(columns = ['Unnamed: 0', 'VAERS_ID'])

In [20]:
# re-count Unique values in each column in new cleaned df
vaers_clean.nunique()

RECVDATE              70
STATE                 60
AGE_YRS              118
CAGE_YR              105
CAGE_MO                7
SEX                    3
RPT_DATE              20
SYMPTOM_TEXT       27195
DIED                   1
DATEDIED              95
L_THREAT               1
ER_VISIT               1
HOSPITAL               1
HOSPDAYS              33
X_STAY                 1
DISABLE                1
RECOVD                 3
VAX_DATE             278
ONSET_DATE           206
NUMDAYS              115
V_ADMINBY              9
V_FUNDBY               4
SPLTTYPE            5895
FORM_VERS              2
TODAYS_DATE          104
BIRTH_DEFECT           1
OFC_VISIT              1
ER_ED_VISIT            1
SYMPTOM1            2301
SYMPTOMVERSION1        2
SYMPTOM2            2220
SYMPTOMVERSION2        2
SYMPTOM3            2017
SYMPTOMVERSION3        2
SYMPTOM4            1833
SYMPTOMVERSION4        2
SYMPTOM5            1617
SYMPTOMVERSION5        2
VAX_TYPE              40
VAX_MANU              14


In [21]:
#Search for columns with DIED in name to find the target column (y)
died_cols = [col for col in vaers_df.columns if 'DIED' in col]
died_cols

['DIED', 'DATEDIED']

In [22]:
# preapre target column (y)
vaers_clean['DIED'].fillna(value='N', inplace=True)
vaers_clean

Unnamed: 0,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,DATEDIED,...,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,01/01/2021,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,N,,...,,,,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,01/01/2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,N,,...,,,,COVID19,MODERNA,025L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
2,01/01/2021,WA,23.0,23.0,,F,,"About 15 minutes after receiving the vaccine, ...",N,,...,23.1,,,COVID19,PFIZER\BIONTECH,EL1284,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,01/01/2021,WA,58.0,58.0,,F,,"extreme fatigue, dizziness,. could not lift my...",N,,...,,,,COVID19,MODERNA,unknown,UNK,,,COVID19 (COVID19 (MODERNA))
4,01/01/2021,TX,47.0,47.0,,F,,"Injection site swelling, redness, warm to the ...",N,,...,23.1,,,COVID19,MODERNA,,1,IM,LA,COVID19 (COVID19 (MODERNA))
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40256,03/11/2021,TX,70.0,70.0,,F,,A high risk employee received 1st vaccine dose...,Y,,...,,,,COVID19,MODERNA,025J20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
40257,03/11/2021,CA,77.0,77.0,,M,,DEATH,Y,02/27/2021,...,,,,COVID19,MODERNA,0124214,1,IM,LA,COVID19 (COVID19 (MODERNA))
40258,03/11/2021,CA,88.0,88.0,,F,,Patient received the vaccine on the evening of...,Y,03/10/2021,...,,,,COVID19,MODERNA,unknown,2,SYR,AR,COVID19 (COVID19 (MODERNA))
40259,03/11/2021,ME,69.0,69.0,,M,,"Sudden death. Alone at home, found on floor 4 ...",Y,03/11/2021,...,,,,COVID19,MODERNA,,1,UN,AR,COVID19 (COVID19 (MODERNA))


# One Hot Encoder

In [23]:
# Use fillna to address NaN values
vaers_clean.fillna("none", inplace=True)
vaers_clean


Unnamed: 0,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,DATEDIED,...,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,01/01/2021,TX,33.0,33.0,none,F,none,Right side of epiglottis swelled up and hinder...,N,none,...,none,none,none,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,01/01/2021,CA,73.0,73.0,none,F,none,Approximately 30 min post vaccination administ...,N,none,...,none,none,none,COVID19,MODERNA,025L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
2,01/01/2021,WA,23.0,23.0,none,F,none,"About 15 minutes after receiving the vaccine, ...",N,none,...,23.1,none,none,COVID19,PFIZER\BIONTECH,EL1284,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,01/01/2021,WA,58.0,58.0,none,F,none,"extreme fatigue, dizziness,. could not lift my...",N,none,...,none,none,none,COVID19,MODERNA,unknown,UNK,none,none,COVID19 (COVID19 (MODERNA))
4,01/01/2021,TX,47.0,47.0,none,F,none,"Injection site swelling, redness, warm to the ...",N,none,...,23.1,none,none,COVID19,MODERNA,none,1,IM,LA,COVID19 (COVID19 (MODERNA))
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40256,03/11/2021,TX,70.0,70.0,none,F,none,A high risk employee received 1st vaccine dose...,Y,none,...,none,none,none,COVID19,MODERNA,025J20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
40257,03/11/2021,CA,77.0,77.0,none,M,none,DEATH,Y,02/27/2021,...,none,none,none,COVID19,MODERNA,0124214,1,IM,LA,COVID19 (COVID19 (MODERNA))
40258,03/11/2021,CA,88.0,88.0,none,F,none,Patient received the vaccine on the evening of...,Y,03/10/2021,...,none,none,none,COVID19,MODERNA,unknown,2,SYR,AR,COVID19 (COVID19 (MODERNA))
40259,03/11/2021,ME,69.0,69.0,none,M,none,"Sudden death. Alone at home, found on floor 4 ...",Y,03/11/2021,...,none,none,none,COVID19,MODERNA,none,1,UN,AR,COVID19 (COVID19 (MODERNA))


In [24]:
# Recriate the vares_clean categorical list
vaers_cat = vaers_clean.dtypes[vaers_df.dtypes == 'object'].index.tolist()
vaers_cat

['RECVDATE',
 'STATE',
 'SEX',
 'RPT_DATE',
 'SYMPTOM_TEXT',
 'DIED',
 'DATEDIED',
 'L_THREAT',
 'ER_VISIT',
 'HOSPITAL',
 'X_STAY',
 'DISABLE',
 'RECOVD',
 'VAX_DATE',
 'ONSET_DATE',
 'V_ADMINBY',
 'V_FUNDBY',
 'SPLTTYPE',
 'TODAYS_DATE',
 'BIRTH_DEFECT',
 'OFC_VISIT',
 'ER_ED_VISIT',
 'SYMPTOM1',
 'SYMPTOM2',
 'SYMPTOM3',
 'SYMPTOM4',
 'SYMPTOM5',
 'VAX_TYPE',
 'VAX_MANU',
 'VAX_LOT',
 'VAX_DOSE_SERIES',
 'VAX_ROUTE',
 'VAX_SITE',
 'VAX_NAME']

In [25]:
# OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

#Fit and transform the OneHotEncoder
vaers_enc_df = pd.DataFrame(enc.fit_transform(vaers_clean[vaers_cat]))

#Add the encoded variable names to the DF
vaers_enc_df.columns = enc.get_feature_names(vaers_cat)
vaers_enc_df.head()

Unnamed: 0,RECVDATE_01/01/2021,RECVDATE_01/02/2021,RECVDATE_01/03/2021,RECVDATE_01/04/2021,RECVDATE_01/05/2021,RECVDATE_01/06/2021,RECVDATE_01/07/2021,RECVDATE_01/08/2021,RECVDATE_01/09/2021,RECVDATE_01/10/2021,...,VAX_NAME_TDAP (NO BRAND NAME),VAX_NAME_TETANUS TOXOID (NO BRAND NAME),VAX_NAME_TYPHOID LIVE ORAL TY21A (VIVOTIF),VAX_NAME_VACCINE NOT SPECIFIED (NO BRAND NAME),VAX_NAME_VACCINE NOT SPECIFIED (OTHER),VAX_NAME_VARICELLA (VARIVAX),VAX_NAME_YELLOW FEVER (YF-VAX),VAX_NAME_ZOSTER (NO BRAND NAME),VAX_NAME_ZOSTER (SHINGRIX),VAX_NAME_ZOSTER LIVE (ZOSTAVAX)
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Merge with vaers_clean and drop original cat. list
vaers_df = vaers_clean.merge(vaers_enc_df, left_index=True, right_index=True)
vaers_df = vaers_df.drop(vaers_cat,1)

# Split and Scale Data

In [27]:
#Search for columns with DIED in name to find the target column (y)
died_cols = [col for col in vaers_df.columns if 'DIED' in col]
died_cols

['SYMPTOM_TEXT_DEATH Narrative: NO ADDITIONAL DETAIL PROVIDED OTHER THAN PATIENT DIED AT HOME',
 'SYMPTOM_TEXT_DIED',
 'SYMPTOM_TEXT_DIED WITHIN 5 DAYS OF RECEIEVING THE 2ND DOSE, EXPERIENCED GENERALIZED WEAKNESS.',
 'SYMPTOM_TEXT_MY WIFE DIED UNEXPECTEDLY 4 DAYS AFTER HER SECOND DOSAGE SHOT, ON FEBRUARY 17, 2021.  SHE HAD BEEN HEALTHY AND HAD A RECENT CHECKUP AT WHICH THE DOCTOR GAVE  HER A CLEAN BILL OF HEALTH.  SHE WAS ALERT AND IN GOOD SPIRITS JUST THE NIGHT BEFORE WHEN WE WATCHED A MOVIE TOGETHER.  I SAW NO INDICATION THAT SHE WAS FEELING POORLY OR OTHERWISE.  I FOUND HER IN BED, DECEASED, UPON COMING HOME FROM WORK THE NEXT DAY.',
 'SYMPTOM_TEXT_PATIENT DIED IN HIS SLEEP NIGHT AFTER ADMINISTRATION',
 'SYMPTOM_TEXT_PATIENT WAS ADMITTED TO ER FOR  ALTERED MENTAL STATUS  / UTI  SEPSIS  WITH SEPTIC SHOCK / COVID AND COVID   PNA   PATIENT WAS ADMITTED TO ICU AND DIED .   POA WISH TO WITHDRAWL EXTRME MEASURES',
 'SYMPTOM_TEXT_Patient reported to Emergency room on 01/23/2021 with compla

In [28]:
# Counting values in DIED_Y
counts = vaers_df.DIED_Y.value_counts()
counts

0.0    37686
1.0     2575
Name: DIED_Y, dtype: int64

In [29]:
# Split data into features and targets
y = vaers_df['DIED_Y']
X = vaers_df.drop(columns = ['DIED_Y', 'DIED_N'])

In [30]:
# Use fillna to address NaN values
X.replace(to_replace = "none", value=0, inplace=True)

In [31]:
# Counting values on y
y.value_counts()

0.0    37686
1.0     2575
Name: DIED_Y, dtype: int64

In [32]:
# Split into test and training
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)
y_train

17321    0.0
19487    0.0
34584    0.0
30537    0.0
25057    0.0
        ... 
22633    0.0
36904    0.0
38807    1.0
16854    0.0
39621    1.0
Name: DIED_Y, Length: 30195, dtype: float64

In [33]:
# Check y_train
y_train.value_counts()

0.0    28303
1.0     1892
Name: DIED_Y, dtype: int64

In [34]:
X_train.select_dtypes(include = ['float64'])

Unnamed: 0,AGE_YRS,CAGE_YR,CAGE_MO,HOSPDAYS,NUMDAYS,SYMPTOMVERSION1,SYMPTOMVERSION2,SYMPTOMVERSION3,SYMPTOMVERSION4,SYMPTOMVERSION5,...,VAX_NAME_TDAP (NO BRAND NAME),VAX_NAME_TETANUS TOXOID (NO BRAND NAME),VAX_NAME_TYPHOID LIVE ORAL TY21A (VIVOTIF),VAX_NAME_VACCINE NOT SPECIFIED (NO BRAND NAME),VAX_NAME_VACCINE NOT SPECIFIED (OTHER),VAX_NAME_VARICELLA (VARIVAX),VAX_NAME_YELLOW FEVER (YF-VAX),VAX_NAME_ZOSTER (NO BRAND NAME),VAX_NAME_ZOSTER (SHINGRIX),VAX_NAME_ZOSTER LIVE (ZOSTAVAX)
17321,32.0,32.0,0.0,2.0,1.0,23.1,23.1,23.1,23.1,23.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19487,52.0,52.0,0.0,3.0,15.0,23.1,23.1,23.1,23.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34584,83.0,83.0,0.0,3.0,1.0,23.1,23.1,23.1,23.1,23.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30537,75.0,75.0,0.0,0.0,0.0,23.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25057,80.0,80.0,0.0,0.0,0.0,23.1,23.1,23.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22633,88.0,88.0,0.0,1.0,10.0,23.1,23.1,23.1,23.1,23.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36904,78.0,78.0,0.0,18.0,4.0,24.0,24.0,24.0,24.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38807,81.0,81.0,0.0,0.0,1.0,24.0,24.0,24.0,24.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16854,43.0,42.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
y_train

17321    0.0
19487    0.0
34584    0.0
30537    0.0
25057    0.0
        ... 
22633    0.0
36904    0.0
38807    1.0
16854    0.0
39621    1.0
Name: DIED_Y, Length: 30195, dtype: float64

# Logistic Regression

In [36]:
#Create a Logistic Regression Model
classifier = LogisticRegression(solver='liblinear', max_iter=200, random_state=78)
classifier

LogisticRegression(max_iter=200, random_state=78, solver='liblinear')

In [37]:
# Train the data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=78, solver='liblinear')

In [38]:
# Evaluate
y_pred = classifier.predict(X_test)
print(f' Logistic regression model accuracy: {accuracy_score(y_test, y_pred):.3f}')

 Logistic regression model accuracy: 0.999


In [39]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[9383,    0],
       [  14,  669]])

In [40]:
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.9897510980966325

In [41]:
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      9383
         1.0       1.00      0.98      0.99       683

    accuracy                           1.00     10066
   macro avg       1.00      0.99      0.99     10066
weighted avg       1.00      1.00      1.00     10066



In [42]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      1.00      0.98      1.00      0.99      0.98      9383
        1.0       1.00      0.98      1.00      0.99      0.99      0.98       683

avg / total       1.00      1.00      0.98      1.00      0.99      0.98     10066



In [43]:
X_train.shape

(30195, 46001)

In [44]:
y_train.value_counts()

0.0    28303
1.0     1892
Name: DIED_Y, dtype: int64

# Resampling the training data

In [45]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0.0: 28303, 1.0: 28303})

In [46]:
# Logistic regression using random oversampled data
classover = LogisticRegression(solver='liblinear', max_iter=200, random_state=78)
classover.fit(X_resampled, y_resampled)
y_pred = classover.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[9381,    2],
       [   9,  674]])

In [47]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      1.00      0.99      1.00      0.99      0.99      9383
        1.0       1.00      0.99      1.00      0.99      0.99      0.99       683

avg / total       1.00      1.00      0.99      1.00      0.99      0.99     10066



## Evaluating the Logistic Regression model with k-Fold Cross-Validation


In [50]:
from numpy import mean
from numpy import std
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=1, shuffle=True)

In [54]:
# evaluate "classifier" model
scores = cross_val_score(classifier,X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

In [52]:
# evaluate oversampled "cassover" model
scores2 = cross_val_score(classover, X_resampled, y_resampled, scoring='accuracy', cv=cv, n_jobs=-1)

KeyboardInterrupt: 

In [53]:
# report performance
print('Original accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
print('Oversampled accuracy: %.3f (%.3f)' % (mean(scores2), std(scores2)))

Original accuracy: 0.999 (0.001)


NameError: name 'scores2' is not defined

# Neural Network

In [48]:
#NN MODEL
nn_model= tf.keras.models.Sequential()
nn_model.add(
    tf.keras.layers.Dense(units=16, 
                          activation='relu',
                         input_dim=8)
)
nn_model.add(
    tf.keras.layers.Dense(units=1, 
                          activation='sigmoid',
                         )
)

#Compile Sequential and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])

# Train
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

#Evaluate
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled, y_test, verbose=2)
print(f'Loss: {model_loss}, Accuracy: {model_accuracy}')

NameError: name 'X_train_scaled' is not defined