In [34]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
import tensorflow as tf
import re


In [35]:
# Import cleaned VAERS dataset
vaers_df = pd.read_csv('vaers_data_cleaned.csv', low_memory=False)
print(vaers_df.shape)
vaers_df.head(5)

(40247, 49)


Unnamed: 0,vaers_id,recvdate,state_,age_yrs,cage_yr,sex,symptom_text,died,datedied,l_threat,...,symptomversion4,symptom5,symptomversion5,vax_type,vax_manu,vax_lot,vax_dose_series,vax_route,vax_site,vax_name
0,1000000,02/04/2021,CA,,,M,tested positive; tested positive; This is a sp...,,,,...,,,,COVID19,PFIZER\BIONTECH,,1,,,COVID19 (COVID19 (PFIZER-BIONTECH))
1,1000001,02/04/2021,WI,,,F,covid symptoms the 28th and tested positive; c...,,,,...,,,,COVID19,PFIZER\BIONTECH,,1,,,COVID19 (COVID19 (PFIZER-BIONTECH))
2,1000003,02/04/2021,PA,29.0,,F,rash and hives all over body; rash and hives a...,,,,...,,,,COVID19,PFIZER\BIONTECH,EJ1685,1,OT,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,1000006,02/04/2021,,,,F,tested positive for covid; tested positive for...,,,,...,,,,COVID19,PFIZER\BIONTECH,,1,,,COVID19 (COVID19 (PFIZER-BIONTECH))
4,1000007,02/04/2021,WI,53.0,,M,blood sugar has been out of control ranging fr...,,,,...,,,,COVID19,PFIZER\BIONTECH,,1,,RA,COVID19 (COVID19 (PFIZER-BIONTECH))


In [36]:
# Drop duplicates in the dataframe
vaers_df.drop_duplicates(inplace=True)
print(vaers_df.shape)
vaers_df.head(5)

(40209, 49)


Unnamed: 0,vaers_id,recvdate,state_,age_yrs,cage_yr,sex,symptom_text,died,datedied,l_threat,...,symptomversion4,symptom5,symptomversion5,vax_type,vax_manu,vax_lot,vax_dose_series,vax_route,vax_site,vax_name
0,1000000,02/04/2021,CA,,,M,tested positive; tested positive; This is a sp...,,,,...,,,,COVID19,PFIZER\BIONTECH,,1,,,COVID19 (COVID19 (PFIZER-BIONTECH))
1,1000001,02/04/2021,WI,,,F,covid symptoms the 28th and tested positive; c...,,,,...,,,,COVID19,PFIZER\BIONTECH,,1,,,COVID19 (COVID19 (PFIZER-BIONTECH))
2,1000003,02/04/2021,PA,29.0,,F,rash and hives all over body; rash and hives a...,,,,...,,,,COVID19,PFIZER\BIONTECH,EJ1685,1,OT,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,1000006,02/04/2021,,,,F,tested positive for covid; tested positive for...,,,,...,,,,COVID19,PFIZER\BIONTECH,,1,,,COVID19 (COVID19 (PFIZER-BIONTECH))
4,1000007,02/04/2021,WI,53.0,,M,blood sugar has been out of control ranging fr...,,,,...,,,,COVID19,PFIZER\BIONTECH,,1,,RA,COVID19 (COVID19 (PFIZER-BIONTECH))


In [37]:
#Keep only the age, gender, died, and vax name columns for unsupervised model then remove null values and convert to numeric. 
vaers_df_UML = vaers_df[['vaers_id','age_yrs', 'sex', 'died', 'vax_name' ]]
vaers_df_UML.head(10)


Unnamed: 0,vaers_id,age_yrs,sex,died,vax_name
0,1000000,,M,,COVID19 (COVID19 (PFIZER-BIONTECH))
1,1000001,,F,,COVID19 (COVID19 (PFIZER-BIONTECH))
2,1000003,29.0,F,,COVID19 (COVID19 (PFIZER-BIONTECH))
3,1000006,,F,,COVID19 (COVID19 (PFIZER-BIONTECH))
4,1000007,53.0,M,,COVID19 (COVID19 (PFIZER-BIONTECH))
5,1000012,,F,,COVID19 (COVID19 (PFIZER-BIONTECH))
6,1000013,,M,,COVID19 (COVID19 (PFIZER-BIONTECH))
7,1000015,,F,,COVID19 (COVID19 (PFIZER-BIONTECH))
8,1000021,62.0,F,,COVID19 (COVID19 (PFIZER-BIONTECH))
9,1000031,,F,,COVID19 (COVID19 (PFIZER-BIONTECH))


In [38]:
# Check vaers_df column types
vaers_df_UML.dtypes

vaers_id      int64
age_yrs     float64
sex          object
died         object
vax_name     object
dtype: object

In [39]:
vaers_df_UML['died'].fillna(value='N', inplace=True)
vaers_df_UML.died

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


0        N
1        N
2        N
3        N
4        N
5        N
6        N
7        N
8        N
9        N
10       N
11       N
12       N
13       N
14       N
15       N
16       N
17       N
18       N
19       N
20       N
21       N
22       N
23       N
24       N
25       N
26       N
27       N
28       N
29       N
30       N
31       N
32       N
33       N
34       N
35       N
36       N
37       N
38       N
39       N
40       N
41       N
42       N
43       N
44       N
45       N
46       N
47       N
48       N
49       N
50       N
51       N
52       N
53       N
54       N
55       N
56       N
57       N
58       N
59       N
60       N
61       N
62       N
63       N
64       N
65       N
66       N
67       N
68       N
69       N
70       N
71       N
72       N
73       N
74       N
75       Y
76       N
77       Y
78       N
79       N
80       Y
81       N
82       N
83       N
84       N
85       N
86       N
87       N
88       N
89       N
90       N

In [41]:
vaers_UML = vaers_df_UML.dropna()
vaers_UML.head(10)

Unnamed: 0,vaers_id,age_yrs,sex,died,vax_name
2,1000003,29.0,F,N,COVID19 (COVID19 (PFIZER-BIONTECH))
4,1000007,53.0,M,N,COVID19 (COVID19 (PFIZER-BIONTECH))
8,1000021,62.0,F,N,COVID19 (COVID19 (PFIZER-BIONTECH))
10,1000036,38.0,M,N,COVID19 (COVID19 (PFIZER-BIONTECH))
13,1000046,86.0,M,N,COVID19 (COVID19 (PFIZER-BIONTECH))
14,1000051,70.0,F,N,COVID19 (COVID19 (PFIZER-BIONTECH))
15,1000051,70.0,F,N,COVID19 (COVID19 (PFIZER-BIONTECH))
16,1000051,70.0,F,N,COVID19 (COVID19 (PFIZER-BIONTECH))
19,1000074,80.0,F,N,COVID19 (COVID19 (PFIZER-BIONTECH))
20,1000074,80.0,F,N,COVID19 (COVID19 (PFIZER-BIONTECH))


In [42]:
vaers_UML.sex.value_counts()

F    26778
M     9902
U      176
Name: sex, dtype: int64

In [43]:

vaers_UML.died.value_counts()

N    34430
Y     2426
Name: died, dtype: int64

In [44]:
pd.options.display.max_rows = None
vaers_UML.vax_name.value_counts()

COVID19 (COVID19 (MODERNA))                              18546
COVID19 (COVID19 (PFIZER-BIONTECH))                      17460
COVID19 (COVID19 (JANSSEN))                                401
VACCINE NOT SPECIFIED (NO BRAND NAME)                       95
ZOSTER (SHINGRIX)                                           62
COVID19 (COVID19 (UNKNOWN))                                 30
INFLUENZA (SEASONAL) (FLUZONE QUADRIVALENT)                 25
PNEUMO (PNEUMOVAX)                                          23
INFLUENZA (SEASONAL) (NO BRAND NAME)                        21
VACCINE NOT SPECIFIED (OTHER)                               15
INFLUENZA (SEASONAL) (FLUZONE HIGH-DOSE QUADRIVALENT)       15
MEASLES + MUMPS + RUBELLA (MMR II)                          13
ZOSTER LIVE (ZOSTAVAX)                                      12
INFLUENZA (SEASONAL) (FLUBLOK QUADRIVALENT)                 11
VARICELLA (VARIVAX)                                          8
INFLUENZA (SEASONAL) (FLUAD QUADRIVALENT)              

In [45]:
#Export CSV for dashboard purposes
vaers_UML.to_csv('VAERS_UML_cleaned.csv')

In [None]:
#Remove all values in vax_name that do not include 'COVID19' using REGEX, in order to select only the top three unique values in 
#vax_name. Therefore the only three vaccines will be three covid19 vaccines; moderna, pfizer-biontech, and janssen. 
regex = r' \b(\w*COVID19\w*)\b'

In [None]:
#Use REGEX to filter for only COVID19 
vax_names = vaers_UML['vax_name']
vax_names.str.contains(regex, flags=re.IGNORECASE).sum()

In [None]:
#Convert sex column from object to boolean numerical values for unsupervised model
def change_sex(sex):
    if sex == "M":
        return 1
    if sex == "F"
        return 2
    else: 
        return 3
    
vaers_UML ["sex"] = vaers_UML["sex"].apply(change_sex)
vaers_UML.head(10)

In [None]:
#Convert died column from object to boolean numerical values for unsupervised model
def change_died(died):
    if died == "N":
        return 1
    else: 
        return 2
    
vaers_UML ["died"] = vaers_UML["died"].apply(change_died)
vaers_UML.head(10)

In [None]:
#Convert vax_name column from object to numerical values for unsupervised model, representing each vaccine (Moderna, Pfizer, Janssen)

In [None]:
# Conduct unsupervised machine learning model
