In [None]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import pandas as pd
import tensorflow as tf


# Import and clean df

In [None]:
# Load the cleaned VAERS export and preview its first rows
vaers_df = pd.read_csv(
    filepath_or_buffer='VAERSDATA.csv',
    low_memory=False,
)
vaers_df.head(n=5)

In [None]:
# Check vaers_df column types: display the dtype pandas assigned to each column
vaers_df.dtypes

In [None]:
# Convert the date columns (read as object dtype) to datetime -- currently disabled
#vaers_df['RECVDATE'] = pd.to_datetime(vaers_df['RECVDATE'])
#vaers_df['RPT_DATE'] = pd.to_datetime(vaers_df['RPT_DATE'])
#vaers_df['DATEDIED'] = pd.to_datetime(vaers_df['DATEDIED'])
#vaers_df['VAX_DATE'] = pd.to_datetime(vaers_df['VAX_DATE'])
#vaers_df['ONSET_DATE'] = pd.to_datetime(vaers_df['ONSET_DATE'])
#vaers_df['TODAYS_DATE'] = pd.to_datetime(vaers_df['TODAYS_DATE'])

#vaers_df.dtypes

In [None]:
# Collect the names of all object-dtype columns; these are treated as categorical
vaers_cat = [
    column
    for column, dtype in vaers_df.dtypes.items()
    if dtype == 'object'
]
vaers_cat

In [None]:
# Number of distinct values in each categorical column
vaers_df.loc[:, vaers_cat].nunique()

In [None]:
# Frequency table for the DIED column
counts = vaers_df['DIED'].value_counts()
counts

In [None]:
# Frequency table for the LAB_DATA column
counts = vaers_df['LAB_DATA'].value_counts()
counts

In [None]:
# Frequency table for the OTHER_MEDS column
counts = vaers_df['OTHER_MEDS'].value_counts()
counts

In [None]:
# Frequency table for the CUR_ILL column
counts = vaers_df['CUR_ILL'].value_counts()
counts

In [None]:
# Frequency table for the HISTORY column
counts = vaers_df['HISTORY'].value_counts()
counts

In [None]:
# Frequency table for the PRIOR_VAX column
counts = vaers_df['PRIOR_VAX'].value_counts()
counts

In [None]:
# Frequency table for the SPLTTYPE column
counts = vaers_df['SPLTTYPE'].value_counts()
counts

In [None]:
# Frequency table for the ALLERGIES column
counts = vaers_df['ALLERGIES'].value_counts()
counts

In [None]:
# Frequency table for the SYMPTOM1 column
counts = vaers_df['SYMPTOM1'].value_counts()
counts

In [None]:
# Frequency table for the SYMPTOM2 column
counts = vaers_df['SYMPTOM2'].value_counts()
counts

In [None]:
# Frequency table for the VAX_LOT column
counts = vaers_df['VAX_LOT'].value_counts()
counts

In [None]:
# Remove the free-text columns (LAB_DATA, OTHER_MEDS, CUR_ILL, HISTORY, PRIOR_VAX,
# ALLERGIES): without NLP they cannot be one-hot encoded in a useful way
vaers_clean = vaers_df.drop(
    labels=['LAB_DATA', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'ALLERGIES'],
    axis='columns',
)

In [None]:
# Re-count the unique values in each column of the new cleaned frame (vaers_clean)
vaers_clean.nunique()

# One Hot Encoder

In [None]:
# Replace missing values in the categorical (object-dtype) columns with the
# placeholder category 'None' so the encoder sees an explicit level.
# Only object columns are filled: writing the string 'None' into a numeric
# column would turn it into object dtype and break numeric scaling later on.
# Missing values in numeric columns are left as NaN.
# A new frame is assigned instead of using inplace=True, so re-running the
# cell is safe.
vaers_clean = vaers_clean.fillna(
    {
        column: 'None'
        for column in vaers_clean.select_dtypes(include='object').columns
    }
)

In [None]:
# Recreate the categorical column list for vaers_clean.
# The mask is built from vaers_clean's own dtypes: masking with vaers_df.dtypes
# relied on implicit index alignment between two different frames and stops
# working once vaers_df is overwritten with the encoded frame further down.
vaers_cat = vaers_clean.dtypes[vaers_clean.dtypes == 'object'].index.tolist()
vaers_cat

In [None]:
# OneHotEncoder instance producing a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the old
# keyword, so try the new name first and fall back for older installations.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit and transform the categorical columns. The encoded frame reuses
# vaers_clean's index so the index-based merge that follows lines rows up
# even if vaers_clean does not carry a default RangeIndex.
vaers_enc_df = pd.DataFrame(
    enc.fit_transform(vaers_clean[vaers_cat]),
    index=vaers_clean.index,
)

# Add the encoded variable names to the DF.
# get_feature_names was replaced by get_feature_names_out in newer scikit-learn;
# use whichever this installation provides.
if hasattr(enc, 'get_feature_names_out'):
    vaers_enc_df.columns = enc.get_feature_names_out(vaers_cat)
else:
    vaers_enc_df.columns = enc.get_feature_names(vaers_cat)
vaers_enc_df.head()

In [None]:
# Attach the one-hot columns to vaers_clean (matched on the row index), then
# remove the original categorical columns they replace.
# `columns=` is used because the positional axis argument of DataFrame.drop
# (`drop(labels, 1)`) is no longer accepted by recent pandas releases.
vaers_df = (
    vaers_clean
    .merge(vaers_enc_df, left_index=True, right_index=True)
    .drop(columns=vaers_cat)
)


In [None]:
# Drop the 'Unnamed: 0' column (per the original note, a copy of the row index).
# errors='ignore' keeps the cell re-runnable: a second run no longer raises
# KeyError once the column is gone.
# DIED_None is NOT dropped here; it is excluded when the feature matrix is built.
vaers_df = vaers_df.drop(columns='Unnamed: 0', errors='ignore')


# Split Data

In [None]:
# Split data into features and target.
# Target: the one-hot indicator DIED_Y.
y = vaers_df['DIED_Y'].values

# Features: every remaining column except BOTH one-hot columns derived from DIED.
# Leaving DIED_Y inside X would hand the model the answer (target leakage) and
# make any accuracy figure meaningless; DIED_None is its complement.
# NOTE(review): one-hot columns derived from DATEDIED, if present in the data,
# would also reveal the outcome -- confirm whether they should be excluded too.
X = vaers_df.drop(columns=['DIED_Y', 'DIED_None']).values

In [None]:
# Inspect the feature matrix
X

In [None]:
# Partition features and target into training and test subsets (fixed seed)
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y, random_state=78)

In [None]:
# Create the scaler and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to the training and test features, in that order
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)