In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix,\
    precision_score, recall_score, accuracy_score, f1_score, log_loss,\
    roc_curve, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Read the feature table and the label table; the first CSV column of each file
# becomes the row index.
X_init, y_init = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'Data/training_set_features.csv',
        'Data/training_set_labels.csv',
    )
)

In [None]:
# Print column dtypes and non-null counts for the raw feature frame.
X_init.info()

In [None]:
# Fraction of missing values per feature, most incomplete feature first.
# The mean of a boolean mask is the same ratio as count-of-True / number-of-rows.
X_init.isna().mean().sort_values(ascending=False)

In [None]:
# Features excluded from modelling. NOTE(review): presumably chosen because of their
# missing-value share -- confirm against the missing-value summary.
dropped_features = ['employment_occupation', 'employment_industry', 'health_insurance']
# drop() returns a new frame, so X_init stays available unchanged.
X_drop = X_init.drop(columns=dropped_features)

In [None]:
# Pearson correlation of each numeric feature with the H1N1 label, strongest positive first.
# X_drop still contains string-valued columns; selecting the numeric ones explicitly
# keeps this cell working on pandas versions where corrwith no longer discards
# non-numeric columns by itself.
X_drop.select_dtypes(include='number').corrwith(y_init['h1n1_vaccine']).sort_values(ascending=False)

In [None]:
# Pearson correlation of each numeric feature with the seasonal-vaccine label,
# strongest positive first.
# String-valued columns are filtered out explicitly so the call does not depend on
# corrwith silently discarding them (newer pandas versions do not).
X_drop.select_dtypes(include='number').corrwith(y_init['seasonal_vaccine']).sort_values(ascending=False)

In [None]:
# Print column dtypes and non-null counts for the label frame.
y_init.info()

In [None]:
# Class balance of the H1N1 label (row count per distinct value).
y_init['h1n1_vaccine'].value_counts()

In [None]:
# Class balance of the seasonal-vaccine label (row count per distinct value).
y_init['seasonal_vaccine'].value_counts()

In [None]:
# Discard the H1N1 label; the result is still a DataFrame holding the remaining
# label column(s), and y_init itself is not modified.
y_drop = y_init.drop(columns='h1n1_vaccine')

In [None]:
# Display the target frame that remains after removing the H1N1 label.
y_drop

In [None]:
# Pearson correlation of each numeric feature with the modelling target, strongest
# positive first.
# Numeric columns are selected explicitly because X_drop also holds string-valued
# columns, which newer pandas versions no longer discard inside corrwith.
X_drop.select_dtypes(include='number').corrwith(y_drop['seasonal_vaccine']).sort_values(ascending=False)

In [None]:
# Hold out a test partition. The split proportion is train_test_split's default;
# the fixed seed makes the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_drop,
    y_drop,
    random_state=50,
)

In [None]:
# Display the training features produced by the split.
X_train

In [None]:
# Partition the training features by dtype; the column names of these two frames
# decide which preprocessing branch each feature goes through.
# np.number is the documented selector for all numeric columns. The previous selector
# (the pd.Float64Dtype extension-dtype class) is not a documented way to pick
# NumPy-backed float columns and would leave out any integer-typed feature.
X_train_nums = X_train.select_dtypes(include=np.number)
X_train_cat = X_train.select_dtypes(include=object)

In [None]:
# Number of missing values per numeric training feature, largest count first.
X_train_nums.isna().sum().sort_values(ascending=False)

In [None]:
# Numeric branch: fill missing values with the column median, then rescale with
# MinMaxScaler.
# TODO: describe why the median was chosen as the fill value.
num_steps = [
    ('impute_nums', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
numerical_pipeline = Pipeline(num_steps)

# Categorical branch: fill missing values with the most frequent category, then map
# each category to an ordinal code.
cat_steps = [
    ('impute_cat', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder()),
]
categorical_pipeline = Pipeline(cat_steps)

# Route columns to their branch by name. The transformed output lists the numerical
# block first and the categorical block second.
trans = ColumnTransformer([
    ('numerical', numerical_pipeline, X_train_nums.columns),
    ('categorical', categorical_pipeline, X_train_cat.columns),
])

In [None]:
# Baseline model: shared preprocessing followed by a depth-2 decision tree with a
# fixed seed.
baseline_tree = DecisionTreeClassifier(random_state=50, max_depth=2)
simple_model_pipe = Pipeline([
    ('trans', trans),
    ('tree', baseline_tree),
])

In [None]:
# Fit the preprocessing steps and the tree on the training partition only.
simple_model_pipe.fit(X_train, y_train)

In [None]:
# Score of the fitted pipeline on the data it was trained on.
simple_model_pipe.score(X_train, y_train)

In [None]:
# Score of the fitted pipeline on the held-out partition.
simple_model_pipe.score(X_test, y_test)

In [None]:
# Hard class predictions feed the threshold-based metrics (F1, precision, recall).
simple_preds = simple_model_pipe.predict(X_test)
# ROC-AUC measures how well the model ranks examples, so it needs a continuous score:
# the predicted probability of the positive class (second predict_proba column).
# Feeding it the 0/1 predictions would reduce the curve to a single operating point.
simple_probs = simple_model_pipe.predict_proba(X_test)[:, 1]
print(f'This is our f1 score: {f1_score(y_test, simple_preds)}')
print(f'This is our roc-auc score: {roc_auc_score(y_test, simple_probs)}')
print(f'This is our precision score: {precision_score(y_test, simple_preds)}')
print(f'This is our recall score: {recall_score(y_test, simple_preds)}')

In [None]:
simple_importance = simple_model_pipe.named_steps['tree'].feature_importances_
# The tree sees the ColumnTransformer output, which lists the numerical block first and
# the categorical block second. Labelling with X_train.columns would only be correct
# if the frame happened to be laid out in exactly that order.
# NOTE(review): assumes each branch emits one output column per input column (an
# imputer can drop a column that is entirely missing) -- confirm for this data.
transformed_columns = list(X_train_nums.columns) + list(X_train_cat.columns)
pd.Series(simple_importance, index=transformed_columns).sort_values(ascending=False)

In [None]:
# The logistic model is trained on two hand-picked features only. The shared `trans`
# selects every training column by name, so it cannot be applied to a two-column frame;
# this pipeline therefore gets its own transformer restricted to those features.
# A separate transformer object also means fitting this pipeline does not refit the
# preprocessing step that lives inside `simple_model_pipe`.
log_features = ['opinion_seas_vacc_effective', 'doctor_recc_seasonal']
# Send each selected feature through the same branch it uses in `trans`.
log_num_features = [col for col in log_features if col in X_train_nums.columns]
log_cat_features = [col for col in log_features if col in X_train_cat.columns]
log_trans = ColumnTransformer(transformers=[
    ('numerical', numerical_pipeline, log_num_features),
    ('categorical', categorical_pipeline, log_cat_features)
])

log_model_pipe = Pipeline(steps=[
    ('trans', log_trans),
    # The step keeps its original name 'tree' (although it holds a logistic regression)
    # so any named_steps['tree'] lookup keeps working.
    ('tree', LogisticRegression(random_state=50))
    ])

In [None]:
# List the available feature names.
X_train.columns

In [None]:
# Fit the logistic pipeline on the two selected features.
# y_train is a one-column DataFrame; passing the label column as a 1-D Series avoids
# the column-vector conversion warning that LogisticRegression raises for 2-D targets.
log_model_pipe.fit(
    X_train[['opinion_seas_vacc_effective', 'doctor_recc_seasonal']],
    y_train['seasonal_vaccine'],
)