# Flu Vaccination Modelling

### EDA

In [1]:
# Shell escape: list the files in the notebook's current working directory
!ls

Brooke EDA.ipynb      Project3_Brooke.ipynb


In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats as stats

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV,\
cross_val_score, RandomizedSearchCV

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [5]:
# Install category_encoders for frequency binning
!conda install -c conda-forge category_encoders

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.10.1
  latest version: 4.10.3

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /Users/brookesmyth/opt/anaconda3/envs/learn-env

  added / updated specs:
    - category_encoders


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2021.10.8  |       h033912b_0         139 KB  conda-forge
    category_encoders-2.3.0    |     pyhd8ed1ab_0          57 KB  conda-forge
    certifi-2021.10.8          |   py38h50d1736_0         145 KB  conda-forge
    openssl-1.1.1l             |       h0d85af4_0         1.9 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.2 MB

The following NEW packages will be INSTALLED:

  ca

In [6]:
# Now import the category encoder tool
import category_encoders as ce

ModuleNotFoundError: No module named 'category_encoders'

In [None]:
# Read the training-set feature table and preview its first five rows
feat = pd.read_csv(filepath_or_buffer='../../Data/training_set_features.csv')
feat.head(n=5)

In [None]:
# Column labels of the feature table
feat.columns

In [None]:
# (rows, columns) of the feature table
feat.shape

In [None]:
# Summary statistics of the feature table (pandas defaults)
feat.describe()

In [None]:
# Read the training-set label table (the targets) and preview its first five rows
tar = pd.read_csv(filepath_or_buffer='../../Data/training_set_labels.csv')
tar.head(n=5)

In [None]:
# Print the value counts of every feature column (respondent_id excluded),
# both for general information and to see which columns need the frequency encoder
for c in feat.columns.drop('respondent_id'):
    print(feat[c].value_counts())
# hhs_geo_region, employment_industry, and employment_occupation need the frequency encoder

In [None]:
# Feature column labels with respondent_id excluded
feat.columns.drop('respondent_id')

In [None]:
# Per-column dtype and non-null counts for the feature table
feat.info()

In [None]:
# Set up lists for columns which will need different imputations in pipeline:
#   numeric - columns whose dtype is float64 or int64
#   cat     - remaining columns with fewer than 10 distinct values
#   freq    - every other column
numeric = []
cat = []
freq = []

for i in feat.columns.drop('respondent_id'):
    if feat[i].dtype in ['float64', 'int64']:
        numeric.append(i)
    elif feat[i].nunique() < 10:
        # Append the column under inspection (`i`); a loop variable left over
        # from an earlier cell must not be used here.
        cat.append(i)
    else:
        freq.append(i)

In [None]:
# Preprocessing branches for the pipeline, one small Pipeline each

# Median-impute missing values
num_transformer = Pipeline([
    ('num_imputer', SimpleImputer(strategy='median')),
])

# Replace missing values with the label 'Unknown', then one-hot encode;
# categories unseen during fit are ignored at transform time
ohe_transformer = Pipeline([
    ('ohe_imputer', SimpleImputer(fill_value='Unknown', strategy='constant')),
    ('oh_encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Normalized count (frequency) encoding, then fill whatever is still missing with 0
freq_transformer = Pipeline([
    ('freq_encoder', ce.count.CountEncoder(min_group_size=0.05, normalize=True)),
    ('freq_imputer', SimpleImputer(fill_value=0, strategy='constant')),
])

In [None]:
# Group everything together with ColumnTransformer.
# Each branch receives the matching column list built earlier in the notebook
# (`numeric`, `cat`, `freq`); columns not named in any list are dropped,
# which is ColumnTransformer's default `remainder` behaviour.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numeric),
        ('ohe', ohe_transformer, cat),
        ('freq', freq_transformer, freq)
    ])
