### Data Prep/Feature Eng
* use cross val for tuning and selecting hyperparameters
* use test set at very end on best model
* find the optimal model complexity to balance bias and variance

### TRY ALL MODELS - but have rationale on why you are trying models
* document the iterative process

### Deliverables
* Contract by Monday 2pm Mountain Time - covering communication frequency and tangible deadlines
* model completed by monday EOD
* proof of concept that your target and predictors are fit for machine learning classification
* decide as a team if target can be used as is or needs to be transformed
* FSM

### GROUP: Evan, Drew, Mustafa

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt


%matplotlib inline

In [None]:
# Predictor columns come from the features file.
df_var = pd.read_csv('data/training_set_features.csv')

# Only the seasonal-vaccine label is kept from the labels file as the target.
labels = pd.read_csv('data/training_set_labels.csv')
df_tar = labels['seasonal_vaccine']

In [None]:
# Drop based on relevance: the respondent id, the H1N1-specific survey items,
# and the HHS geo region are removed before modelling the seasonal vaccine.
df_var = df_var.drop(
    columns=[
        'respondent_id',
        'h1n1_concern',
        'h1n1_knowledge',
        'opinion_h1n1_vacc_effective',
        'opinion_h1n1_risk',
        'opinion_h1n1_sick_from_vacc',
        'doctor_recc_h1n1',
        'hhs_geo_region',
    ]
)

In [None]:
# Percentage of missing values in each remaining column.
n_rows = df_var.shape[0]
percent_nan = df_var.isna().sum() / n_rows * 100

# Display, rounded to whole percents, only the columns with more than 10% missing.
percent_nan.map(round).loc[percent_nan > 10]

In [None]:
# Drop based on NaN: these columns are removed rather than imputed
# (see the missing-value percentages computed above).
df_var = df_var.drop(
    columns=[
        'health_insurance',
        'income_poverty',
        'employment_industry',
        'employment_occupation',
    ]
)

### Train Test Split

In [None]:
# Split features and target with a fixed seed so the partition is reproducible.
split_parts = train_test_split(df_var, df_tar, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Continue with an independent (deep) copy of the training features.
X_train = X_train.copy()

In [None]:
# Column groups that drive the imputation and encoding steps.

# The seasonal-opinion items are both median-imputed and one-hot encoded.
_opinion_cols = [
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
]

_behavioral_cols = [
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
]

_health_cols = [
    'doctor_recc_seasonal',
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
]

_demographic_cols = [
    'education',
    'rent_or_own',
    'marital_status',
    'employment_status',
    'sex',
]

# Columns imputed with their most frequent value.
frequent_columns = [*_behavioral_cols, *_health_cols, *_demographic_cols]

# Columns imputed with their median.
median_columns = [*_opinion_cols, 'household_adults', 'household_children']

# Columns that are one-hot encoded.
ohe_cols = [
    *_opinion_cols,
    'age_group',
    'education',
    'race',
    'employment_status',
    'census_msa',
]

# Columns that are ordinal encoded.
oe_cols = ['sex', 'marital_status', 'rent_or_own']

# Columns that are not listed for imputation.
non_imputed_cols = ['age_group', 'race', 'census_msa']

# Ordinal Encoding (intended codes)
# Sex            - 0=Female  | 1=Male
# Marital Status - 0=Married | 1=Not Married
# Rent or Own    - 0=Own     | 1=Rent

In [None]:
# Imputation: most-frequent value for `frequent_columns`, median for
# `median_columns`; every other column passes through unchanged.
imputer_steps = [
    ("sim", SimpleImputer(strategy='most_frequent'), frequent_columns),
    ("sib", SimpleImputer(strategy='median'), median_columns),
]
col_imputer = ColumnTransformer(transformers=imputer_steps, remainder="passthrough")

# Encoding: ordinal codes for `oe_cols`, one-hot (first level dropped) for
# `ohe_cols`; every other column passes through unchanged.
encoder_steps = [
    ('oe', OrdinalEncoder(categories='auto'), oe_cols),
    ("ohe", OneHotEncoder(categories="auto", drop='first'), ohe_cols),
]
col_oe_ohe = ColumnTransformer(transformers=encoder_steps, remainder='passthrough')

# Single-step pipeline wrapping the imputing ColumnTransformer.
impute_pipe = Pipeline(steps=[('col_imputer', col_imputer)])

# Fit the imputers on the training features and apply them.
imputed = impute_pipe.fit_transform(X_train)

# Rebuild a labelled frame. The transformer output is ordered as the two
# imputed groups followed by the passthrough columns.
# NOTE(review): assumes `non_imputed_cols` lists exactly the passthrough
# columns, in their original order - confirm if the column drops change.
imputed_column_order = frequent_columns + median_columns + non_imputed_cols
X_train_pipe_impute = pd.DataFrame(imputed, columns=imputed_column_order)


# Create a pipeline that encodes the categorical columns and then
# standardizes every resulting column.
encode_scale_pipe = Pipeline(steps=[
    ('col_oe_ohe', col_oe_ohe),
    ('ss', StandardScaler())
])

# Fit and transform the imputed training data through the encode/scale pipeline.
transformed_data = encode_scale_pipe.fit_transform(X_train_pipe_impute)

# Isolate the fitted OneHotEncoder and build the names of its output columns.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older installations.
encoder = col_oe_ohe.named_transformers_['ohe']
if hasattr(encoder, 'get_feature_names_out'):
    category_labels = encoder.get_feature_names_out(ohe_cols)
else:
    category_labels = encoder.get_feature_names(ohe_cols)

# Make a dataframe with the transformed data. Column order follows the
# ColumnTransformer output: ordinal-encoded, one-hot-encoded, then passthrough.
X_train_pipe_processed = pd.DataFrame(transformed_data, columns=oe_cols + list(category_labels) + list(X_train_pipe_impute.drop(ohe_cols + oe_cols, axis=1).columns))
X_train_pipe_processed.head()

In [None]:
# Baseline gradient-boosting classifier: default hyperparameters, fixed seed.
# `fit` returns the estimator itself, so construction and fitting are chained.
gbt_clf = GradientBoostingClassifier(random_state=42).fit(X_train_pipe_processed, y_train)

# Mean accuracy on the same data the model was trained on.
gbt_clf.score(X=X_train_pipe_processed, y=y_train)


In [None]:
# grid = {
#     'loss': ['deviance', 'exponential'],
#     'learning_rate': [.1, .01, .001],
#     'n_estimators': [10, 50, 100],
#     'max_depth': [None, 2, 3],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 3]
# }

# gs_tree = GridSearchCV(gbt_clf, grid, cv=3, return_train_score=True)
# gs_tree.fit(X_train_pipe_processed, np.ravel(y_train))


# gs_tree.best_params_

In [None]:
# Hyperparameter candidates for the gradient-boosting grid search.
# NOTE(review): 'deviance' was renamed 'log_loss' in newer scikit-learn
# releases - confirm against the installed version.
grid = dict(
    loss=['deviance', 'exponential'],
    learning_rate=[.1, .01, .001],
    n_estimators=[10, 50, 100],
    min_samples_split=[2, 5, 10],
)

# Candidate left out of the current search:
#'min_samples_leaf': [1, 2, 3]

# Exhaustive search over the grid with 3-fold cross-validation; training-fold
# scores are recorded alongside the validation scores.
gs_tree = GridSearchCV(
    estimator=gbt_clf,
    param_grid=grid,
    cv=3,
    return_train_score=True,
)
gs_tree.fit(X_train_pipe_processed, np.ravel(y_train))

In [None]:
# Best hyperparameter combination found by the grid search.
gs_tree.best_params_

# Rebuild the classifier with explicitly typed hyperparameters.
# NOTE(review): these values are entered by hand, presumably copied from
# gs_tree.best_params_ - confirm they still match after re-running the search.
gbt_clf = GradientBoostingClassifier(
    random_state=42,
    learning_rate=.1,
    loss='exponential',
    min_samples_split=5,
)

# Fit on the processed training data, score it, and keep the hard class
# predictions for the training rows.
y_train_flat = np.ravel(y_train)
gbt_clf.fit(X_train_pipe_processed, y_train_flat)
gbt_clf.score(X_train_pipe_processed, y_train_flat)
y_pred = gbt_clf.predict(X_train_pipe_processed)

In [None]:
from sklearn.metrics import roc_auc_score

# 3-fold cross-validated accuracy of the tuned classifier on the training data.
cross_val = cross_val_score(gbt_clf, X_train_pipe_processed, np.ravel(y_train), scoring='accuracy', cv=3)
cross_val.mean()

# ROC AUC must be computed from continuous scores, not hard 0/1 predictions:
# thresholded labels collapse the ROC curve to a single operating point and
# make the value incomparable with `scoring='roc_auc'` cross-validation.
# Column 1 of predict_proba is the probability of the second entry of
# `gbt_clf.classes_` (the positive class for a 0/1 target).
train_scores = gbt_clf.predict_proba(X_train_pipe_processed)[:, 1]
auc_score = roc_auc_score(y_train, train_scores)


In [None]:
# Display the training-set ROC AUC stored in `auc_score` by the preceding cell.
auc_score

In [None]:
# Apply the already-fitted imputers to the held-out features (transform only,
# so nothing is re-learned from the test data).
imputed_test = impute_pipe.transform(X_test)

# Create new dataframe with newly imputed data, labelled in the same column
# order used for the training frame.
X_test_pipe_impute = pd.DataFrame(
    imputed_test,
    columns=[*frequent_columns, *median_columns, *non_imputed_cols],
)

# Encode and scale with the pipeline fitted on the training data.
transformed_test_data = encode_scale_pipe.transform(X_test_pipe_impute)

# Label the processed test matrix: ordinal-encoded columns, one-hot columns,
# then whatever columns were passed through untouched.
X_test_pipe_processed = pd.DataFrame(
    transformed_test_data,
    columns=[
        *oe_cols,
        *category_labels,
        *X_test_pipe_impute.drop(columns=ohe_cols + oe_cols).columns,
    ],
)



In [None]:
# Sweep over feature-subset sizes 1..max_features chosen by recursive feature
# elimination (RFE), recording the mean 3-fold ROC AUC for each subset.
gb_cv_rfe = []       # mean cross-validated ROC AUC, index n-1 -> n features
gb_keep_lists = []   # selected column names, index n-1 -> n features
max_features = 20

gb_rfe = GradientBoostingClassifier(random_state=42, learning_rate=.1, loss='exponential', min_samples_split=5)

# Run RFE once, all the way down to a single feature. With the default step
# of 1 and a seeded estimator the elimination path is deterministic and
# nested, so the subset RFE would pick for any n is exactly the features with
# ranking_ <= n. This avoids repeating the whole elimination for every n.
select = RFE(gb_rfe, n_features_to_select=1)
select.fit(X=X_train_pipe_processed, y=y_train)

for n in range(1, max_features + 1):
    # Boolean mask keeps the frame's original column order.
    current_keep_list = list(X_train_pipe_processed.columns[select.ranking_ <= n])

    # cross_val_score clones the estimator, so `gb_rfe` itself stays unfitted.
    current_cv = cross_val_score(gb_rfe, X_train_pipe_processed[current_keep_list], y_train, cv=3, scoring='roc_auc').mean()

    gb_cv_rfe.append(current_cv)
    gb_keep_lists.append(current_keep_list)

In [None]:
# Plot mean cross-validated ROC AUC against the number of RFE-selected features.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(range(1, max_features + 1), gb_cv_rfe)
ax.set_xlabel('Number of Features')
# The scores come from the gradient-boosting classifier, not a decision tree.
ax.set_ylabel('Mean Cross Val ROC AUC Score for Gradient Boosting')
plt.show()

In [None]:
# Final model: same hyperparameters as the tuned classifier, trained on the
# feature subset stored at index 19 of the RFE sweep results.
gb_final = GradientBoostingClassifier(
    random_state=42,
    learning_rate=.1,
    loss='exponential',
    min_samples_split=5,
)
gb_final.fit(X_train_pipe_processed.loc[:, gb_keep_lists[19]], y_train)

In [None]:
# Held-out ROC AUC for the final model. Use predicted probabilities rather
# than hard class labels so the score reflects the full ROC curve and is
# comparable with the `scoring='roc_auc'` cross-validation results.
# Column 1 of predict_proba corresponds to the second entry of
# `gb_final.classes_` (the positive class for a 0/1 target).
test_scores = gb_final.predict_proba(X_test_pipe_processed[gb_keep_lists[19]])[:, 1]
roc_auc_score(y_test, test_scores)

In [None]:
# Mean 5-fold cross-validated ROC AUC of the final model on the training data,
# restricted to the same selected feature subset.
cross_val_score(
    estimator=gb_final,
    X=X_train_pipe_processed[gb_keep_lists[19]],
    y=y_train,
    cv=5,
    scoring='roc_auc',
).mean()