# Exploring Multicollinearity of Input Variables

This notebook shows that the input variables most significant for predicting the target variable (`seasonal_vaccine`) are all highly correlated with each other.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix,plot_confusion_matrix, roc_auc_score, plot_roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Data Cleaning & Exploration

In [2]:
# Load the survey features; only the seasonal-vaccine label is kept as the target.
df_var = pd.read_csv('data/training_set_features.csv')
df_tar = pd.read_csv('data/training_set_labels.csv')['seasonal_vaccine']

# Columns excluded from the analysis: the respondent id, the H1N1-specific
# questions, the geographic region, and four further demographic/insurance columns.
dropped_columns = [
    'respondent_id', 'h1n1_concern', 'h1n1_knowledge',
    'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
    'opinion_h1n1_sick_from_vacc', 'doctor_recc_h1n1', 'hhs_geo_region',
    'health_insurance', 'income_poverty', 'employment_industry',
    'employment_occupation',
]
df_var = df_var.drop(columns=dropped_columns)

# Fixed seed so the train/test partition is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_var, df_tar, random_state=42
)
# Work on an independent copy of the training features.
X_train = X_train.copy()

### Preprocessing Pipeline

In [3]:
# Column groups used by the two preprocessing stages below.
# Stage 1 (imputation): majority_columns get the most frequent value,
# opinion_columns get the median; every other column is passed through.
# Stage 2 (encoding): oe_columns are ordinal-encoded, ohe_columns are one-hot
# encoded (first level dropped); every other column is passed through.
majority_columns = ['behavioral_antiviral_meds', 'behavioral_avoidance',
                     'behavioral_face_mask', 'behavioral_wash_hands',
                     'behavioral_large_gatherings', 'behavioral_outside_home',
                     'behavioral_touch_face', 'doctor_recc_seasonal',
                     'chronic_med_condition', 'child_under_6_months', 'health_worker',
                     'education', 'rent_or_own', 'marital_status', 'employment_status',
                     'sex'
                  ]
opinion_columns = ['opinion_seas_vacc_effective', 'opinion_seas_risk',
                   'opinion_seas_sick_from_vacc','household_adults', 'household_children'
                   ]
ohe_columns = ['opinion_seas_vacc_effective', 'opinion_seas_risk',
                   'opinion_seas_sick_from_vacc','age_group','education','race',
                   'employment_status', 'census_msa'
                   ]
# Columns that are not imputed; they reach the output through remainder="passthrough".
non_imputed_columns = ['age_group','race','census_msa']
oe_columns = ['sex','marital_status','rent_or_own']


col_imputer = ColumnTransformer(transformers=[
    ("sim", SimpleImputer(strategy='most_frequent'), majority_columns),

    ("sib", SimpleImputer(strategy='median'), opinion_columns)

    ],
    remainder="passthrough")

# sparse_threshold=0 forces a dense array: OneHotEncoder emits sparse output, and
# the DataFrame construction further down needs a regular 2-D array.
col_ohe = ColumnTransformer(transformers=[
    ('oe' , OrdinalEncoder(categories='auto'), oe_columns),
    ("ohe", OneHotEncoder(categories="auto", drop='first'), ohe_columns)
    ],
    remainder='passthrough',
    sparse_threshold=0)

# Create a pipeline containing the single column transformer
pipe1 = Pipeline(steps=[
    ('col_imputer', col_imputer)
])

imputed = pipe1.fit_transform(X_train)
# ColumnTransformer emits its transformers' outputs in declaration order followed
# by the passthrough remainder, which is the order the column labels are listed in.
# NOTE(review): this assumes the remainder columns appear in X_train in the order
# given by non_imputed_columns - confirm if the dropped columns ever change.
X_train_pipe_impute = pd.DataFrame(imputed, columns=majority_columns+opinion_columns+non_imputed_columns)

pipe2 = Pipeline(steps=[
    ('col_ohe', col_ohe)
])

# Use the pipeline to fit and transform the data
transformed_data = pipe2.fit_transform(X_train_pipe_impute)

encoder = col_ohe.named_transformers_['ohe']
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2; prefer
# get_feature_names_out and fall back only on releases that predate it.
if hasattr(encoder, 'get_feature_names_out'):
    category_labels = encoder.get_feature_names_out(ohe_columns)
else:
    category_labels = encoder.get_feature_names(ohe_columns)

# Make a dataframe with the relevant columns: ordinal-encoded columns first, then
# the one-hot dummy columns, then the columns that passed through untouched.
passthrough_columns = list(X_train_pipe_impute.drop(ohe_columns+oe_columns, axis=1).columns)
X_train_pipe_processed = pd.DataFrame(transformed_data, columns=oe_columns+list(category_labels)+passthrough_columns)

# Sex - 0=Female | 1=Male
# Marital Status - 0=Married | 1=Not Married
# Rent or Own - 0=Own | 1=Rent

## Feature Selection

### Recursive Feature Elimination for Logistic Regression Model

In [5]:
# Recursive Feature Elimination
#
# For each subset size from 1 to max_features, run RFE with an L1-penalised
# logistic regression, record which columns survive, and score that subset by
# its mean 3-fold cross-validated ROC AUC.
# lr_keep_lists[i] / lr_cv_rfe[i] hold the result for a subset of i + 1 features.

lr_cv_rfe = []
lr_keep_lists = []
max_features = 5
for n in range(1, max_features + 1):
    num_features_to_select = n
    lr_rfe = LogisticRegression(
        penalty='l1',
        solver='saga',
        C=0.08164183673469387,
        random_state=42,
    )
    select = RFE(lr_rfe, n_features_to_select=num_features_to_select)
    select.fit(X=X_train_pipe_processed, y=y_train)

    # support_ flags, column by column, whether RFE kept the feature.
    feature_list = list(zip(X_train_pipe_processed.columns, select.support_))
    current_keep_list = [name for name, kept in feature_list if kept]

    fold_scores = cross_val_score(
        lr_rfe,
        X_train_pipe_processed[current_keep_list],
        y_train,
        cv=3,
        scoring='roc_auc',
    )
    current_cv = fold_scores.mean()

    lr_cv_rfe.append(current_cv)
    lr_keep_lists.append(current_keep_list)

In [20]:
# Final Logreg Model Mean Cross Val AUC ROC Score
#
# The model is fitted on lr_keep_lists[0], the first feature subset recorded by
# the RFE loop (the subset selected when a single feature was requested).
# Cross-validation scores logreg_final itself rather than a variable left over
# from the RFE loop, so this cell depends only on the estimator it defines.

logreg_final = LogisticRegression(penalty='l1',random_state=42,solver='saga',C = 0.08164183673469387)
final_features = lr_keep_lists[0]
logreg_final.fit(X_train_pipe_processed[final_features],y_train)
# cross_val_score clones the estimator for each fold, so the fit above is unaffected.
cross_val_score(logreg_final,X_train_pipe_processed[final_features],y_train,cv=5,scoring='roc_auc').mean()

0.6663615671233896

In [10]:
# Pairwise chi-square tests of independence between the processed columns.
#
# Outputs:
#   column_tracker_list - one set {column1, column2} per significantly associated pair
#   p_value_list        - matching (column1, column2, p_value) tuples, in the same order
#
# Each unordered pair is visited exactly once (column1 always precedes column2 in
# the frame's column order) and the test is evaluated a single time per pair.
# NOTE(review): no multiple-comparison correction is applied to the threshold, and
# dummy columns derived from the same original variable are mutually exclusive, so
# they will be flagged as associated by construction.
p_value_list = []
column_tracker_list = []
significance_level = 0.05
all_columns = list(X_train_pipe_processed.columns)
for position, column1 in enumerate(all_columns):
    for column2 in all_columns[position + 1:]:
        # Observed contingency table of the two columns, without margin totals.
        obs = pd.crosstab(X_train_pipe_processed[column1],
                          X_train_pipe_processed[column2]).values
        # Element 1 of the chi2_contingency result is the p-value.
        p_value = stats.chi2_contingency(obs)[1]
        if p_value < significance_level:
            column_tracker_list.append(set([column1, column2]))
            p_value_list.append((column1, column2, p_value))

In [11]:
# Significant column pairs ordered from the largest p-value to the smallest.
# p_value_list holds each unordered pair only once (pairs are tracked as sets in
# column_tracker_list when they are recorded), so every entry is kept here;
# keeping only every second entry would silently drop half of the distinct pairs.
sorted_p = sorted(p_value_list, key=lambda x: x[2], reverse=True)
final_p = list(sorted_p)

In [13]:
# Is the two-feature subset chosen by RFE one of the significantly associated pairs?
# A hand-written pair can be checked the same way, e.g.
# {'opinion_seas_vacc_effective_5.0', 'opinion_seas_risk_4.0'}.
top_two_features = set(lr_keep_lists[1])
top_two_features in column_tracker_list

True