In [None]:
import gc

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

%matplotlib inline

In [None]:
# Load both splits and tag every row with its origin so the stacked frame can
# be separated again after the shared preprocessing steps.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df['type'] = 'train'
test_df['type'] = 'test'
# ignore_index=True gives the stacked frame a unique 0..n-1 row index; without
# it the test rows repeat the index labels of the train rows, which makes any
# later index-aligned assignment or join ambiguous.
all_data = pd.concat([train_df, test_df], axis=0, ignore_index=True)
print(len(train_df))
print(len(test_df))
print(len(all_data))


In [None]:
# Show the last ten rows of the stacked frame (the test rows were appended after the train rows).
all_data.tail(10)

In [None]:
# List the training columns (includes the 'type' tag added when loading).
train_df.columns

# Analysis and Missing Value Imputations

In [None]:
# Partition the training columns by storage type: object-dtype columns are
# treated as categorical, everything else as numeric.
cat_vars = [name for name in train_df.columns if train_df[name].dtypes == 'O']
num_vars = [name for name in train_df.columns if not train_df[name].dtypes == 'O']
print(len(num_vars))
print(len(cat_vars))

In [None]:
# 'id' and 'target' are not model features. Filtering (rather than calling
# list.remove) keeps this cell safe to re-run: a second run is a no-op instead
# of raising ValueError.
num_vars = [name for name in num_vars if name not in ('id', 'target')]
num_vars

In [None]:
# Report the number of missing values per column of the stacked frame.
for col, n_missing in all_data.isnull().sum().items():
    print(col, 'has', n_missing, 'null values')

In [None]:
# Column-name groups, generated from their numeric suffixes:
# bin_0..bin_4, ord_0..ord_5 and nom_0..nom_9.
bin_vars = [f'bin_{idx}' for idx in range(5)]
ord_vars = [f'ord_{idx}' for idx in range(6)]
nom_vars = [f'nom_{idx}' for idx in range(10)]

In [None]:
# Print the level frequencies of each binary column in the training data.
for bin_col in bin_vars:
    level_counts = train_df[bin_col].value_counts()
    print(level_counts)

In [None]:
# Impute the binary columns with fixed fill values.
all_data[['bin_0', 'bin_1', 'bin_2']] = all_data[['bin_0', 'bin_1', 'bin_2']].fillna(0.0)
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form operates on an intermediate
# object, is deprecated, and leaves all_data unchanged under copy-on-write.
all_data['bin_3'] = all_data['bin_3'].fillna('F')
all_data['bin_4'] = all_data['bin_4'].fillna('N')
for col in all_data.columns:
    print(col, 'has', all_data[col].isnull().sum(), 'null values')

In [None]:
# Print the level frequencies of each ordinal column in the training data.
for ord_col in ord_vars:
    level_counts = train_df[ord_col].value_counts()
    print(level_counts)

In [None]:
# Mark missing ordinal values with an explicit 'missing' level. fillna states
# the intent directly; the previous replace(np.nan, ..., regex=True) carried a
# regex flag that has no meaning for a NaN target.
all_data[ord_vars] = all_data[ord_vars].fillna('missing')
for col in all_data.columns:
    print(col, 'has', all_data[col].isnull().sum(), 'null values')

In [None]:
# Print the level frequencies of each nominal column in the training data.
for nom_col in nom_vars:
    level_counts = train_df[nom_col].value_counts()
    print(level_counts)

In [None]:
# Impute the nominal columns: nom_0 and nom_4 get a fixed level, the remaining
# nominal columns get an explicit 'missing' level.
# The filled columns are assigned back rather than using fillna(inplace=True)
# on a column selection, which is a deprecated chained operation and does not
# update all_data under copy-on-write.
all_data['nom_0'] = all_data['nom_0'].fillna('Red')
all_data['nom_4'] = all_data['nom_4'].fillna("Theremin")
missing_level_cols = ['nom_1', 'nom_2', 'nom_3', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
all_data[missing_level_cols] = all_data[missing_level_cols].fillna('missing')
for col in all_data.columns:
    print(col, 'has', all_data[col].isnull().sum(), 'null values')

In [None]:
# Fill missing day/month values with 0.0, then re-print the null report.
date_cols = ['day', 'month']
all_data[date_cols] = all_data[date_cols].fillna(0.0)
for col, n_missing in all_data.isnull().sum().items():
    print(col, 'has', n_missing, 'null values')

# Handling Categorical Variables

In [None]:
# Display the binary columns before bin_3 / bin_4 are converted to 0/1.
all_data[bin_vars]

In [None]:
# Map the two string-valued binary columns to integers:
# bin_3: 'F' -> 0, anything else -> 1; bin_4: 'N' -> 0, anything else -> 1.
all_data['bin_3'] = np.where(all_data['bin_3'].values == 'F', 0, 1).astype('int64')
all_data['bin_4'] = np.where(all_data['bin_4'].values == 'N', 0, 1).astype('int64')
all_data[bin_vars]

In [None]:
# Integer-encode every ordinal column; the encoder is re-fitted per column on
# the column's values cast to strings.
# NOTE(review): LabelEncoder assigns codes by sorted string value, which may
# not match the intended ordinal ranking of the levels; confirm.
lbl_enc = LabelEncoder()
for ord_col in ord_vars:
    as_text = all_data[ord_col].astype('str').values
    all_data[ord_col] = lbl_enc.fit_transform(as_text)

In [None]:
# Preview the frame after binary and ordinal encoding.
all_data.head()

In [None]:
# Split the nominal columns into two groups by suffix: nom_0..nom_4 are
# one-hot encoded below, nom_5..nom_9 are hash-encoded.
low_card_nom_vars = [f'nom_{idx}' for idx in range(0, 5)]
high_card_nom_vars = [f'nom_{idx}' for idx in range(5, 10)]


In [None]:
# One-hot encode the low-cardinality nominal columns (first level of each is
# dropped), append the dummies, then remove the source columns. Shapes are
# printed before the append, after it, and after the removal.
nom_var_dummies = pd.get_dummies(all_data[low_card_nom_vars], drop_first=True)
new_data = pd.concat([all_data, nom_var_dummies], axis=1)
print(all_data.shape)
print(new_data.shape)
new_data = new_data.drop(columns=low_card_nom_vars)
print(new_data.shape)

In [None]:
# Inspect the column names after one-hot encoding.
new_data.columns

In [None]:
# Hash-encode the high-cardinality nominal columns; the encoder is fitted on
# the full stacked frame and its output replaces new_data.
hashEncoder = ce.HashingEncoder(
    cols=high_card_nom_vars,
)
new_data = hashEncoder.fit_transform(new_data)

In [None]:
# Shape after hash encoding.
new_data.shape

In [None]:
# Split the encoded frame back into train and test parts using the 'type' tag.
is_train_row = new_data['type'] == 'train'
is_test_row = new_data['type'] == 'test'
train_target = new_data.loc[is_train_row, 'target']
# drop() returns fresh frames here. The previous in-place drop on a
# boolean-filtered selection triggers pandas' SettingWithCopyWarning.
train_set = new_data.loc[is_train_row].drop(columns=['id', 'target', 'type'])
# As before, only 'type' is removed from the test part, so test_set still
# carries 'id' and 'target' and therefore has more columns than train_set.
# NOTE(review): select train_set.columns before feeding test_set to a model.
test_set = new_data.loc[is_test_row].drop(columns=['type'])
print(train_set.shape)
print(test_set.shape)

In [None]:
# Lift the column display limit (a global pandas option) and show summary
# statistics for the encoded frame.
pd.options.display.max_columns = None
new_data.describe()

In [None]:
# Derive per-column variances by squaring the standard deviations reported by
# describe().
stds = new_data.describe().loc['std'].tolist()
variances = []
for std_value in stds:
    variances.append(std_value ** 2)
variances

In [None]:
# Distribution of the per-column variances.
# sns.distplot is deprecated; histplot with kde=True and stat='density' draws
# the equivalent histogram with a density curve.
fig, ax = plt.subplots(figsize=(15, 8))
sns.histplot(variances, bins=50, kde=True, stat='density', ax=ax)

In [None]:
# Feature columns available for selection (id, target and type already removed).
train_set.columns

# Variance Threshold checks the feature variances and removes the features whose variance falls below the threshold

In [None]:
# Fit a zero-variance filter on the training features and display the
# transformed array; train_set itself is left untouched.
var_th = VarianceThreshold(threshold=0)
var_th.fit(train_set).transform(train_set)

In [None]:
# Show the fitted selector's repr.
var_th

In [None]:
# Display train_set; the fit_transform call above did not modify it.
train_set

In [None]:
# Boolean mask over train_set columns: True marks a column the selector keeps.
var_th.get_support()

In [None]:
# Columns rejected by the zero-variance filter. Inverting the support mask
# once yields the same list, in column order, without rebuilding the
# kept-column index for every candidate column.
constant_columns = train_set.columns[~var_th.get_support()].tolist()
constant_columns

In [None]:
# Columns retained by the zero-variance filter.
train_set.columns[var_th.get_support()]

# Removal of Quasi-Constant features

Quasi-constant features are features that are almost constant: they take the same value for a very large share of the observations. There is no fixed rule for what the variance threshold for quasi-constant features should be. However, as a rule of thumb, remove those quasi-constant features in which more than 99% of the observations share the same value.

In [None]:
# Fit a second filter with a 0.01 variance threshold to flag quasi-constant
# features, and display its keep-mask. Nothing is removed from train_set here.
quasi_const = VarianceThreshold(
    threshold=0.01,
)
quasi_const.fit(train_set)
quasi_const.get_support()

# Duplicate Variables

We will remove duplicate variables, i.e. columns that hold exactly the same values as another column: they contribute nothing to the model and instead act as noise that can hinder model performance. pandas has no built-in method for finding duplicate columns, but `DataFrame.duplicated()` flags duplicate rows, which can then be filtered out. We can therefore transpose the DataFrame and apply this method to it. However, this is computationally very expensive.

In [None]:
# Transpose so that each feature becomes a row, which is the layout
# DataFrame.duplicated() needs to compare features.
train_set_T = train_set.transpose()
train_set_T.shape

In [None]:
# train_set_T = train_set_T[~train_set_T.duplicated(keep = 'first')]

In [None]:
# Pairwise correlation of the training features, rendered as a heatmap.
corr_matrix = train_set.corr()
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(corr_matrix, ax=ax)

# Get features with correlation greater than threshold

In [None]:
# Display the correlation matrix itself for manual inspection.
corr_matrix

# Rather than any method, just try to figure out manually, where the correlation is greater than threshold(to be decided manually). For any two features found, check for the one with lower variance and eliminate that one.
    

# Selecting features on the basis of ROC- AUC Curve

Features with roc_auc score > 0.5 are considered as good and those with the score less than 0.5 can be discarded

In [None]:
# Univariate screening: train a random forest on each feature alone and record
# its ROC-AUC, one entry per column in train_set.columns order.
# The score is measured on a stratified hold-out split using predicted
# probabilities. Scoring hard labels on the very rows the forest was fitted on
# inflates the AUC and makes the "> 0.5" rule uninformative.
roc_auc = []
X_fit, X_val, y_fit, y_val = train_test_split(
    train_set, train_target, test_size=0.2, random_state=45, stratify=train_target
)
# Fixed random_state so the screening result is reproducible across runs.
rfc = RandomForestClassifier(class_weight='balanced', random_state=45)
for col in train_set.columns:
    rfc.fit(X_fit[[col]], y_fit)
    # Column 1 of predict_proba is the larger class label (the positive class
    # for a 0/1 target).
    val_scores = rfc.predict_proba(X_val[[col]])[:, 1]
    roc_auc.append(roc_auc_score(y_val, val_scores))

In [None]:
# Per-feature ROC-AUC scores, in train_set.columns order.
# Author's note from the recorded run: every score was above 0.5, so no
# feature was removed at this step.
roc_auc

In [None]:
# Work on copies from here on: the linear models below need scaled features,
# while tree-based models (random forest, decision tree) do not, so the
# unscaled train_set / test_set are kept intact.
train_set_copy, test_set_copy = train_set.copy(), test_set.copy()

In [None]:
# Standardise the features using statistics learned from the training rows
# only. Re-fitting the scaler on the test rows (as before) would scale train
# and test with different means/variances.
feature_cols = train_set_copy.columns
std_sclr = StandardScaler()
std_sclr.fit(train_set_copy)
# Restrict the test frame to the training feature columns before transforming:
# test_set carries extra non-feature columns (id, target).
test_set_copy = pd.DataFrame(
    std_sclr.transform(test_set_copy[feature_cols]),
    columns=feature_cols,
    index=test_set_copy.index,
)
# Wrap the scaled arrays back into DataFrames: later cells select features by
# name (X_train.columns, X_train[cols]) and rely on the row index to stay
# aligned with train_target.
train_set_copy = pd.DataFrame(
    std_sclr.transform(train_set_copy),
    columns=feature_cols,
    index=train_set_copy.index,
)

In [None]:
# Hold out 20% of the scaled training rows for evaluation (fixed seed 45).
X_train, X_test, y_train, y_test = train_test_split(
    train_set_copy,
    train_target,
    test_size=0.2,
    random_state=45,
)

In [None]:
# Inspect the training partition produced by the split.
X_train

In [None]:
# lets create a dataframe to save model's performance
model_df = pd.DataFrame(columns = ["model_name","training roc_auc","test_roc_auc"])

In [None]:
# Force a garbage-collection pass before the GLM cells below; the cell output
# is the number of unreachable objects found. Requires `gc` to be imported
# before this cell runs.
gc.collect()

In [None]:
# GLM Model
X_train_sm = sm.add_constant(X_train)
logm1 = sm.GLM(y_train, X_train_sm, class_weight = 'balanced', family = sm.families.Binomial())
logm1.fit().summary()

In [None]:
# Start the candidate feature list from every training column.
# Requires X_train to be a DataFrame (a plain ndarray has no .columns).
cols = X_train.columns

In [None]:
# Remove 'bin_3' from the candidate features (manual elimination step based on
# the GLM summary above), then show what remains.
cols = cols.drop(['bin_3'])
cols

In [None]:
# Refit the binomial GLM on the reduced feature list and show its summary.
# `class_weight` is not a statsmodels GLM argument and had no balancing effect,
# so it has been removed.
X_train_sm = sm.add_constant(X_train[cols])
logm2 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
# Remove 'nom_1_missing' from the candidate features (next manual elimination
# step), then show what remains.
cols = cols.drop(['nom_1_missing'])
cols

In [None]:
# Force a garbage-collection pass between GLM fits. `gc` is imported in the
# notebook's import cell, so the import is no longer repeated mid-notebook.
gc.collect()

In [None]:
# Refit the binomial GLM after the second elimination and show its summary.
# `class_weight` is not a statsmodels GLM argument and had no balancing effect,
# so it has been removed.
X_train_sm = sm.add_constant(X_train[cols])
logm3 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())
res = logm3.fit()
res.summary()

In [None]:
# Variance inflation factor for every remaining feature, rounded to two
# decimals and listed from highest to lowest.
design_matrix = X_train[cols].values
vif_values = [
    variance_inflation_factor(design_matrix, position)
    for position in range(len(cols))
]
vif = pd.DataFrame({'Features': cols, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values('VIF', ascending=False)
vif

In [None]:
# since nom_3_india has a high vif, we will remove this column
cols = cols.drop(['nom_3_India', 'nom_3_Costa Rica'])
vif = pd.DataFrame()
vif['Features'] = cols
vif['VIF'] = [variance_inflation_factor(X_train[cols].values, i) for i in range(len(cols))]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values('VIF', ascending = False)
vif

# Logistic Regression

In [None]:
# Restrict both partitions to the surviving features, then fit a
# class-balanced logistic regression on the training partition.
X_train, X_test = X_train[cols], X_test[cols]

lgr = LogisticRegression(class_weight='balanced')
lgr.fit(X_train, y_train)

In [None]:
# Inspect the final training matrix after feature elimination.
X_train

In [None]:
# make predictions
