## %pylab inline

In [0]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import roc_curve, auc
import pylab as pl
import numpy as np

import warnings                              # Disable some warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


In [0]:
# Load the Dataiku dataset into an in-memory pandas DataFrame.
# The dataset name and the `limit` passed to get_dataframe are named here
# so they are easy to spot and tune.
RISK_DATASET_NAME = "Risk"
ROW_LIMIT = 100000

dataset_wage = dataiku.Dataset(RISK_DATASET_NAME)
df = dataset_wage.get_dataframe(limit=ROW_LIMIT)

In [0]:
# Get some simple descriptive statistics for the loaded dataframe, using the
# audit helper from dataiku's pandasutils module (imported as `pdu`).
# The call is the last expression of the cell, so its return value is what
# the notebook renders as the cell output.
pdu.audit(df)

In [0]:
# Stepwise (forward-backward) feature selection for a logistic regression
def stepwise_selection(X, y,
                       initial_list=[],
                       threshold_in=0.05,
                       threshold_out = 0.1,
                       verbose=True):
    """Perform a forward-backward feature selection based on the p-values
    of a logistic regression (statsmodels.api.Logit).

    Each iteration runs two steps:
      * forward  - every feature not yet selected is tried in turn, added to
                   the current selection; the one with the smallest p-value
                   is kept if that p-value is below ``threshold_in``;
      * backward - the model is refitted on the current selection and the
                   feature with the largest p-value is removed if that
                   p-value is above ``threshold_out``.
    The loop stops as soon as an iteration neither adds nor drops a feature.

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the (binary) target
        initial_list - list of features to start with (column names of X);
                       it is copied, never modified
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features, in the order they were added
    Always set threshold_in < threshold_out to avoid infinite looping.
    """
    # Work on a copy so neither the caller's list nor the default is mutated.
    included = list(initial_list)
    while True:
        changed = False

        # ---- forward step ------------------------------------------------
        # Keep X's column order (instead of a set difference) so that the
        # sequence of fits and the tie-breaking in idxmin() are reproducible
        # from one kernel to the next.
        already_in = set(included)
        excluded = [col for col in X.columns if col not in already_in]
        # An explicit float dtype is required: without it recent pandas
        # builds an object Series, on which idxmin() fails.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            # disp=False silences the optimizer message printed by each fit;
            # only the Add/Drop lines controlled by `verbose` are reported.
            model = sm.Logit(y, sm.add_constant(X[included + [new_column]])).fit(disp=False)
            new_pval.loc[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()  # NaN when there is no candidate left
        if best_pval < threshold_in:  # always False for NaN
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # ---- backward step -----------------------------------------------
        # Nothing can be dropped from an empty selection, so skip the
        # (intercept-only) fit in that case.
        if included:
            model = sm.Logit(y, sm.add_constant(X[included])).fit(disp=False)
            # Look only at the selected features, i.e. ignore the intercept,
            # by label rather than by position.
            pvalues = model.pvalues.loc[included]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

        if not changed:
            break
    return included

In [0]:
# Candidate features: every column of df except the identifier and the target.
#  - "pred" is written into df by the ROC cell further down; it is dropped here
#    (if present) so that re-running this cell never offers the model's own
#    predictions as a candidate feature.
#  - columns containing missing values are removed, as the fits performed by
#    stepwise_selection do no missing-data handling of their own.
selected_fields = (
    df.drop(columns=["id", "risk"])
      .drop(columns=["pred"], errors="ignore")
      .dropna(axis="columns")
)
# Target
y = df['risk']

result = stepwise_selection(selected_fields, y)

In [0]:
# Show the feature names returned by the stepwise selection, under a heading line.
print('resulting features:', result, sep='\n')

In [0]:
# Design matrix: the selected features (kept in df's column order) plus an
# intercept column prepended by add_constant.
# `result` still holds the feature list here: the fit below is stored under its
# own name so this cell can be re-run without clobbering the selection.
selected_features = list(result)
X = df[[col for col in df.columns if col in selected_features]]
X = sm.add_constant(X)

# statsmodels logistic regression. fit_regularized() is called with its
# default penalty weight (alpha=0), i.e. no regularization is applied.
logit_model = sm.Logit(y, X)
logit_result = logit_model.fit_regularized()
print(logit_result.summary2())

# print Exp(B) (odds ratios) with their confidence intervals
params = logit_result.params
conf = logit_result.conf_int()
conf['OR'] = params
conf.columns = ['Lower', 'Upper', 'Odds Ratio']
print ("\nexp(B) & confidence intervals: ")
print (np.exp(conf))

# confusion matrix of the fitted model
print("\nConfusion matrix:")
print(logit_result.pred_table())

In [0]:
# Fit on the full design matrix X, intercept included. Slicing X.columns[1:]
# would strip the "const" column that add_constant put first, and the ROC
# curve would then describe a no-intercept model different from the one
# summarised above.
roc_model = sm.Logit(df['risk'], X).fit()
# Predicted probabilities are stored next to the data, as before.
df['pred'] = roc_model.predict(X)

fpr, tpr, thresholds = roc_curve(df['risk'], df['pred'])
roc_auc = auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)

# tf = sensitivity - specificity = tpr - (1 - fpr); the reported cut-off is the
# threshold where |tf| is smallest, i.e. where the two rates are closest.
roc = pd.DataFrame({'tf': tpr - (1 - fpr), 'cut-off': thresholds})
roc_t = roc.loc[[roc['tf'].abs().idxmin()]]
print("Optimal cut-off for accuracy :")
print(list(roc_t['cut-off']))