In [10]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

In [5]:
# Read the credit-card applications CSV; cells holding the strings 'yes'/'no'
# are converted to booleans at parse time via true_values / false_values.
creditCardData = pd.read_csv(
    'data/AER_credit_card_data.csv',
    true_values=['yes'],
    false_values=['no'],
)

# Show the first few rows as this cell's output for a quick sanity check.
creditCardData.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,True,0,37.66667,4.52,0.03327,124.9833,True,False,3,54,1,12
1,True,0,33.25,2.42,0.005217,9.854167,False,False,3,34,1,13
2,True,0,33.66667,4.5,0.004156,15.0,True,False,4,58,1,5
3,True,0,30.5,2.54,0.065214,137.8692,False,False,0,25,1,7
4,True,0,32.16667,9.7867,0.067051,546.5033,True,False,2,64,1,5


In [11]:
# Separate the target column `card` from the remaining columns (the features).
X = creditCardData.drop(['card'], axis=1)
y = creditCardData.card

# The imputer is deliberately placed inside the pipeline: cross_val_score
# re-fits the whole pipeline on each training fold, so the imputation
# statistics are never computed from the held-out fold. Imputing the full
# data set *before* cross-validation is what would leak information.
# NOTE: `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# on newer versions use sklearn.impute.SimpleImputer instead.
# A fixed random_state makes the forest, and therefore the reported score,
# reproducible across kernel restarts.
modelPipeline = make_pipeline(Imputer(), RandomForestClassifier(random_state=0))
cvScore = cross_val_score(modelPipeline, X, y, scoring='accuracy')

# Build one string before printing so Python 2 prints text rather than the
# repr of a (label, value) tuple; the result is the same on Python 3.
print("CV Score: {}".format(cvScore.mean()))

('CV Score: ', 0.9802857734520605)


In [12]:
'''
Everyone with card == False had no expenditures, while only 2% of those with card == True had no expenditures.
It's not surprising that our model appeared to have a high accuracy. But this seems to be a data leak,
where expenditure probably means expenditure *on the card they applied for*.

Since share is partially determined by expenditure, it should be excluded too. 
The variables active and majorcards are a little less clear, but from their descriptions, they sound concerning.
In most situations,
it's better to be safe than sorry if you can't track down the people who created the data to find out more.
'''

# Columns flagged as potential target leaks; they are removed from the
# feature matrix before re-scoring the same pipeline.
potentialPredictorLeaks = ['expenditure', 'active', 'majorcards', 'share']
X_noleaks = X.drop(potentialPredictorLeaks, axis=1)

# cross_val_score clones the estimator for every fold, so reusing
# modelPipeline here does not carry over any fitted state from earlier runs.
cvScore_noleaks = cross_val_score(modelPipeline, X_noleaks, y, scoring='accuracy')

# Build one string before printing so Python 2 prints text rather than the
# repr of a (label, value) tuple; the result is the same on Python 3.
print("No-leaks CV Score: {}".format(cvScore_noleaks.mean()))

('No-leaks CV Score: ', 0.7960619866086837)
