In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import ShuffleSplit

In [2]:
df = pd.read_csv('data/DefaultCreditcardClients.csv')
df.rename(columns={'default payment next month':'default'}, inplace=True)
df.index = df.ID

In [3]:
# 1. Remove attributes that just arent useful for us
del df['ID']

In [4]:
#Create Lists for Analysis
continuous_features = ['LIMIT_BAL', 'BILL_AMT1', 'BILL_AMT2','BILL_AMT3',
                       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
                       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
                       'PAY_AMT6']
ordinal_features = ['EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0','PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6','default']

In [5]:
#Convert datatypes
df[continuous_features] = df[continuous_features].astype(np.float64)
df[ordinal_features] = df[ordinal_features].astype(np.int64)

In [6]:
#Transformations

#convert any non-identified education categories to 'OTHER'
df['EDUCATION'] = df['EDUCATION'].replace(to_replace=(0,5,6),value=4)

#convert any non-identified marriage categories to 'OTHER'
df['MARRIAGE'] = df['MARRIAGE'].replace(to_replace=(0),value=3)

#Log transform continuous variables; as they each have a mostly exponential distribution
df["log_LIMIT_BAL"]=np.log(df.LIMIT_BAL)
df["log_PAY_AMT1"]=np.log(df.PAY_AMT1+1)
df["log_PAY_AMT2"]=np.log(df.PAY_AMT2+1)
df["log_PAY_AMT3"]=np.log(df.PAY_AMT3+1)
df["log_PAY_AMT4"]=np.log(df.PAY_AMT4+1)
df["log_PAY_AMT5"]=np.log(df.PAY_AMT5+1)
df["log_PAY_AMT6"]=np.log(df.PAY_AMT6+1)

In [7]:
#as determined in prio lab analysis
#high correlation between BILL Amount and Pay amount so ignire bill amount for now
dfsub = df

In [8]:
del dfsub['LIMIT_BAL']
del dfsub['BILL_AMT1']
del dfsub['BILL_AMT2']
del dfsub['BILL_AMT3']
del dfsub['BILL_AMT4']
del dfsub['BILL_AMT5']
del dfsub['BILL_AMT6']
del dfsub['PAY_AMT1']
del dfsub['PAY_AMT2']
del dfsub['PAY_AMT3']
del dfsub['PAY_AMT4']
del dfsub['PAY_AMT5']
del dfsub['PAY_AMT6']

In [9]:
df[['EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0','PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6','default']].describe()

Unnamed: 0,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,default
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,1.842267,1.557267,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,-0.2662,-0.2911,0.2212
std,0.744494,0.521405,9.217904,1.123802,1.197186,1.196868,1.169139,1.133187,1.149988,0.415062
min,1.0,1.0,21.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,0.0
25%,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
50%,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,2.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4.0,3.0,79.0,8.0,8.0,8.0,8.0,8.0,8.0,1.0


In [12]:
# we want to predict the X and y data as follows:
from sklearn.model_selection import ShuffleSplit
y = df['default'].values # get the labels we want
X = dfsub.values # use everything else to predict!

    ## X and y are now numpy matrices, by calling 'values' on the pandas data frames we
    # have converted them into simple matrices to use with scikit learn
    
    
# to use the cross validation object in scikit learn, we need to grab an instance
#    of the object and set it up. This object will be able to split our data into 
#    training and testing splits
num_cv_iterations = 10
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,test_size  = 0.2)          
print(cv_object)

ShuffleSplit(n_splits=10, random_state=None, test_size=0.2, train_size=None)


In [13]:
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as mt
lr_clf = LogisticRegression(penalty='12', C=1.0, class_weight=None)
iter_num=0

for train_indices, test_indices in cv_object:
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]
    
    lr_clf.fit(X_train,y_train)
    y_hat = lr_clf.predict(X_test)
    
    acc=mt.accuracy_score(y_test,y_hat)
    conf= mt.confusion_matrix(y_test,y_hat)
    print("====iteration ",iter_num,"=====")
    print("accuracy",acc)
    print("confusion matrix \n",conf)
    iter_num+=1

TypeError: 'ShuffleSplit' object is not iterable

In [None]:
>>> from sklearn.model_selection import ShuffleSplit
>>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]])
>>> y = np.array([1, 2, 1, 2, 1, 2])
>>> rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
>>> rs.get_n_splits(X)