# Load libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold
import seaborn as sns
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt



In [2]:
plt.style.use('ggplot')

%matplotlib inline

#  
# Load data

In [3]:
df = pd.read_csv('../../data/cs-training.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [5]:
df.rename(columns={df.columns[0]:'id'}, inplace=True)

In [6]:
df.head()

Unnamed: 0,id,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [7]:
len(df)

150000

In [8]:
df.mean(axis=0)

id                                      75000.500000
SeriousDlqin2yrs                            0.066840
RevolvingUtilizationOfUnsecuredLines        6.048438
age                                        52.295207
NumberOfTime30-59DaysPastDueNotWorse        0.421033
DebtRatio                                 353.005076
MonthlyIncome                            6670.221237
NumberOfOpenCreditLinesAndLoans             8.452760
NumberOfTimes90DaysLate                     0.265973
NumberRealEstateLoansOrLines                1.018240
NumberOfTime60-89DaysPastDueNotWorse        0.240387
NumberOfDependents                          0.757222
dtype: float64

In [9]:
df.rename(columns={'SeriousDlqin2yrs':'target'}, inplace=True)

features = [c for c in df.columns if c not in ['id','target']]

#  
# Replacing NANs

In [10]:
for c in df.columns:
    nan_count = len(df[df[c].isnull()])
    if nan_count>0:
        print(c, nan_count, 1.0*nan_count/len(df))

('MonthlyIncome', 29731, 0.19820666666666667)
('NumberOfDependents', 3924, 0.02616)


In [11]:
print(min(df['MonthlyIncome']))
print(min(df['NumberOfDependents']))

0.0
0.0


In [12]:
df = df.fillna(-1)

#  
# Generate stacked dataset

In [13]:
# BALANCE BEFORE!!
def generateStackedDataset(df,
                           features,
                           target,
                           models):
    
    kf = StratifiedKFold(y=df[target], 
                           n_folds=5, 
                           shuffle=True,
                           random_state=42)
    
    if list(df.index.values) != list(range(len(df))):
        print("ERROR WITH INDEX")
        return 0
  
    for train_index, test_index in kf:        
        # Train the model
        X_train = df.loc[train_index,features].values
        y_train = df.loc[train_index,target].values    
        X_test = df.loc[test_index,features].values
        y_test = df.loc[test_index,target].values  
              
        for i in range(len(models)):
            print('.'),
            model = models[i]
            model.fit(X_train, y_train)
            # Calculate the predictions            
            df.loc[test_index, 'proba_'+str(i)] = model.predict_proba(X_test)[:,1]
        print(" ")
    return df[[c for c in df.columns if 'proba_' in c]+[target]]

In [17]:
class_1_samples = len(df[df['target']==1])

In [18]:
datasets = []
#generating balanced datasers
for i in range(5):
    df_balanced = pd.concat([df[df['target']==0].sample(n=class_1_samples),
                             df[df['target']==1]]).reset_index(drop=True) 
    datasets.append(df_balanced)
df_big_balanced = pd.concat(datasets).reset_index(drop=True)

In [19]:
df_big_balanced.head()

Unnamed: 0,id,target,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,83921,0,0.044553,39,0,0.150207,13800.0,3,0,1,0,0.0
1,91909,0,0.0,63,0,0.790735,625.0,3,0,0,0,0.0
2,117440,0,0.612729,57,1,3.742053,1100.0,8,0,2,0,1.0
3,79498,0,0.076883,59,1,2771.0,-1.0,12,0,2,0,0.0
4,117331,0,0.067656,62,0,0.012427,7000.0,11,0,0,0,0.0


In [20]:
len(df_big_balanced)

100260

In [25]:
class_1_samples = len(df[df['target']==1])
models = [GradientBoostingClassifier(n_estimators=10),
          GradientBoostingClassifier(n_estimators=15),
#           GradientBoostingClassifier(n_estimators=200),
#           RandomForestClassifier(n_estimators=100),
#           RandomForestClassifier(n_estimators=150),
#           RandomForestClassifier(n_estimators=200),
#           AdaBoostClassifier(n_estimators=100),
#           AdaBoostClassifier(n_estimators=150),
          AdaBoostClassifier(n_estimators=20),
          LogisticRegression()]




In [21]:
features = [c for c in df_big_balanced.columns if c not in ['target','id']]

In [22]:
features

['RevolvingUtilizationOfUnsecuredLines',
 'age',
 'NumberOfTime30-59DaysPastDueNotWorse',
 'DebtRatio',
 'MonthlyIncome',
 'NumberOfOpenCreditLinesAndLoans',
 'NumberOfTimes90DaysLate',
 'NumberRealEstateLoansOrLines',
 'NumberOfTime60-89DaysPastDueNotWorse',
 'NumberOfDependents']

In [26]:
df_big_balanced_stacked = generateStackedDataset(df_big_balanced,
                                                   features,
                                                   'target',
                                                   models)

. . . .  
. . . .  
. . . .  
. . . .  
. . . .  


In [27]:
df_big_balanced_stacked.head()

Unnamed: 0,proba_0,proba_1,proba_2,proba_3,target
0,0.288804,0.240561,0.4746,0.405839,0
1,0.288804,0.231812,0.476354,0.288371,0
2,0.634817,0.633504,0.507757,0.504463,0
3,0.452591,0.427242,0.488166,0.442417,0
4,0.290765,0.233774,0.472664,0.272435,0


# Evaluate

In [28]:
features = [c for c in df_big_balanced_stacked.columns if c!='target']
print(features)

['proba_0', 'proba_1', 'proba_2', 'proba_3']


In [29]:
scores = cross_val_score(LogisticRegression(), 
                         #RandomForestClassifier(), 
                        df_big_balanced[features], 
                        df_big_balanced['target'], 
                        scoring='roc_auc', 
                        cv=10)

In [30]:
np.mean(scores),np.std(scores)

(0.85889042369750879, 0.0021752553487585429)

# Fit the models

In [36]:
for model in models:
    model.fit(df_big_balanced[[c for c in df_big_balanced.columns if c not in ['id','target']]], 
              df_big_balanced['target'])

In [39]:
model_final = LogisticRegression()
model_final.fit(df_big_balanced_stacked[[c for c in df_big_balanced_stacked.columns if c!='target']],
                df_big_balanced_stacked['target'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#  
#  
# Generate the solution

In [40]:
df_toPredict = pd.read_csv('../../data/cs-test.csv')
df_toPredict = df_toPredict.fillna(-1)

In [45]:
features = ['RevolvingUtilizationOfUnsecuredLines',
         'age',
         'NumberOfTime30-59DaysPastDueNotWorse',
         'DebtRatio',
         'MonthlyIncome',
         'NumberOfOpenCreditLinesAndLoans',
         'NumberOfTimes90DaysLate',
         'NumberRealEstateLoansOrLines',
         'NumberOfTime60-89DaysPastDueNotWorse',
         'NumberOfDependents']

In [46]:
for i in range(len(models)):
    df_toPredict.loc[test_index, 'proba_'+str(i)] = model.predict_proba(df_toPredict[features])[:,1]
#return df_toPredict[[c for c in df.columns if 'proba_' in c]+[target]]

ValueError: X has 10 features per sample; expecting 14

### Generate 10 datasets using sampling where class are balanced  and use the mean of the 10 predictions

In [66]:
df_target1 = df[df['target']==1].copy().reset_index(drop=True)
df_target0 = df[df['target']==0].copy()

In [67]:
#dfs = []
for i in range(10):
    print(i) #just added
    df_balanced = pd.concat([df_target1,
                             df_target0.sample(n=len(df_target1))]).reset_index(drop=True)
    clf.fit(df_balanced[features],
           df_balanced['target'])
    df_toPredict['Probability'+str(i)] = clf.predict_proba(df_toPredict[features])[:,1]
    

0
1
2
3
4
5
6
7
8
9


In [68]:
df_toPredict.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,...,Probability0,Probability1,Probability2,Probability3,Probability4,Probability5,Probability6,Probability7,Probability8,Probability9
0,1,-1.0,0.885519,43,0,0.177513,5700.0,4,0,0,...,0.497515,0.484593,0.51476,0.512758,0.468555,0.541621,0.516301,0.505471,0.492174,0.538217
1,2,-1.0,0.463295,57,0,0.527237,9141.0,15,0,4,...,0.47928,0.454399,0.357787,0.423093,0.454664,0.374319,0.513709,0.374143,0.455867,0.406493
2,3,-1.0,0.043275,59,0,0.687648,5083.0,12,0,1,...,0.134342,0.147749,0.156626,0.153177,0.136374,0.141616,0.145873,0.172736,0.173187,0.141362
3,4,-1.0,0.280308,38,1,0.925961,3200.0,7,0,2,...,0.554288,0.541085,0.561594,0.566923,0.565656,0.555151,0.557026,0.55487,0.527672,0.555432
4,5,-1.0,1.0,27,0,0.019917,3865.0,4,0,0,...,0.624148,0.585709,0.614193,0.641473,0.594752,0.617336,0.680067,0.624035,0.609328,0.572535


In [69]:
df_toPredict['Probability'] = df_toPredict[['Probability'+str(i) for i in range(10)]].mean(axis=1)

In [70]:
df_toPredict.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,...,Probability1,Probability2,Probability3,Probability4,Probability5,Probability6,Probability7,Probability8,Probability9,Probability
0,1,-1.0,0.885519,43,0,0.177513,5700.0,4,0,0,...,0.484593,0.51476,0.512758,0.468555,0.541621,0.516301,0.505471,0.492174,0.538217,0.507196
1,2,-1.0,0.463295,57,0,0.527237,9141.0,15,0,4,...,0.454399,0.357787,0.423093,0.454664,0.374319,0.513709,0.374143,0.455867,0.406493,0.429375
2,3,-1.0,0.043275,59,0,0.687648,5083.0,12,0,1,...,0.147749,0.156626,0.153177,0.136374,0.141616,0.145873,0.172736,0.173187,0.141362,0.150304
3,4,-1.0,0.280308,38,1,0.925961,3200.0,7,0,2,...,0.541085,0.561594,0.566923,0.565656,0.555151,0.557026,0.55487,0.527672,0.555432,0.55397
4,5,-1.0,1.0,27,0,0.019917,3865.0,4,0,0,...,0.585709,0.614193,0.641473,0.594752,0.617336,0.680067,0.624035,0.609328,0.572535,0.616358


In [71]:
df_toPredict.rename(inplace=True, columns={df_toPredict.columns[0]:'Id'})

In [72]:
df_toPredict[['Id','Probability']].to_csv('predictions.csv',index=False)