In [1]:
import pandas as pd
import numpy as np
import re
import sklearn
import seaborn as sns
from collections import Counter

import warnings
warnings.filterwarnings('ignore')


sns.set(style='white', context='notebook', palette='deep')
pd.options.display.max_columns = 200

In [2]:
ktrain = pd.read_csv("train.csv")
ktest = pd.read_csv("test.csv")
ktrain.isnull().sum() #sum of null values 

Unnamed: 0                                  0
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [3]:
ktest.isnull().sum()

Unnamed: 0                                   0
SeriousDlqin2yrs                        101503
RevolvingUtilizationOfUnsecuredLines         0
age                                          0
NumberOfTime30-59DaysPastDueNotWorse         0
DebtRatio                                    0
MonthlyIncome                            20103
NumberOfOpenCreditLinesAndLoans              0
NumberOfTimes90DaysLate                      0
NumberRealEstateLoansOrLines                 0
NumberOfTime60-89DaysPastDueNotWorse         0
NumberOfDependents                        2626
dtype: int64

In [4]:
def detect_outliers(df,n,features):
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers

# detect outliers from Age, SibSp , Parch and Fare
# These are the numerical features present in the dataset
Outliers_to_drop = detect_outliers(ktrain,2,["RevolvingUtilizationOfUnsecuredLines",
                                            "age",
                                            "NumberOfTime30-59DaysPastDueNotWorse",
                                            "DebtRatio",
                                            "MonthlyIncome",
                                            "NumberOfOpenCreditLinesAndLoans",
                                            "NumberOfTimes90DaysLate",
                                            "NumberRealEstateLoansOrLines",
                                            "NumberOfTime60-89DaysPastDueNotWorse",
                                            "Unnamed: 0",
                                            "NumberOfDependents"])

In [5]:
ktrain.loc[Outliers_to_drop]

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
900,901,0,2.258964,33,2,0.032484,2000.0,1,1,0,0,2.0
919,920,1,1.362142,73,1,274.000000,,2,3,0,1,
1958,1959,0,1.634551,28,0,151.000000,,1,3,0,2,0.0
4852,4853,0,1.566866,46,2,0.104983,6000.0,3,2,0,0,0.0
5956,5957,0,2.237856,60,6,2597.000000,1.0,16,1,2,1,4.0
6251,6252,1,1.481038,26,0,22.000000,,1,0,0,1,0.0
6703,6704,0,1.421927,50,1,0.058003,4085.0,2,0,0,1,2.0
6907,6908,0,1.548094,24,0,37.000000,,3,0,0,2,
7079,7080,0,1.485050,66,1,1049.000000,,3,0,0,0,0.0
7352,7353,1,1.650560,35,1,1596.000000,,13,0,0,0,2.0


In [6]:
ktrain = ktrain.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)

In [7]:
ktrain_len = len(ktrain)
ds = pd.concat(objs=[ktrain, ktest], axis=0).reset_index(drop=True)

In [8]:
ds = ds.rename(columns={'Unnamed: 0': 'Unknown',
                                  'SeriousDlqin2yrs': 'Target',
                                  'RevolvingUtilizationOfUnsecuredLines': 'UnsecLines',
                                  'NumberOfTime30-59DaysPastDueNotWorse': 'Late3059',
                                  'DebtRatio': 'DebtRatio',
                                  'MonthlyIncome': 'MonthlyIncome',
                                  'NumberOfOpenCreditLinesAndLoans': 'OpenCredit',
                                  'NumberOfTimes90DaysLate': 'Late90',
                                  'NumberRealEstateLoansOrLines': 'PropLines',
                                  'NumberOfTime60-89DaysPastDueNotWorse': 'Late6089',
                                  'NumberOfDependents': 'Deps'})

ktrain = ktrain.rename(columns={'Unnamed: 0': 'Unknown',
                                  'SeriousDlqin2yrs': 'Target',
                                  'RevolvingUtilizationOfUnsecuredLines': 'UnsecLines',
                                  'NumberOfTime30-59DaysPastDueNotWorse': 'Late3059',
                                  'DebtRatio': 'DebtRatio',
                                  'MonthlyIncome': 'MonthlyIncome',
                                  'NumberOfOpenCreditLinesAndLoans': 'OpenCredit',
                                  'NumberOfTimes90DaysLate': 'Late90',
                                  'NumberRealEstateLoansOrLines': 'PropLines',
                                  'NumberOfTime60-89DaysPastDueNotWorse': 'Late6089',
                                  'NumberOfDependents': 'Deps'})

ktest = ktest.rename(columns={'Unnamed: 0': 'Unknown',
                                  'SeriousDlqin2yrs': 'Target',
                                  'RevolvingUtilizationOfUnsecuredLines': 'UnsecLines',
                                  'NumberOfTime30-59DaysPastDueNotWorse': 'Late3059',
                                  'DebtRatio': 'DebtRatio',
                                  'MonthlyIncome': 'MonthlyIncome',
                                  'NumberOfOpenCreditLinesAndLoans': 'OpenCredit',
                                  'NumberOfTimes90DaysLate': 'Late90',
                                  'NumberRealEstateLoansOrLines': 'PropLines',
                                  'NumberOfTime60-89DaysPastDueNotWorse': 'Late6089',
                                  'NumberOfDependents': 'Deps'})

In [9]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247976 entries, 0 to 247975
Data columns (total 12 columns):
Unknown          247976 non-null int64
Target           146473 non-null float64
UnsecLines       247976 non-null float64
age              247976 non-null int64
Late3059         247976 non-null int64
DebtRatio        247976 non-null float64
MonthlyIncome    199624 non-null float64
OpenCredit       247976 non-null int64
Late90           247976 non-null int64
PropLines        247976 non-null int64
Late6089         247976 non-null int64
Deps             241584 non-null float64
dtypes: float64(5), int64(7)
memory usage: 22.7 MB


In [10]:
ds.head()

Unnamed: 0,Unknown,Target,UnsecLines,age,Late3059,DebtRatio,MonthlyIncome,OpenCredit,Late90,PropLines,Late6089,Deps
0,1,1.0,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0.0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0.0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0.0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0.0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [11]:
#Unseclines 
ds.UnsecLines = pd.qcut(ds.UnsecLines.values, 5).codes

In [12]:
#Debt Ratio
ds.DebtRatio = pd.qcut(ds.DebtRatio.values, 5).codes

In [13]:
#Monthly Income - Filling NaN values with median
ds.MonthlyIncome = ds.MonthlyIncome.fillna(ds.MonthlyIncome.median())
ds.MonthlyIncome = pd.qcut(ds.MonthlyIncome.values, 5).codes

In [14]:
#age
ds.age = pd.qcut(ds.age.values, 5).codes

In [15]:
#open credit
ds.OpenCredit = pd.qcut(ds.OpenCredit.values, 5).codes

In [16]:
#Late 30-59 days #goruping customers 6 or more together(high sd)
for i in range(len(ds)):
    if ds.Late3059[i] >= 6:
        ds.Late3059[i] = 6

In [17]:
#Late 90 days  #goruping customers late>90 5 or more together(high sd)
for i in range(len(ds)):
    if ds.Late90[i] >= 5:
        ds.Late90[i] = 5

In [18]:
#Late 60-89 days  #goruping customers late btw 60 & 89 days 3 or more together(high sd)
for i in range(len(ds)):
    if ds.Late6089[i] >= 3:
        ds.Late6089[i] = 3

In [19]:
# Proper Lines of Income #goruping customers proper lines 6 or more together(high sd)
for i in range(len(ds)):
    if ds.PropLines[i] >= 6:
        ds.PropLines[i] = 6

In [20]:
# Total number of dependents #goruping customers with dependents 4 or more together(high sd)
ds.Deps = ds.Deps.fillna(ds.Deps.median())
for i in range(len(ds)):
    if ds.Deps[i] >= 4:
        ds.Deps[i] = 4

In [21]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247976 entries, 0 to 247975
Data columns (total 12 columns):
Unknown          247976 non-null int64
Target           146473 non-null float64
UnsecLines       247976 non-null int8
age              247976 non-null int8
Late3059         247976 non-null int64
DebtRatio        247976 non-null int8
MonthlyIncome    247976 non-null int8
OpenCredit       247976 non-null int8
Late90           247976 non-null int64
PropLines        247976 non-null int64
Late6089         247976 non-null int64
Deps             247976 non-null float64
dtypes: float64(2), int64(5), int8(5)
memory usage: 14.4 MB


In [22]:
ds.shape

(247976, 12)

In [23]:
ds = pd.get_dummies(ds, columns = ["UnsecLines"], prefix="UnsecLines")
ds = pd.get_dummies(ds, columns = ["age"], prefix="age")
ds = pd.get_dummies(ds, columns = ["Late3059"], prefix="Late3059")
ds = pd.get_dummies(ds, columns = ["DebtRatio"], prefix="DebtRatio")
ds = pd.get_dummies(ds, columns = ["MonthlyIncome"], prefix="MonthlyIncome")
ds = pd.get_dummies(ds, columns = ["OpenCredit"], prefix="OpenCredit")
ds = pd.get_dummies(ds, columns = ["Late90"], prefix="Late90")
ds = pd.get_dummies(ds, columns = ["PropLines"], prefix="PropLines")
ds = pd.get_dummies(ds, columns = ["Late6089"], prefix="Late6089")
ds = pd.get_dummies(ds, columns = ["Deps"], prefix="Deps")

In [24]:
ds.shape

(247976, 56)

In [75]:
ktrain = ds[:ktrain_len]
ktest = ds[ktrain_len:]
ktest.drop(labels=["Target"],axis = 1,inplace=True)

In [76]:
ktest.shape

(101503, 55)

In [77]:
ktest.shape

(101503, 55)

In [78]:
#Separating features and labels

ktrain["Target"] = ktrain["Target"].astype(int)

Y_train = ktrain["Target"]

X_train = ktrain.drop(labels = ["Target", "Unknown"],axis = 1)

In [79]:
Y_train.head()

0    1
1    0
2    0
3    0
4    0
Name: Target, dtype: int64

In [90]:
X_train.head()

Unnamed: 0,index,UnsecLines_0,UnsecLines_1,UnsecLines_2,UnsecLines_3,UnsecLines_4,age_0,age_1,age_2,age_3,age_4,Late3059_0,Late3059_1,Late3059_2,Late3059_3,Late3059_4,Late3059_5,Late3059_6,DebtRatio_0,DebtRatio_1,DebtRatio_2,DebtRatio_3,DebtRatio_4,MonthlyIncome_0,MonthlyIncome_1,MonthlyIncome_2,MonthlyIncome_3,MonthlyIncome_4,OpenCredit_0,OpenCredit_1,OpenCredit_2,OpenCredit_3,OpenCredit_4,Late90_0,Late90_1,Late90_2,Late90_3,Late90_4,Late90_5,PropLines_0,PropLines_1,PropLines_2,PropLines_3,PropLines_4,PropLines_5,PropLines_6,Late6089_0,Late6089_1,Late6089_2,Late6089_3,Deps_0.0,Deps_1.0,Deps_2.0,Deps_3.0,Deps_4.0
0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0
1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
2,2,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
3,3,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
4,4,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0


In [91]:
X_train = X_train[:1000]
Y_train = Y_train[:1000]
ktest = ktest[:1000] 

In [92]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

In [93]:
clf = RandomForestClassifier(n_estimators=10, max_features='sqrt')
clf = clf.fit(X_train, Y_train)

In [None]:
parameters = {'n_estimators': 1000, 'random_state' : 20}
    
model = RandomForestClassifier(**parameters)
model.fit(X_train, Y_train)

In [95]:
ktest.head()

Unnamed: 0,Unknown,UnsecLines_0,UnsecLines_1,UnsecLines_2,UnsecLines_3,UnsecLines_4,age_0,age_1,age_2,age_3,age_4,Late3059_0,Late3059_1,Late3059_2,Late3059_3,Late3059_4,Late3059_5,Late3059_6,DebtRatio_0,DebtRatio_1,DebtRatio_2,DebtRatio_3,DebtRatio_4,MonthlyIncome_0,MonthlyIncome_1,MonthlyIncome_2,MonthlyIncome_3,MonthlyIncome_4,OpenCredit_0,OpenCredit_1,OpenCredit_2,OpenCredit_3,OpenCredit_4,Late90_0,Late90_1,Late90_2,Late90_3,Late90_4,Late90_5,PropLines_0,PropLines_1,PropLines_2,PropLines_3,PropLines_4,PropLines_5,PropLines_6,Late6089_0,Late6089_1,Late6089_2,Late6089_3,Deps_0.0,Deps_1.0,Deps_2.0,Deps_3.0,Deps_4.0
146473,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
146474,2,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0
146475,3,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0
146476,4,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0
146477,5,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0


In [96]:
result = pd.read_csv("test.csv")

In [97]:
result = result.drop(["RevolvingUtilizationOfUnsecuredLines",
                             "age",
                             "NumberOfTime30-59DaysPastDueNotWorse",
                             "DebtRatio",
                             "MonthlyIncome",
                             "NumberOfOpenCreditLinesAndLoans",
                             "NumberOfTimes90DaysLate",
                             "NumberRealEstateLoansOrLines",
                             "NumberOfTime60-89DaysPastDueNotWorse",
                             "NumberOfDependents"], axis=1)

In [98]:
result.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs
0,1,
1,2,
2,3,
3,4,
4,5,


In [99]:
X_train.shape

(1000, 55)

In [100]:
ktest.shape

(1000, 55)

In [101]:
DefaultProba = model.predict_proba(ktest.drop(["Unknown"], axis=1))
DefaultProba = DefaultProba[:,1]
result.SeriousDlqin2yrs = DefaultProba

result = result.rename(columns={'Unnamed: 0': 'Id',
                                        'SeriousDlqin2yrs': 'Probability'})

ValueError: Number of features of the model must match the input. Model n_features is 55 and input n_features is 54 