# Model Development
# Detecting Outliers & Removing them

In [1]:
#importing necessary 
import pandas as pd
import numpy as np
import re
import sklearn
import seaborn as sns
from collections import Counter
import warnings
import pickle
warnings.filterwarnings('ignore')

sns.set(style='white', context='notebook', palette='deep')
pd.options.display.max_columns = 100

In [2]:
ktrain = pd.read_csv("train.csv") #train kaggle ds
ktest = pd.read_csv("test.csv")   #test  kaggle ds

In [3]:
def d_out(df,n,feat):
    out_indi = []   #to store the indices of all the outliers found
    
    #Iterating over columns of features to calculate quartile 
    for c in feat:
        Q1 = np.percentile(df[c], 25)  #  25%
        Q3 = np.percentile(df[c],75)   #  75%
        IQR = Q3 - Q1                  # Interquartile range 
        
        out_step = 1.5 * IQR        # finding outlier step
        out_list_c = df[(df[c] < Q1 - out_step) | (df[c] > Q3 + out_step )].index   # Determining a list of outliers
        # append found indices for col to the list of outlier indices 
        out_indi.extend(out_list_c)
        
    # select observations containing more than 2 outliers
    out_indi = Counter(out_indi)        
    multi_out = list( k for k, v in out_indi.items() if v > n )
    return multi_out
#FInding outliers for all the other features with numeric values 
Out_drop = d_out(ktrain,2,["RevolvingUtilizationOfUnsecuredLines",
                                            "age",
                                            "NumberOfTime30-59DaysPastDueNotWorse",
                                            "DebtRatio",
                                            "MonthlyIncome",
                                            "NumberOfOpenCreditLinesAndLoans",
                                            "NumberOfTimes90DaysLate",
                                            "NumberRealEstateLoansOrLines",
                                            "NumberOfTime60-89DaysPastDueNotWorse",
                                            "Unnamed: 0",
                                            "NumberOfDependents"])

In [4]:
ktrain.loc[Out_drop]

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
900,901,0,2.258964,33,2,0.032484,2000.0,1,1,0,0,2.0
919,920,1,1.362142,73,1,274.000000,,2,3,0,1,
1958,1959,0,1.634551,28,0,151.000000,,1,3,0,2,0.0
4852,4853,0,1.566866,46,2,0.104983,6000.0,3,2,0,0,0.0
5956,5957,0,2.237856,60,6,2597.000000,1.0,16,1,2,1,4.0
6251,6252,1,1.481038,26,0,22.000000,,1,0,0,1,0.0
6703,6704,0,1.421927,50,1,0.058003,4085.0,2,0,0,1,2.0
6907,6908,0,1.548094,24,0,37.000000,,3,0,0,2,
7079,7080,0,1.485050,66,1,1049.000000,,3,0,0,0,0.0
7352,7353,1,1.650560,35,1,1596.000000,,13,0,0,0,2.0


In [5]:
ktrain = ktrain.drop(Out_drop, axis = 0).reset_index(drop=True) 
#Dropping the outliers found. We can see that we have 3527 rows of outliers present in our ds.

In [6]:
ktrainlen = len(ktrain)
ds =  pd.concat(objs=[ktrain, ktest], axis=0).reset_index(drop=True) #Concating test & train ds

In [7]:
#Renaming 
ds = ds.rename(columns={'Unnamed: 0': 'Unknown',
                                  'SeriousDlqin2yrs': 'Target',
                                  'RevolvingUtilizationOfUnsecuredLines': 'UnsecLines',
                                  'NumberOfTime30-59DaysPastDueNotWorse': 'Late3059',
                                  'DebtRatio': 'DebtRatio',
                                  'MonthlyIncome': 'MonthlyIncome',
                                  'NumberOfOpenCreditLinesAndLoans': 'OpenCredit',
                                  'NumberOfTimes90DaysLate': 'Late90',
                                  'NumberRealEstateLoansOrLines': 'PropLines',
                                  'NumberOfTime60-89DaysPastDueNotWorse': 'Late6089',
                                  'NumberOfDependents': 'Deps'})

ktrain = ktrain.rename(columns={'Unnamed: 0': 'Unknown',
                                  'SeriousDlqin2yrs': 'Target',
                                  'RevolvingUtilizationOfUnsecuredLines': 'UnsecLines',
                                  'NumberOfTime30-59DaysPastDueNotWorse': 'Late3059',
                                  'DebtRatio': 'DebtRatio',
                                  'MonthlyIncome': 'MonthlyIncome',
                                  'NumberOfOpenCreditLinesAndLoans': 'OpenCredit',
                                  'NumberOfTimes90DaysLate': 'Late90',
                                  'NumberRealEstateLoansOrLines': 'PropLines',
                                  'NumberOfTime60-89DaysPastDueNotWorse': 'Late6089',
                                  'NumberOfDependents': 'Deps'})

ktest = ktest.rename(columns={'Unnamed: 0': 'Unknown',
                                  'SeriousDlqin2yrs': 'Target',
                                  'RevolvingUtilizationOfUnsecuredLines': 'UnsecLines',
                                  'NumberOfTime30-59DaysPastDueNotWorse': 'Late3059',
                                  'DebtRatio': 'DebtRatio',
                                  'MonthlyIncome': 'MonthlyIncome',
                                  'NumberOfOpenCreditLinesAndLoans': 'OpenCredit',
                                  'NumberOfTimes90DaysLate': 'Late90',
                                  'NumberRealEstateLoansOrLines': 'PropLines',
                                  'NumberOfTime60-89DaysPastDueNotWorse': 'Late6089',
                                  'NumberOfDependents': 'Deps'})

# Dataset Cleansing & Normalization 

In [8]:
ds.UnsecLines = pd.qcut(ds.UnsecLines.values, 5).codes
#using qcut to divide Unsecured Lines values into bins of 5 

In [9]:
ds.age = pd.qcut(ds.age.values, 5).codes
#using qcut to divide age values into bins of 5 

In [10]:
for i in range(len(ds)):
    if ds.Late3059[i] >= 6:
        ds.Late3059[i] = 6
#Due to high sd of data grouping all the data valus over 6 as 6

In [11]:
ds.DebtRatio = pd.qcut(ds.DebtRatio.values, 5).codes 
#using qcut to divide Debt Ratio values into bins of 5 

In [12]:
ds.MonthlyIncome.isnull().sum() 
#Checking for NaN values in Monthly Income Column 

48352

In [13]:
ds.MonthlyIncome = ds.MonthlyIncome.fillna(ds.MonthlyIncome.median())
#Fillin NaN values of Monthly Income with Median Value 

In [14]:
ds.MonthlyIncome = pd.qcut(ds.MonthlyIncome.values, 5).codes
#using qcut to divide Debt Ratio values into bins of 5

In [15]:
ds.OpenCredit = pd.qcut(ds.OpenCredit.values, 5).codes
#using qcut to divide Debt Ratio values into bins of 5

In [16]:
for i in range(len(ds)):
    if ds.Late90[i] >= 5:
        ds.Late90[i] = 5
#Due to high sd of data grouping all the data valus over 5 as 5

In [17]:
for i in range(len(ds)):
    if ds.PropLines[i] >= 6:
        ds.PropLines[i] = 6
#Due to high sd of data grouping all the data valus over 6 as 6

In [18]:
for i in range(len(ds)):
    if ds.Late6089[i] >= 3:
        ds.Late6089[i] = 3
#Due to high sd of data grouping all the data valus over 3 as 3

In [19]:
ds.Deps = ds.Deps.fillna(ds.Deps.median())
#Fillin NaN values of Dependents with Median Value

In [20]:
ds.Deps.isnull().sum() #Checking for NaN

0

In [21]:
for i in range(len(ds)):
    if ds.Deps[i] >= 4:
        ds.Deps[i] = 4
#Due to high sd of data grouping all the data valus over 4 as 4

In [22]:
ds.head()

Unnamed: 0,Unknown,Target,UnsecLines,age,Late3059,DebtRatio,MonthlyIncome,OpenCredit,Late90,PropLines,Late6089,Deps
0,1,1.0,4,1,2,3,4,4,0,6,0,2.0
1,2,0.0,4,1,0,0,0,0,0,0,0,1.0
2,3,0.0,3,0,1,0,0,0,1,0,0,0.0
3,4,0.0,2,0,0,0,0,1,0,0,0,0.0
4,5,0.0,4,2,1,0,4,1,0,1,0,0.0


In [23]:
#using get_dummies fucntion to convert categorical variable into dummy/indicator variables
ds = pd.get_dummies(ds, columns = ["UnsecLines"], prefix="UnsecLines")
ds = pd.get_dummies(ds, columns = ["age"], prefix="age")
ds = pd.get_dummies(ds, columns = ["Late3059"], prefix="Late3059")
ds = pd.get_dummies(ds, columns = ["DebtRatio"], prefix="DebtRatio")
ds = pd.get_dummies(ds, columns = ["MonthlyIncome"], prefix="MonthlyIncome")
ds = pd.get_dummies(ds, columns = ["OpenCredit"], prefix="OpenCredit")
ds = pd.get_dummies(ds, columns = ["Late90"], prefix="Late90")
ds = pd.get_dummies(ds, columns = ["PropLines"], prefix="PropLines")
ds = pd.get_dummies(ds, columns = ["Late6089"], prefix="Late6089")
ds = pd.get_dummies(ds, columns = ["Deps"], prefix="Deps")

In [24]:
#Dividing training & testing data
#dropping the target values from test data
ktrain = ds[:ktrainlen]
ktest = ds[ktrainlen:]
ktest.drop(labels=["Target"],axis = 1,inplace=True)

In [25]:
ktest.shape

(101503, 55)

In [26]:
#Dividing label values & train feature values
ktrain["Target"] = ktrain["Target"].astype(int)
Yt = ktrain["Target"]
Xt = ktrain.drop(labels = ["Target", "Unknown"],axis = 1)

In [27]:
Xt = Xt[:1000]
Yt = Yt[:1000]
ktest = ktest[:1000] 

# Model fitting 

In [28]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib 
from sklearn.metrics import accuracy_score

In [29]:
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt') 
clf = clf.fit(Xt, Yt)

In [30]:
parameters = {'n_estimators': 1000, 'random_state' : 20}
model = RandomForestClassifier(**parameters)
model.fit(Xt, Yt)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=20, verbose=0,
                       warm_start=False)

In [31]:
results_df = pd.read_csv("test.csv")

In [32]:
results_df = results_df.drop(["RevolvingUtilizationOfUnsecuredLines",
                             "age",
                             "NumberOfTime30-59DaysPastDueNotWorse",
                             "DebtRatio",
                             "MonthlyIncome",
                             "NumberOfOpenCreditLinesAndLoans",
                             "NumberOfTimes90DaysLate",
                             "NumberRealEstateLoansOrLines",
                             "NumberOfTime60-89DaysPastDueNotWorse",
                             "NumberOfDependents"], axis=1)

In [38]:
#joblib.dump(model, 'Saved_Model.pk1')
type(Yt.values)

numpy.ndarray

In [34]:
DefP = model.predict_proba(ktest.drop(["Unknown"], axis=1)) #Predicitng the final score
DefP = DefP[:,1]
#results_df.SeriousDlqin2yrs = DefP

In [45]:
print(accuracy_score(Yt.values, DefP)

SyntaxError: unexpected EOF while parsing (<ipython-input-45-1eff03617c0f>, line 1)

In [None]:
print(DefP)

In [None]:
#results_df = results_df.rename(columns={'Unnamed: 0': 'Id',
                                        'SeriousDlqin2yrs': 'Probability'})

In [42]:
#results_df.to_csv("KAGGLE_CREDIT_SCORE.csv", index=False)
Yt.head()

0    1
1    0
2    0
3    0
4    0
Name: Target, dtype: int64