# Scikit-Learn ile Kredi Tahmini

## Kütüphanelerin eklenmesi

In [1]:
import pandas as pd, numpy as np
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.model_selection import GridSearchCV

import sklearn
from sklearn_pandas import DataFrameMapper

import matplotlib.pyplot as plt

### Adım 1:
Elimizdeki veri setini yükleyelim:

In [2]:
# Load the semicolon-delimited loan dataset and preview its first ten rows.
loans = pd.read_csv(
    "krediVeriseti.csv",
    sep=";",
)
loans.head(n=10)

Unnamed: 0,krediMiktari,yas,evDurumu,aldigi_kredi_sayi,telefonDurumu,KrediDurumu
0,1169,67,evsahibi,2,var,krediver
1,5951,22,evsahibi,1,yok,verme
2,2096,49,evsahibi,1,yok,krediver
3,7882,45,kiraci,1,yok,krediver
4,4870,53,kiraci,2,yok,verme
5,9055,35,kiraci,1,var,krediver
6,2835,53,evsahibi,1,yok,krediver
7,6948,35,kiraci,1,var,krediver
8,3059,61,evsahibi,1,yok,krediver
9,5234,28,evsahibi,2,yok,verme


### Adım 2:  
Öznitelik sütunlarını ve sonuç sütununu belirleyelim:

In [3]:
# Columns used as model inputs.
features = [
    'krediMiktari',
    'yas',
    'evDurumu',
    'aldigi_kredi_sayi',
    'telefonDurumu',
]

# Column holding the outcome to predict.
result = 'KrediDurumu'

In [4]:
# Keep only the feature and target columns, then drop every row that has a
# missing value in any of them.
clean_data = loans[[*features, result]].dropna(axis=0, how='any')
clean_data.head()

Unnamed: 0,krediMiktari,yas,evDurumu,aldigi_kredi_sayi,telefonDurumu,KrediDurumu
0,1169,67,evsahibi,2,var,krediver
1,5951,22,evsahibi,1,yok,verme
2,2096,49,evsahibi,1,yok,krediver
3,7882,45,kiraci,1,yok,krediver
4,4870,53,kiraci,2,yok,verme


### Adım 3:
Sayısal ve kategorik verileri ayırıp işleyelim, ardından yeniden birleştirelim:

In [5]:
# Numeric columns are copied into the feature matrix as they are.
numerical_cols = ['krediMiktari', 'yas', 'aldigi_kredi_sayi']

# Categorical columns are binarized before they enter the feature matrix.
categorical_cols = ['evDurumu', 'telefonDurumu']

# Build the mapper from categorical_cols so the list above is the single
# source of truth; every column gets its own LabelBinarizer instance.
mapper_features = DataFrameMapper(
    [(col, sklearn.preprocessing.LabelBinarizer()) for col in categorical_cols]
)

# Encoded categorical columns (this call also fits the mapper).
X1 = mapper_features.fit_transform(clean_data)

# Numeric columns as a plain array.
X2 = np.array(clean_data[numerical_cols])

# Final feature matrix: encoded categorical columns first, numeric columns last.
X = np.hstack((X1, X2))

# Keep the fitted target encoder instead of discarding it: its classes_
# attribute records which label sits behind each 0/1 code in y.
result_encoder = sklearn.preprocessing.LabelBinarizer()
y = result_encoder.fit_transform(clean_data[result])

In [6]:
# Show the assembled feature matrix as the cell output.
X

array([[   0,    0, 1169,   67,    2],
       [   0,    1, 5951,   22,    1],
       [   0,    1, 2096,   49,    1],
       ...,
       [   0,    1,  804,   38,    1],
       [   1,    0, 1845,   23,    1],
       [   0,    1, 4576,   27,    1]], dtype=int64)

### Adım 4:
Verilerimizi train ve test olarak ayıralım:

In [7]:
# Stratified split on y: 33% of the rows are held out for testing, and the
# fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=100,
    stratify=y,
)

### Adım 5:
Modelleri hazırlayıp test verisi üzerindeki doğruluk oranlarını görelim:

In [20]:
# Libraries for the classification models and their evaluation
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier

# Seed handed to every estimator that draws random numbers, so that the
# accuracy table below is identical on every run.
MODEL_SEED = 100

# Prepare the models as (display name, estimator) pairs
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('GradientBoosting', GradientBoostingClassifier(n_estimators=100, random_state=MODEL_SEED)))
models.append(('Naive Bayes', GaussianNB()))
models.append(('Decision Tree (NoParam)', DecisionTreeClassifier(random_state=MODEL_SEED)))
models.append(('Decision Tree (GridSearch)', GridSearchCV(DecisionTreeClassifier(random_state=MODEL_SEED), {'max_depth':[5, 10, 15, 20, 25, 32]}, cv=5)))
models.append(('RandomForestClassifier (GridSearch)', GridSearchCV(RandomForestClassifier(random_state=MODEL_SEED), {'max_depth':[5, 15], 'n_estimators':[10,30]})))
models.append(('RandomForestClassifier (2 Param)', RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=MODEL_SEED)))
models.append(('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=5, metric='minkowski')))
# SVR (imported above) is not evaluated here: it is a regressor, not a classifier.
models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
models.append(('AdaBoostClassifier', AdaBoostClassifier(learning_rate=0.5, random_state=MODEL_SEED)))
models.append(('BaggingClassifier', BaggingClassifier(random_state=MODEL_SEED)))

model_name = []
acc_score = []
# Fit every model on the training split and evaluate it on the test split
for name, model in models:
    # fit() trains the estimator stored in `models` in place, so the list
    # holds fitted estimators once this loop has finished.
    # y_train is flattened to 1-D before being passed to fit().
    model = model.fit(X_train, y_train.ravel())
    y_pred = model.predict(X_test)
    model_name.append(name)
    # Accuracy is stored as a percentage.
    acc_score.append(metrics.accuracy_score(y_test, y_pred) * 100)
    print('Model: ', name)
    print('Confusion Matrix: \n', metrics.confusion_matrix(y_test, y_pred))
    # Detailed per-class classification report
    report = classification_report(y_test, y_pred)
    print('\n ', report)

# Summary table: one row per model with its accuracy percentage
col = {'Model': model_name, 'Oran': acc_score}
comp = pd.DataFrame(data=col)
comp

Model:  Logistic Regression
Confusion Matrix: 
 [[228   3]
 [ 87  12]]

               precision    recall  f1-score   support

          0       0.72      0.99      0.84       231
          1       0.80      0.12      0.21        99

avg / total       0.75      0.73      0.65       330

Model:  GradientBoosting
Confusion Matrix: 
 [[208  23]
 [ 82  17]]

               precision    recall  f1-score   support

          0       0.72      0.90      0.80       231
          1       0.42      0.17      0.24        99

avg / total       0.63      0.68      0.63       330

Model:  Naive Bayes
Confusion Matrix: 
 [[216  15]
 [ 83  16]]

               precision    recall  f1-score   support

          0       0.72      0.94      0.82       231
          1       0.52      0.16      0.25        99

avg / total       0.66      0.70      0.64       330

Model:  Decision Tree (NoParam)
Confusion Matrix: 
 [[169  62]
 [ 60  39]]

               precision    recall  f1-score   support

          0 

Unnamed: 0,Model,Oran
0,Logistic Regression,72.727273
1,GradientBoosting,68.181818
2,Naive Bayes,70.30303
3,Decision Tree (NoParam),63.030303
4,Decision Tree (GridSearch),66.666667
5,RandomForestClassifier (GridSearch),69.69697
6,RandomForestClassifier (2 Param),67.575758
7,KNeighborsClassifier,66.969697
8,LinearDiscriminantAnalysis,72.727273
9,AdaBoostClassifier,69.69697


In [25]:
import pickle

# Persist the models list, the training split and the feature mapper,
# one pickle file per object, written in the order listed here.
for file_name, obj in (
    ('models.pickle', models),
    ('X_train.pickle', X_train),
    ('y_train.pickle', y_train),
    ('mapper_features.pickle', mapper_features),
):
    with open(file_name, 'wb') as output:
        pickle.dump(obj, output)

### Adım 6:
Şimdi yeni bir veri ile sonucu görelim:

In [22]:
def preProcess(a):
    """Turn one record into a feature row laid out like the training matrix.

    Parameters
    ----------
    a : dict
        Maps column name to value for a single record. It has to provide the
        columns consumed by the global ``mapper_features`` and every column
        listed in the global ``numerical_cols``.

    Returns
    -------
    numpy.ndarray
        One-row array: the output of ``mapper_features.transform`` followed
        by the numeric columns, stacked horizontally.
    """
    # Single-row frame whose columns follow the key order of the dict.
    row = pd.DataFrame(data=[list(a.values())], columns=list(a.keys()))

    encoded = mapper_features.transform(row)
    numeric = row[numerical_cols]
    return np.hstack((encoded, numeric))

In [23]:
# Example record to score: three numeric fields and two categorical fields.
sample_data = {
    'krediMiktari': 5000,
    'yas': 50,
    'aldigi_kredi_sayi': 5,
    'evDurumu': 'evsahibi',
    'telefonDurumu': 'var',
}

# Encode the record into a one-row feature array and show it.
sample_result = preProcess(sample_data)
sample_result

array([[   0,    0, 5000,   50,    5]], dtype=int64)

In [24]:
# Not used by the code in this cell; kept so that any later cell relying on
# these names after running this one keeps working.
from sklearn import metrics
from sklearn.metrics import classification_report

model_fitted_name = []
acc_score_model = []
# Score the sample with the estimators that Adım 5 already fitted in place.
# They are deliberately NOT refitted here: refitting repeats both grid
# searches and, for estimators without a fixed random_state, would score the
# sample with a different model than the one evaluated and pickled earlier.
for name, model in models:
    model_fitted_name.append(name)
    # Column 0 of predict_proba is the probability of the first class (code 0),
    # stored as a percentage.
    # NOTE(review): this presumes code 0 stands for "krediver" (LabelBinarizer
    # assigns codes in sorted label order) - confirm before trusting 'Karar'.
    acc_score_model.append(model.predict_proba(sample_result)[0, 0] * 100)

columns = {'Model': model_fitted_name, 'Oran': acc_score_model}
results = pd.DataFrame(data=columns)


def karar_ver(oran, esik=75):
    """Decide whether a percentage reaches the approval threshold.

    Parameters
    ----------
    oran : float
        Percentage to test (0-100 scale, as stored in the 'Oran' column).
    esik : float, optional
        Minimum percentage that counts as approval; defaults to 75.

    Returns
    -------
    bool
        True when ``oran`` is greater than or equal to ``esik``.
    """
    return oran >= esik


results['Karar'] = results['Oran'].apply(karar_ver)
results
# Alternative view: results.sort_values('Oran', ascending=False)

Unnamed: 0,Model,Oran,Karar
0,Logistic Regression,89.831434,True
1,GradientBoosting,86.662286,True
2,Naive Bayes,90.075094,True
3,Decision Tree (NoParam),100.0,True
4,Decision Tree (GridSearch),83.333333,True
5,RandomForestClassifier (GridSearch),74.93525,False
6,RandomForestClassifier (2 Param),80.0,True
7,KNeighborsClassifier,40.0,False
8,LinearDiscriminantAnalysis,88.415535,True
9,AdaBoostClassifier,50.94742,False


## Sonuç :
Eşik değerini %75 olarak belirledik: "krediver" olasılığı %75 ve üzerinde olan modellerde karar True, yani "krediver" olur. Yukarıdaki tabloda 10 modelin 7'si bu örnek için True sonucunu vermiştir.