In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [57]:
import warnings
from sklearn.exceptions import FitFailedWarning
warnings.filterwarnings("ignore", category=FitFailedWarning)

In [58]:
df = pd.read_csv("../datasets/6-bank_customers.csv")

In [59]:
df.columns

Index(['age', 'job_satisfaction', 'balance', 'duration_last_call',
       'num_contacts_last_month', 'has_housing_loan', 'has_personal_loan',
       'communication_type', 'days_since_last_contact',
       'campaign_response_score', 'subscribed'],
      dtype='object')

In [60]:
df.head()

Unnamed: 0,age,job_satisfaction,balance,duration_last_call,num_contacts_last_month,has_housing_loan,has_personal_loan,communication_type,days_since_last_contact,campaign_response_score,subscribed
0,-0.377957,1.043895,1.043494,-0.101838,-1.617442,0.402713,0.913601,-0.067192,0.175471,-1.049646,0
1,-0.325259,1.276263,-0.686123,-2.463205,-0.489426,-0.240715,-1.469496,1.006633,-0.833692,0.957744,0
2,0.739019,-0.600903,-0.177294,1.335714,-0.817332,-0.790047,1.457365,-0.218981,0.878643,-1.25774,0
3,0.474312,-1.103002,1.189936,-0.800186,0.912377,-0.406451,-1.13095,1.985111,1.379029,1.041768,1
4,0.927365,1.114796,0.080284,1.261064,0.761179,0.921563,0.440832,0.184645,-1.567739,-0.142107,1


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      1000 non-null   float64
 1   job_satisfaction         1000 non-null   float64
 2   balance                  1000 non-null   float64
 3   duration_last_call       1000 non-null   float64
 4   num_contacts_last_month  1000 non-null   float64
 5   has_housing_loan         1000 non-null   float64
 6   has_personal_loan        1000 non-null   float64
 7   communication_type       1000 non-null   float64
 8   days_since_last_contact  1000 non-null   float64
 9   campaign_response_score  1000 non-null   float64
 10  subscribed               1000 non-null   int64  
dtypes: float64(10), int64(1)
memory usage: 86.1 KB


In [62]:
X = df.drop("subscribed", axis=1)
y = df["subscribed"]

In [63]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [64]:
from sklearn.linear_model import LogisticRegression

model= LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [65]:
y_pred # customers who subscribed or not / binary

array([0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0])

In [66]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# classification report: precision, recall, f1-score

In [67]:
score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {score:.2f}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

Accuracy: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       157
           1       0.92      0.90      0.91       143

    accuracy                           0.92       300
   macro avg       0.92      0.92      0.92       300
weighted avg       0.92      0.92      0.92       300

Confusion Matrix:
[[146  11]
 [ 14 129]]


### Hyperparameter Tuning

In [68]:
model = LogisticRegression()

Logistic Regression parametrelerini sırayla deneyip hangisinin en iyi olduğunu bulmaya çalışacağız
* https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [69]:
c_values = [0.01, 0.1, 1, 10, 100] # Regularization strength

* Düşük C değerleri(0.01,0.1) -> Güçlü düzenlileştirme anlamına gelir.Modelin katsayılarına büyük bir ceza uygulanır, bu da katsayıları sıfıra doğru daha çok çeker. Model daha basit hale gelir ve aşırı öğrenme riski azalır. Ancak çok düşük bir C değeri, modelin veriyi yeterince öğrenememesine (underfitting) neden olabilir.

* Yüksek C Değeri (örn: 10, 100):
Zayıf düzenlileştirme anlamına gelir.
Modelin katsayılarına daha az ceza uygulanır.
Model, eğitim verisindeki karmaşık ilişkileri öğrenmekte daha özgürdür. Bu durum, modelin aşırı öğrenme (overfitting) riskini artırabilir.

* solver (Optimizasyon Algoritması):
Bu parametre, modelin en uygun katsayıları bulmak için kullanacağı optimizasyon algoritmasını belirler. Her algoritmanın farklı avantajları vardır ve hepsi her penalty türüyle uyumlu değildir


* Hiperparametreler GridSearchCV'de kullanılacağı için dictionary formatında tanımlıyoruz
* GridSearchCV: en iyi hiperparametreleri bulmak için kullanılır

In [70]:
param_grid = [
    # 1. Grup: 'l1' penalty'si ile uyumlu solver'lar
    {
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'], # 'l1' sadece bu ikisiyle çalışır
        'C': c_values
    },
    # 2. Grup: 'l2' penalty'si ile uyumlu solver'lar
    {
        'penalty': ['l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], # 'l2' hepsiyle çalışır
        'C': c_values
    },
    # 3. Grup: 'elasticnet' penalty'si ile uyumlu solver'lar
    {
        'penalty': ['elasticnet'],
        'solver': ['saga'], # 'elasticnet' sadece 'saga' ile çalışır
        'C': c_values
    }
]

### Fine Tune -> GridSearchCV

In [71]:
from sklearn.model_selection import GridSearchCV,StratifiedKFold

In [72]:
cv = StratifiedKFold() 
# GridSearchCV cross validation için bizden StratifiedKFold türünden bekliyor 

In [73]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring="accuracy", n_jobs=-1, verbose=0)

In [74]:
grid

0,1,2
,estimator,LogisticRegression()
,param_grid,"[{'C': [0.01, 0.1, ...], 'penalty': ['l1'], 'solver': ['liblinear', 'saga']}, {'C': [0.01, 0.1, ...], 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', ...]}, ...]"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo...shuffle=False)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [75]:
grid.fit(X_train, y_train)

 0.91285714 0.91285714 0.91285714 0.91285714 0.91285714 0.91285714
 0.92285714 0.91285714 0.91285714 0.91285714 0.91285714 0.91857143
 0.91285714 0.91285714 0.91142857 0.91142857 0.91142857 0.91142857
 0.91142857 0.91285714 0.91285714 0.91285714 0.91285714 0.91285714
 0.91285714 0.91285714 0.91285714 0.91285714 0.91285714        nan
        nan        nan        nan        nan]


0,1,2
,estimator,LogisticRegression()
,param_grid,"[{'C': [0.01, 0.1, ...], 'penalty': ['l1'], 'solver': ['liblinear', 'saga']}, {'C': [0.01, 0.1, ...], 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', ...]}, ...]"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo...shuffle=False)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,0.01
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,100


In [76]:
grid.best_params_

{'C': 0.01, 'penalty': 'l1', 'solver': 'saga'}

In [77]:
grid.best_score_

np.float64(0.9242857142857142)

In [78]:
y_pred = grid.predict(X_test)

In [79]:
score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {score:.2f}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

Accuracy: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.93       157
           1       0.94      0.89      0.91       143

    accuracy                           0.92       300
   macro avg       0.92      0.92      0.92       300
weighted avg       0.92      0.92      0.92       300

Confusion Matrix:
[[149   8]
 [ 16 127]]


### Fine Tune -> Random Search CV

In [80]:
from sklearn.model_selection import RandomizedSearchCV

In [81]:
model2 = LogisticRegression()

In [82]:
random_cv = RandomizedSearchCV(model2, param_distributions=param_grid, n_iter=100, cv=cv, verbose=0, random_state=42, n_jobs=-1)

In [83]:
random_cv.fit(X_train, y_train)

 0.91285714 0.91285714 0.91285714 0.91285714 0.91285714 0.91285714
 0.92285714 0.91285714 0.91285714 0.91285714 0.91285714 0.91857143
 0.91285714 0.91285714 0.91142857 0.91142857 0.91142857 0.91142857
 0.91142857 0.91285714 0.91285714 0.91285714 0.91285714 0.91285714
 0.91285714 0.91285714 0.91285714 0.91285714 0.91285714        nan
        nan        nan        nan        nan]


0,1,2
,estimator,LogisticRegression()
,param_distributions,"[{'C': [0.01, 0.1, ...], 'penalty': ['l1'], 'solver': ['liblinear', 'saga']}, {'C': [0.01, 0.1, ...], 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', ...]}, ...]"
,n_iter,100
,scoring,
,n_jobs,-1
,refit,True
,cv,StratifiedKFo...shuffle=False)
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,0.01
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,100


In [84]:
random_cv.best_params_

{'solver': 'saga', 'penalty': 'l1', 'C': 0.01}

In [85]:
random_cv.best_score_

np.float64(0.9242857142857142)

In [86]:
y_pred = random_cv.predict(X_test)

In [87]:
score = accuracy_score(y_test, y_pred)
print(f"Accuracy: {score:.2f}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

Accuracy: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.95      0.93       157
           1       0.94      0.89      0.91       143

    accuracy                           0.92       300
   macro avg       0.92      0.92      0.92       300
weighted avg       0.92      0.92      0.92       300

Confusion Matrix:
[[149   8]
 [ 16 127]]


* verbose=0: Hiçbir ilerleme raporu gösterme. Sessiz mod.
* verbose=1: Ne kadar deneme yapıldığını ve ne kadar sürdüğünü göster.
* verbose=2: Hangi parametrelerin denendiğini ve her birinin skorunu göster.