In [1]:
import pandas as pd

# Загрузка данных из CSV файла

df = pd.read_csv(r'bank.csv', sep=';')

print(df.head())

   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unknown  no  
4   unknown    5   may       226         1     -1         0  unknown  no  


In [2]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB
None


In [3]:
print(df.isna().sum() / len(df) * 100)

age          0.0
job          0.0
marital      0.0
education    0.0
default      0.0
balance      0.0
housing      0.0
loan         0.0
contact      0.0
day          0.0
month        0.0
duration     0.0
campaign     0.0
pdays        0.0
previous     0.0
poutcome     0.0
y            0.0
dtype: float64


In [4]:
df['default_encoded'] = df['default'].map({'no' : 0, 'yes' : 1})
df = df.drop_duplicates()

for col in df.columns:
    if df[col].dtype == "object":
        df[col] = df[col].fillna(df[col].mode()[0])
    else:
        df[col] = df[col].fillna(df[col].median())

df = df.drop(columns=['default', 'y'], axis=1)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              4521 non-null   int64 
 1   job              4521 non-null   object
 2   marital          4521 non-null   object
 3   education        4521 non-null   object
 4   balance          4521 non-null   int64 
 5   housing          4521 non-null   object
 6   loan             4521 non-null   object
 7   contact          4521 non-null   object
 8   day              4521 non-null   int64 
 9   month            4521 non-null   object
 10  duration         4521 non-null   int64 
 11  campaign         4521 non-null   int64 
 12  pdays            4521 non-null   int64 
 13  previous         4521 non-null   int64 
 14  poutcome         4521 non-null   object
 15  default_encoded  4521 non-null   int64 
dtypes: int64(8), object(8)
memory usage: 565.3+ KB
None


In [5]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

label_encoders = {}
for col in df.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop(columns='default_encoded', axis=1)
y = df['default_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, make_scorer

model = LogisticRegression(max_iter=1000)
roc_auc = make_scorer(roc_auc_score, needs_proba=True)

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)

roc_auc_lr = roc_auc_score(y_test, y_pred)

print(classification_report(y_test, y_pred, zero_division=0.0))
print(f'Логистическая регрессия без CV \nROC-AUC: {roc_auc_lr:.4f}')

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1334
           1       0.00      0.00      0.00        23

    accuracy                           0.98      1357
   macro avg       0.49      0.50      0.50      1357
weighted avg       0.97      0.98      0.97      1357

Логистическая регрессия без CV 
ROC-AUC: 0.5000


In [7]:
import numpy as np

cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='roc_auc')
print(f"Средний ROC AUC на кросс-валидации: {np.mean(cv_scores):.4f}")

Средний ROC AUC на кросс-валидации: 0.8253


In [8]:
from sklearn.svm import SVC

svm = SVC(kernel='rbf', probability=True, random_state=42)

svm.fit(X_train_scaled, y_train)

y_svm_pred = svm.predict(X_test_scaled)
y_svm_prob = svm.predict_proba(X_test_scaled)

roc_auc_svm = roc_auc_score(y_test, y_svm_pred)

print(classification_report(y_test, y_pred, zero_division=0.0))
print(f'Логистическая регрессия без CV \nROC-AUC: {roc_auc_svm:.4f}')


              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1334
           1       0.00      0.00      0.00        23

    accuracy                           0.98      1357
   macro avg       0.49      0.50      0.50      1357
weighted avg       0.97      0.98      0.97      1357

Логистическая регрессия без CV 
ROC-AUC: 0.5000


In [9]:
cv_scores = cross_val_score(svm, X_train_scaled, y_train, cv=5, scoring='roc_auc')
print(f"Средний ROC AUC на кросс-валидации: {np.mean(cv_scores):.4f}")

Средний ROC AUC на кросс-валидации: 0.6660


In [10]:
# Логистическая регрессия
roc_auc_lr = roc_auc_score(y_test, y_pred)
cv_scores_lr = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='roc_auc')
cv_mean_lr = np.mean(cv_scores_lr)

# SVM
roc_auc_svm = roc_auc_score(y_test, y_svm_pred)
cv_scores_svm = cross_val_score(svm, X_train_scaled, y_train, cv=5, scoring='roc_auc')
cv_mean_svm = np.mean(cv_scores_svm)

# Таблица сравнения
results = pd.DataFrame({
    'Модель': ['Logistic Regression', 'SVM'],
    'ROC-AUC (без CV)': [roc_auc_lr, roc_auc_svm],
    'ROC-AUC (CV)': [cv_mean_lr, cv_mean_svm]
})

print(results)

                Модель  ROC-AUC (без CV)  ROC-AUC (CV)
0  Logistic Regression               0.5      0.825309
1                  SVM               0.5      0.665970
