In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif

In [45]:
# Load the raw dataset; the file is semicolon-separated rather than comma-separated.
df = pd.read_csv('bank-full.csv', sep=';')

In [46]:
# Columns kept for the analysis; 'y' is the target that is split off later.
features = [
    'age', 'job', 'marital', 'education', 'balance',
    'housing', 'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome', 'y',
]
df = df[features]

### 1 задание:

In [47]:
# mode() returns a Series (it can hold several values on a tie); the first entry is used.
education_mode = df['education'].mode().iloc[0]
print(f"Самое частое значение в education: {education_mode}")


Самое частое значение в education: secondary


### 2 задание:

In [48]:
# Pairwise Pearson correlation (the pandas default) between the numeric columns.
numerical_cols = [
    'age', 'balance', 'day', 'duration',
    'campaign', 'pdays', 'previous',
]
corr_matrix = df[numerical_cols].corr(method='pearson')
print(corr_matrix)

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000


In [49]:
# Scan the strict upper triangle of the correlation matrix (each unordered pair
# once, diagonal excluded) and keep the pair with the largest absolute correlation.
max_corr = 0
best_pair = None

corr_columns = corr_matrix.columns
n_columns = len(corr_columns)
upper_triangle = (
    (row, col)
    for row in range(n_columns)
    for col in range(row + 1, n_columns)
)

for row, col in upper_triangle:
    strength = abs(corr_matrix.iloc[row, col])
    # Strict '>' keeps the first pair encountered when several pairs tie.
    if strength > max_corr:
        max_corr, best_pair = strength, (corr_columns[row], corr_columns[col])

print(f"Наибольшая корреляция между: {best_pair} = {max_corr:.3f}")

Наибольшая корреляция между: ('pdays', 'previous') = 0.455


In [50]:
# Encode the target as 0/1.
# The guard makes the cell safe to re-run: mapping an already-numeric column
# through {'yes': 1, 'no': 0} would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = df['y'].map({'yes': 1, 'no': 0})
assert df['y'].notna().all(), "target 'y' contains values other than 'yes'/'no'"

X = df.drop(columns='y')
y = df['y']

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (= 20% of the full data) for validation.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# The three parts must account for every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

print(f"Обучающая: {X_train.shape}\nПроверочная: {X_val.shape}\nТестовая: {X_test.shape}")

Обучающая: (27126, 14)
Проверочная: (9042, 14)
Тестовая: (9043, 14)


### 3 задание:

In [53]:
categorical_cols = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

# Integer-code the categorical columns of the training split only.
# A dedicated name is used so this frame never overwrites `X_train_encoded`,
# which the modelling cells use for the one-hot encoded matrix.
X_train_label_encoded = X_train[categorical_cols].copy()
for col in categorical_cols:
    X_train_label_encoded[col] = LabelEncoder().fit_transform(X_train[col].astype(str))

# discrete_features=True is required here: the default ('auto') treats a dense
# matrix as continuous and applies a nearest-neighbour estimator, which is not
# meaningful for arbitrary integer codes of categories.
mi_scores = mutual_info_classif(
    X_train_label_encoded,
    y_train,
    discrete_features=True,
    random_state=42,
)

# One row per categorical feature, highest mutual information first.
mi_df = (
    pd.DataFrame({'feature': categorical_cols, 'score': mi_scores})
    .sort_values('score', ascending=False)
)

best_mi_feature = mi_df.iloc[0]['feature']
print(f"Наибольшая mutual information у: {best_mi_feature}")

Наибольшая mutual information у: poutcome


### 4 задание:

In [54]:
# One-hot encode the training split; drop_first removes one baseline level per
# categorical feature.
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)

# Encode validation with ALL levels, then align it to the training columns.
# Encoding both splits with drop_first and intersecting the columns is unsafe:
# each split may drop a different baseline level, and any level missing from
# one split silently removes that column from the model. Reindexing keeps
# exactly the columns the model is trained on; levels absent from validation
# become all-zero columns, and levels unseen in training are discarded.
X_val_encoded = (
    pd.get_dummies(X_val, columns=categorical_cols)
    .reindex(columns=X_train_encoded.columns, fill_value=0)
)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

# Baseline validation accuracy; later cells compare against this value.
y_val_pred = model.predict(X_val_encoded)
accuracy = accuracy_score(y_val, y_val_pred)

print(f"Точность на проверочной выборке: {accuracy:.2f}")

Точность на проверочной выборке: 0.90


### 5 задание:

In [55]:
# Leave-one-feature-out check: retrain without each listed feature and record
# how far validation accuracy moves from the baseline `accuracy` computed by
# the full model.
features_to_remove = ['age', 'balance', 'marital', 'previous']
differences = {}

for feature in features_to_remove:
    X_train_reduced = X_train.drop(columns=feature)
    X_val_reduced = X_val.drop(columns=feature)

    # The removed feature may itself be categorical (e.g. 'marital'), so it
    # must not be passed to get_dummies.
    reduced_categorical = [col for col in categorical_cols if col != feature]

    X_train_reduced_encoded = pd.get_dummies(
        X_train_reduced, columns=reduced_categorical, drop_first=True
    )
    # Encode validation with all levels and align it to the training columns.
    # Running drop_first on each split and intersecting the columns can drop a
    # different baseline level per split, or lose columns for levels missing
    # from one split.
    X_val_reduced_encoded = (
        pd.get_dummies(X_val_reduced, columns=reduced_categorical)
        .reindex(columns=X_train_reduced_encoded.columns, fill_value=0)
    )

    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced_encoded, y_train)

    y_val_pred_reduced = model_reduced.predict(X_val_reduced_encoded)
    accuracy_reduced = accuracy_score(y_val, y_val_pred_reduced)

    # Signed difference: positive means removing the feature hurt accuracy.
    diff = accuracy - accuracy_reduced
    differences[feature] = diff

# The feature whose removal changes accuracy the least, in absolute terms.
min_diff_feature = min(differences, key=lambda x: abs(differences[x]))
print(f"Наименьшая разница у признака: {min_diff_feature}")

Наименьшая разница у признака: balance


### 6 задание:

In [56]:
# Candidate inverse-regularisation strengths, evaluated on the validation split
# using the one-hot encoded matrices built for the baseline model.
C_values = [0.01, 0.1, 1, 10]

# First pass: fit one model per C and record its validation accuracy.
val_accuracy_by_C = {}
for C in C_values:
    model_reg = LogisticRegression(C=C, solver='liblinear', max_iter=1000, random_state=42)
    model_reg.fit(X_train_encoded, y_train)

    y_val_pred_reg = model_reg.predict(X_val_encoded)
    accuracy_reg = accuracy_score(y_val, y_val_pred_reg)
    val_accuracy_by_C[C] = accuracy_reg

# Second pass: pick the winner. The strict '>' means the earliest (smallest) C
# wins when accuracies tie, and best_C stays None if no score exceeds 0.
best_C = None
best_accuracy = 0
for candidate_C, candidate_accuracy in val_accuracy_by_C.items():
    if candidate_accuracy > best_accuracy:
        best_C, best_accuracy = candidate_C, candidate_accuracy

print(f"Лучшее значение C: {best_C}")

Лучшее значение C: 10


# Ответы:
### 1. secondary
### 2. pdays, previous
### 3. poutcome
### 4. 0.90
### 5. balance
### 6. 10