In [194]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.feature_selection import mutual_info_classif, RFE, SelectKBest, chi2
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [195]:
# Read the train and test splits from the local project data folder.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run on a machine that has exactly this directory layout.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "C:\\Users\\fongr\\Documents\\Мои документы\\Второй курс\\python\\scientificProject1\\data\\train.csv",
        "C:\\Users\\fongr\\Documents\\Мои документы\\Второй курс\\python\\scientificProject1\\data\\test.csv",
    )
)

In [196]:
# Print the column / dtype / non-null summary for both frames, train first.
for frame in (train_data, test_data):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229292 entries, 0 to 229291
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    229292 non-null  int64  
 1   Gender                229292 non-null  object 
 2   Age                   229292 non-null  int64  
 3   Driving_License       229292 non-null  int64  
 4   Region_Code           229292 non-null  float64
 5   Previously_Insured    229292 non-null  int64  
 6   Vehicle_Age           229292 non-null  object 
 7   Vehicle_Damage        229292 non-null  object 
 8   Annual_Premium        229292 non-null  float64
 9   Policy_Sales_Channel  229292 non-null  float64
 10  Vintage               229292 non-null  int64  
 11  Response              229292 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 21.0+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152862 entries, 0 to 152861
Data columns (total 

Из вывода `info()` видно, что ни в одном столбце нет пропущенных значений. Разделим признаки на числовые и категориальные.

In [197]:
# Column groups used by every later cell.
# df_cat: string-valued columns that get label-encoded.
# df_num: columns that are already numeric and get min-max scaled.
# NOTE(review): Region_Code and Policy_Sales_Channel look like identifier codes
# rather than quantities; they are treated as numeric here — confirm this is intended.
df_cat = ["Gender", "Vehicle_Age", "Vehicle_Damage"]
df_num = ["Age", "Driving_License", "Previously_Insured", "Region_Code", "Annual_Premium", "Policy_Sales_Channel", "Vintage"]
df_all = df_num + df_cat
result_col = "Response"

# Drop the row identifier from the training frame. Re-binding the name (instead
# of inplace=True) together with errors="ignore" keeps the cell idempotent:
# re-running it no longer raises KeyError because "id" is already gone.
train_data = train_data.drop(columns="id", errors="ignore")

In [198]:
# Show the distinct raw labels of every categorical column, one column per print.
for col in df_cat:
    distinct_labels = train_data[col].unique()
    print(distinct_labels)

['Male' 'Female']
['1-2 Year' '< 1 Year' '> 2 Years']
['No' 'Yes']


In [199]:
# Print summary statistics (count, mean, std, quartiles, min/max) for each
# numeric column of the training frame.
for col in df_num:
    column_summary = train_data[col].describe()
    print(column_summary)

count    229292.000000
mean         38.554987
std          15.231658
min          20.000000
25%          25.000000
50%          36.000000
75%          49.000000
max          85.000000
Name: Age, dtype: float64
count    229292.000000
mean          0.998138
std           0.043114
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: Driving_License, dtype: float64
count    229292.000000
mean          0.489062
std           0.499881
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: Previously_Insured, dtype: float64
count    229292.000000
mean         26.420207
std          13.191474
min           0.000000
25%          15.000000
50%          28.000000
75%          35.000000
max          52.000000
Name: Region_Code, dtype: float64
count    229292.000000
mean      30730.076549
std       17054.686028
min        2630.000000
25%       24564.000000
50%       

Произведём кодирование категориальных признаков

In [200]:
# Replace the string labels of every categorical column with integer codes.
# The encoder is fitted on the TRAINING column only and the same fitted mapping
# is then applied to the test column. Re-fitting on the test frame (as before)
# builds an independent mapping, so the same label could receive different
# codes in the two frames whenever their label sets differ.
le = LabelEncoder()
for column in df_cat:
    train_data[column] = le.fit_transform(train_data[column])
    # transform() raises ValueError on a label that was not seen during fit,
    # which surfaces a train/test mismatch instead of silently re-numbering.
    test_data[column] = le.transform(test_data[column])
    train_data[column] = train_data[column].astype("category")
    test_data[column] = test_data[column].astype("category")
    print(train_data[column].unique())

[1, 0]
Categories (2, int64): [0, 1]
[0, 1, 2]
Categories (3, int64): [0, 1, 2]
[0, 1]
Categories (2, int64): [0, 1]


Нормируем все числовые признаки в диапазон [0, 1] с помощью MinMaxScaler.

In [201]:
# Min-max scale the numeric columns. The scaler learns each column's min/max
# from the training frame and the SAME parameters are applied to the test frame,
# so a given raw value maps to the same scaled value in both frames.
# Re-fitting on the test frame (as before) would scale it with its own min/max.
scaler = MinMaxScaler()
train_data[df_num] = scaler.fit_transform(train_data[df_num])
# Test values outside the training range will fall outside [0, 1]; that is
# expected when reusing train-fitted parameters.
test_data[df_num] = scaler.transform(test_data[df_num])
train_data.describe()

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
count,229292.0,229292.0,229292.0,229292.0,229292.0,229292.0,229292.0,229292.0
mean,0.285461,0.998138,0.508081,0.489062,0.052276,0.684714,0.498943,0.164079
std,0.234333,0.043114,0.253682,0.499881,0.031728,0.335236,0.289713,0.370348
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.076923,1.0,0.288462,0.0,0.040805,0.154321,0.249135,0.0
50%,0.246154,1.0,0.538462,0.0,0.054097,0.858025,0.49827,0.0
75%,0.446154,1.0,0.673077,1.0,0.068535,0.932099,0.750865,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Отбор информативных признаков

In [202]:
# Feature matrix and target taken from the labelled (training) frame.
x, y = train_data[df_all], train_data[result_col]

# 70/30 hold-out split with a fixed seed so the scores below are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=7
)

# Accumulators filled by the scoring cells: one support mask and one ROC-AUC
# value per evaluated feature set. Tuples are extended by re-binding.
masks, scoring = (), ()

# Number of features every selection method is asked to keep.
n_features_select = 5

### Метод фильтрации

In [203]:
def _mutual_info_score(features, target):
    """Score features by mutual information with the target, with a fixed seed.

    mutual_info_classif is randomised, so random_state is pinned to keep the
    selected feature set reproducible between runs.
    """
    return mutual_info_classif(features, target, random_state=7)


# Filter method: keep the n_features_select features with the highest mutual
# information. SelectKBest defaults to f_classif when no score function is
# given, which did not match the "mutual_info" name of the result, so the
# score function is now passed explicitly.
skb = SelectKBest(score_func=_mutual_info_score, k=n_features_select)
# Fit on the training split only: x_test / y_test are used later to score this
# selector, so they must not influence which features are chosen.
top = skb.fit(x_train, y_train)
selected_features_mutual_info = top.get_feature_names_out()
print(selected_features_mutual_info)

['Age' 'Previously_Insured' 'Policy_Sales_Channel' 'Vehicle_Age'
 'Vehicle_Damage']


### Метод обёртки

In [204]:
# Wrapper method: recursive feature elimination, removing one feature per step
# until n_features_select remain. The target is a binary class label, so the
# ranking estimator is a classifier (logistic regression) rather than a linear
# regressor; RFE ranks features by the estimator's coef_.
# max_iter is raised above the default to give the solver room to converge.
clf = LogisticRegression(max_iter=1000)
rfe = RFE(clf, n_features_to_select=n_features_select, step=1)
# Fit on the training split only: the hold-out split is used later to score
# this selector and must not take part in choosing the features.
top_five = rfe.fit(x_train, y_train)
selected_features_RFE = top_five.get_feature_names_out()
print(selected_features_RFE)

['Driving_License' 'Previously_Insured' 'Annual_Premium'
 'Policy_Sales_Channel' 'Vehicle_Damage']


### Встроенные методы

In [205]:
# Keep the n_features_select features with the highest chi-squared statistic
# against the target. chi2 requires non-negative inputs, which holds here
# because the numeric columns are min-max scaled and the categorical columns
# are non-negative integer codes.
selector = SelectKBest(chi2, k=n_features_select)
# Only fitting is needed here (the transformed matrix was discarded before).
# Fit on the training split only so the hold-out rows used for scoring do not
# influence the choice of features.
selector.fit(x_train, y_train)
selected_features_chi2 = selector.get_feature_names_out()
print(selected_features_chi2)

['Age' 'Previously_Insured' 'Policy_Sales_Channel' 'Vehicle_Age'
 'Vehicle_Damage']


### Сравнение результатов

In [206]:
# Fitted selectors keyed by the method that produced them. The labels were
# previously swapped: `skb` is the selector whose result is stored as
# selected_features_mutual_info, and `selector` is the chi2 one.
# Insertion order (skb, rfe, selector) is kept, so the scores printed by the
# comparison loop stay in the same order.
selected_dict = {
    "mutual_inf": skb,
    "rfe": rfe,
    "chi2": selector,
}

Подсчёт ROC-AUC для исходного набора признаков (без отбора)

In [207]:
# Baseline: Gaussian naive Bayes trained on all features, scored by ROC-AUC on
# the hold-out split.
model = GaussianNB()
model.fit(x_train, y_train)

# Every feature is in use, so the support mask is all True.
all_features_mask = [True] * x_train.shape[1]
masks = masks + (all_features_mask,)

# Column 1 of predict_proba is the probability of the positive class.
positive_proba = model.predict_proba(x_test)[:, 1]
scoring = scoring + (roc_auc_score(y_test, positive_proba),)
print("Score (ROC-AUC): ", scoring[-1], sep="")

Score (ROC-AUC): 0.8644920774756426


In [208]:
# Score every fitted selector: reduce both splits to the selected features,
# train Gaussian naive Bayes on the reduced training split and record ROC-AUC
# on the reduced hold-out split.
# The loop variable is named feature_selector so it does not rebind the global
# `selector` created in the chi2 cell.
for selector_name, feature_selector in selected_dict.items():
    x_train_selected = feature_selector.transform(x_train)
    x_test_selected = feature_selector.transform(x_test)
    model = GaussianNB().fit(x_train_selected, y_train)
    masks += (feature_selector.get_support(),)
    # Append (+=) instead of overwriting, so `scoring` stays aligned with
    # `masks` and still contains the baseline score and all earlier selectors.
    scoring += (roc_auc_score(y_test, model.predict_proba(x_test_selected)[:, 1]),)
    print("Score (ROC-AUC): " + str(scoring[-1]))

Score (ROC-AUC): 0.8636157626498
Score (ROC-AUC): 0.8459675143247214
Score (ROC-AUC): 0.8636157626498


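Можно увидеть, что среди методов отбора наилучший результат дали 1-й и 3-й: они выбрали один и тот же набор признаков и поэтому получили одинаковый ROC-AUC (≈0.8636). При этом модель на всех признаках даёт немного более высокое значение (≈0.8645).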
### Удаление незначимых признаков

In [209]:
# Keep only the chosen features (plus the target in the training frame) and
# rebuild the feature matrix / target from the reduced frame.
selected_features = selected_features_mutual_info
kept_train_columns = [*selected_features, result_col]
train_data = train_data.loc[:, kept_train_columns]
test_data = test_data.loc[:, selected_features]
df_all = selected_features

x, y = train_data[df_all], train_data[result_col]
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229292 entries, 0 to 229291
Data columns (total 6 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   Age                   229292 non-null  float64 
 1   Previously_Insured    229292 non-null  float64 
 2   Policy_Sales_Channel  229292 non-null  float64 
 3   Vehicle_Age           229292 non-null  category
 4   Vehicle_Damage        229292 non-null  category
 5   Response              229292 non-null  int64   
dtypes: category(2), float64(3), int64(1)
memory usage: 7.4 MB
