In [1]:
import os, sys
import pandas as pd
import numpy as np
import random as rd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.go_offline()
from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,OneHotEncoder,StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, LearningRateScheduler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import RFE

from numpy import set_printoptions
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB



# Lựa chọn đặc trưng

### (1) Tạo thư mục lưu trữ (nếu chưa có)

In [2]:
exps_dir = "../../exps"
if os.path.exists(exps_dir) == False: # tạo thư mục (nếu chưa có)
  os.makedirs(exps_dir, exist_ok=True)

save_dir = f"{exps_dir}/feature1"
os.makedirs(save_dir, exist_ok=True)
test_size=0.33
seed=42

In [3]:
x_train=pd.read_excel(f'{save_dir}/x_train.xlsx')
y_train=pd.read_excel(f'{save_dir}/y_train.xlsx')
x_test=pd.read_excel(f'{save_dir}/x_test.xlsx')
y_test=pd.read_excel(f'{save_dir}/y_test.xlsx')


### (2) Lựa chọn các đặc trưng

In [4]:
def modelcheck_info(model,name):
    print(name)
    try:
        coefficients = model.coef_.flatten()
    except:
        coefficients = model.feature_importances_.flatten()
    feature_importance = pd.DataFrame({
        'Feature': x_train.columns,
        'Coefficient': coefficients
    })

    print('bi loai bo',feature_importance[feature_importance['Coefficient'] == 0])
    feature_importance = feature_importance[feature_importance['Coefficient'] != 0]
    feature_importance['Absolute Coefficient'] = feature_importance['Coefficient'].abs()
    feature_importance.sort_values(by='Absolute Coefficient', ascending=False, inplace=True)

    print("Xếp hạng các đặc trưng theo mức độ quan trọng:")
    print(feature_importance[['Feature', 'Coefficient']])

In [5]:
models=[]
models.append(LogisticRegression())
models.append(LinearDiscriminantAnalysis())
models.append(DecisionTreeClassifier())
names=['LR','LDA','DTC']
feature_SelectFromModel={}
for model,name in zip(models,names):
    check = model.fit(x_train, y_train)
    selector = SelectFromModel(check, prefit=True)

    selected_features = x_train.columns[(selector.get_support())]
    modelcheck_info(model,name)
    print("Các đặc trưng được chọn:")
    feature_SelectFromModel[name]=list(selected_features)
    print(selected_features)

LR
bi loai bo                Feature  Coefficient
38  year_incident_date          0.0
Xếp hạng các đặc trưng theo mức độ quan trọng:
                        Feature  Coefficient
16            incident_severity    -1.903330
21  number_of_vehicles_involved    -1.080346
14                incident_type    -0.759999
1                           age    -0.479193
41                    high_hour    -0.467645
32                    auto_year    -0.373999
3                    policy_csl    -0.370457
27                 injury_claim    -0.345468
15               collision_type    -0.330579
0            months_as_customer    -0.256330
25      police_report_available    -0.253940
22              property_damage    -0.216285
28               property_claim     0.215748
36            day_incident_date    -0.203219
19                incident_city    -0.195195
6                umbrella_limit     0.183944
37          month_incident_date     0.181477
20     incident_hour_of_the_day    -0.166949
4           

### Giải thích:

Phương pháp **SelectFromModel** từ thư viện sklearn.feature_selection. Phương pháp này hoạt động bằng cách sử dụng một mô hình học máy để đánh giá độ quan trọng của từng đặc trưng và sau đó chọn những đặc trưng quan trọng nhất.

   Các model được dùng trong phương pháp này:
   * **'LR'**: LogisticRegression
   * **'LDA'**: LinearDiscriminantAnalysis
   * **'DTC'**: DecisionTreeClassifier
  
       #### 1. LogisticRegression:
       Các đặc trưng được chọn:
* Index(['age', 'policy_csl', 'insured_education_level', 'insured_occupation',
       'incident_severity', 'incident_state', 'incident_city',
       'incident_hour_of_the_day', 'property_damage', 'witnesses',
       'property_claim', 'vehicle_claim', 'auto_make'],
      dtype='object')
      
   #### 2. LinearDiscriminantAnalysis:
   Các đặc trưng được chọn:
* Index(['policy_number', 'policy_annual_premium', 'insured_zip', 'insured_sex',
       'incident_severity', 'incident_hour_of_the_day', 'property_damage',
       'total_claim_amount', 'injury_claim', 'property_claim',
       'vehicle_claim'],
      dtype='object')
      
   #### 3. DecisionTreeClassifier:
   Các đặc trưng được chọn:
* Index(['policy_number', 'policy_annual_premium', 'insured_zip',
       'insured_occupation', 'insured_hobbies', 'incident_severity',
       'incident_location', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'month_policy_bind_date', 'day_incident_date'],
      dtype='object')

In [6]:
def feature_selection_inRFE(model,name):
    rfe=RFE(estimator=model, n_features_to_select=10)
    fit=rfe.fit(x_train, y_train)
#     print("Num Features: %d"% fit.n_features_)
#     print("Selected Features: %s" % fit.support_)
#     print("Features Ranking: %s" %fit.ranking_)
#     for index, value in enumerate(fit.ranking_):
#         print(f"{value}: {x_train.columns[index]}")
    sorted_columns = [x for _, x in sorted(zip(fit.ranking_, x_train.columns))]
    sort=sorted(fit.ranking_)

#     for i,j in zip(sort,sorted_columns):
#         print("Columns sorted by RFE ranking",i,":",j)

    return [j for i,j in zip(sort,sorted_columns) if i==1]

In [7]:
models=[]
models.append(LogisticRegression())
models.append(LinearDiscriminantAnalysis())
models.append(DecisionTreeClassifier())
models.append(RandomForestClassifier())
feature_RFE={}

names=['LR','LDA','DTC','RD']
for model,name in zip(models,names):
    print("RFE")
    print(f"Kiểm tra mô hình {name}")
    model.fit(x_train,y_train)
    print(feature_selection_inRFE(model,name))
    feature_RFE[name]=feature_selection_inRFE(model,name)

RFE
Kiểm tra mô hình LR
['age', 'auto_year', 'collision_type', 'high_hour', 'incident_severity', 'incident_type', 'injury_claim', 'number_of_vehicles_involved', 'policy_csl', 'vehicle_claim']
RFE
Kiểm tra mô hình LDA
['age', 'auto_year', 'collision_type', 'high_hour', 'incident_severity', 'incident_type', 'months_as_customer', 'number_of_vehicles_involved', 'police_report_available', 'policy_csl']
RFE
Kiểm tra mô hình DTC
['auto_make', 'capital-loss', 'day_policy_bind_date', 'incident_severity', 'insured_hobbies', 'insured_occupation', 'month_policy_bind_date', 'vehicle_claim', 'week_incident_date', 'year_policy_bind_date']
RFE
Kiểm tra mô hình RD
['capital-loss', 'day_incident_date', 'incident_severity', 'injury_claim', 'insured_hobbies', 'month_policy_bind_date', 'policy_annual_premium', 'property_claim', 'total_claim_amount', 'vehicle_claim']


### Giải thích:
**Recursive Feature Elimination (RFE)** là một phương pháp trong lựa chọn đặc trưng (feature selection) được sử dụng trong học máy để giảm số lượng đặc trưng của mô hình mà vẫn giữ được hiệu suất tốt.

Các model được dùng trong phương pháp này:
   * **'LR'**: LogisticRegression
   * **'LDA'**: LinearDiscriminantAnalysis
   * **'DTC'**: DecisionTreeClassifier
   * **'RD'**: RandomForestClassifier

In [8]:
special_models=[KNeighborsClassifier(),GaussianNB(),MLPClassifier(),GradientBoostingClassifier(),MultinomialNB(),SVC()]
names=['KNC','GNB','NN','BG','NB','SVM']
def feature_selection_inKBest(model, name):
    k_best = SelectKBest(score_func=f_classif, k=11)
    fit = k_best.fit(x_train, y_train)
    selected_features = fit.get_support()
    selected_column_names = x_train.columns[selected_features]
    return selected_column_names
feature_Kbest={}
for model,name in zip(special_models,names):
    print(f'Lua chon dac trung {name}')
    print(feature_selection_inKBest(model, name))
    feature_Kbest[name]=list(feature_selection_inKBest(model, name))

Lua chon dac trung KNC
Index(['policy_csl', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_city', 'property_damage',
       'police_report_available', 'total_claim_amount', 'property_claim',
       'vehicle_claim', 'auto_year'],
      dtype='object')
Lua chon dac trung GNB
Index(['policy_csl', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_city', 'property_damage',
       'police_report_available', 'total_claim_amount', 'property_claim',
       'vehicle_claim', 'auto_year'],
      dtype='object')
Lua chon dac trung NN
Index(['policy_csl', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_city', 'property_damage',
       'police_report_available', 'total_claim_amount', 'property_claim',
       'vehicle_claim', 'auto_year'],
      dtype='object')
Lua chon dac trung BG
Index(['policy_csl', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_city', 'property_damag

### Giải thích:
Phương pháp **SelectKBest** từ thư viện sklearn.feature_selection để lựa chọn đặc trưng dựa trên kỹ thuật kiểm định thống kê.
 
Các model được dùng trong phương pháp này:
   * **'KNC'**: KNeighborsClassifier
   * **'GNB'**: GaussianNB
   * **'NN'**: MLPClassifier
   * **'BG'**: GradientBoostingClassifier
   * **'NB'**: MultinomialNB
   * **'SVM'**: SVC

In [9]:
print(feature_Kbest)
print(feature_SelectFromModel)
print(feature_RFE)

{'KNC': ['policy_csl', 'collision_type', 'incident_severity', 'authorities_contacted', 'incident_city', 'property_damage', 'police_report_available', 'total_claim_amount', 'property_claim', 'vehicle_claim', 'auto_year'], 'GNB': ['policy_csl', 'collision_type', 'incident_severity', 'authorities_contacted', 'incident_city', 'property_damage', 'police_report_available', 'total_claim_amount', 'property_claim', 'vehicle_claim', 'auto_year'], 'NN': ['policy_csl', 'collision_type', 'incident_severity', 'authorities_contacted', 'incident_city', 'property_damage', 'police_report_available', 'total_claim_amount', 'property_claim', 'vehicle_claim', 'auto_year'], 'BG': ['policy_csl', 'collision_type', 'incident_severity', 'authorities_contacted', 'incident_city', 'property_damage', 'police_report_available', 'total_claim_amount', 'property_claim', 'vehicle_claim', 'auto_year'], 'NB': ['policy_csl', 'collision_type', 'incident_severity', 'authorities_contacted', 'incident_city', 'property_damage', 

In [10]:
feature_model_selection={**feature_RFE,**feature_Kbest}
np.savez(f'{save_dir}/feature_model_selection.npz', feature_model_selection = feature_model_selection)
feature_model_selection

{'LR': ['age',
  'auto_year',
  'collision_type',
  'high_hour',
  'incident_severity',
  'incident_type',
  'injury_claim',
  'number_of_vehicles_involved',
  'policy_csl',
  'vehicle_claim'],
 'LDA': ['age',
  'auto_year',
  'collision_type',
  'high_hour',
  'incident_severity',
  'incident_type',
  'months_as_customer',
  'number_of_vehicles_involved',
  'police_report_available',
  'policy_csl'],
 'DTC': ['auto_make',
  'auto_model',
  'capital-loss',
  'day_policy_bind_date',
  'incident_severity',
  'insured_hobbies',
  'month_policy_bind_date',
  'policy_annual_premium',
  'vehicle_claim',
  'year_policy_bind_date'],
 'RD': ['capital-loss',
  'day_incident_date',
  'incident_severity',
  'injury_claim',
  'insured_hobbies',
  'month_policy_bind_date',
  'property_claim',
  'total_claim_amount',
  'vehicle_claim',
  'year_policy_bind_date'],
 'KNC': ['policy_csl',
  'collision_type',
  'incident_severity',
  'authorities_contacted',
  'incident_city',
  'property_damage',
  'pol

* gom nhóm thông tin về các đặc trưng được chọn từ hai phương pháp lựa chọn đặc trưng khác nhau (RFE và SelectKBest) vào từ điển **feature_model_selection**.

In [11]:
dict(np.load(f'{exps_dir}/feature1/feature_model_selection.npz',allow_pickle=True))['feature_model_selection']

array({'LR': ['age', 'auto_year', 'collision_type', 'high_hour', 'incident_severity', 'incident_type', 'injury_claim', 'number_of_vehicles_involved', 'policy_csl', 'vehicle_claim'], 'LDA': ['age', 'auto_year', 'collision_type', 'high_hour', 'incident_severity', 'incident_type', 'months_as_customer', 'number_of_vehicles_involved', 'police_report_available', 'policy_csl'], 'DTC': ['auto_make', 'auto_model', 'capital-loss', 'day_policy_bind_date', 'incident_severity', 'insured_hobbies', 'month_policy_bind_date', 'policy_annual_premium', 'vehicle_claim', 'year_policy_bind_date'], 'RD': ['capital-loss', 'day_incident_date', 'incident_severity', 'injury_claim', 'insured_hobbies', 'month_policy_bind_date', 'property_claim', 'total_claim_amount', 'vehicle_claim', 'year_policy_bind_date'], 'KNC': ['policy_csl', 'collision_type', 'incident_severity', 'authorities_contacted', 'incident_city', 'property_damage', 'police_report_available', 'total_claim_amount', 'property_claim', 'vehicle_claim', 'a