In [1]:
import os, sys
import pandas as pd
import numpy as np
import random as rd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.go_offline()
from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,OneHotEncoder,StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, LearningRateScheduler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import RFE

from numpy import set_printoptions
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB



In [2]:
exps_dir = "../exps"
if os.path.exists(exps_dir) == False: # tạo thư mục (nếu chưa có)
  os.makedirs(exps_dir, exist_ok=True)

save_dir = f"{exps_dir}/feature1"
os.makedirs(save_dir, exist_ok=True)
test_size=0.33
seed=42

In [6]:
x_train=pd.read_excel(f'{save_dir}/x_train.xlsx')
y_train=pd.read_excel(f'{save_dir}/y_train.xlsx')
x_test=pd.read_excel(f'{save_dir}/x_test.xlsx')
y_test=pd.read_excel(f'{save_dir}/y_test.xlsx')


In [7]:
def modelcheck_info(model,name):
    print(name)
    try:
        coefficients = model.coef_.flatten()
    except:
        coefficients = model.feature_importances_.flatten()
    feature_importance = pd.DataFrame({
        'Feature': x_train.columns,
        'Coefficient': coefficients
    })

    print('bi loai bo',feature_importance[feature_importance['Coefficient'] == 0])
    feature_importance = feature_importance[feature_importance['Coefficient'] != 0]
    feature_importance['Absolute Coefficient'] = feature_importance['Coefficient'].abs()
    feature_importance.sort_values(by='Absolute Coefficient', ascending=False, inplace=True)

    print("Xếp hạng các đặc trưng theo mức độ quan trọng:")
    print(feature_importance[['Feature', 'Coefficient']])

In [8]:
models=[]
models.append(LogisticRegression())
models.append(LinearDiscriminantAnalysis())
models.append(DecisionTreeClassifier())
names=['LR','LDA','DTC']
feature_SelectFromModel={}
for model,name in zip(models,names):
    check = model.fit(x_train, y_train)
    selector = SelectFromModel(check, prefit=True)

    selected_features = x_train.columns[(selector.get_support())]
    modelcheck_info(model,name)
    print("Các đặc trưng được chọn:")
    feature_SelectFromModel[name]=list(selected_features)
    print(selected_features)

LR
bi loai bo           Feature  Coefficient
7  umbrella_limit          0.0
Xếp hạng các đặc trưng theo mức độ quan trọng:
                        Feature  Coefficient
18            incident_severity    -0.852062
21                incident_city    -0.222564
20               incident_state    -0.179496
25              property_damage     0.169052
1                           age     0.102278
11           insured_occupation    -0.091011
23     incident_hour_of_the_day     0.084155
27                    witnesses     0.082389
4                    policy_csl    -0.074485
10      insured_education_level    -0.074129
33                    auto_make     0.069970
31               property_claim     0.066588
32                vehicle_claim     0.065104
3                  policy_state     0.064125
29           total_claim_amount     0.058318
9                   insured_sex     0.058301
2                 policy_number    -0.043467
39            day_incident_date    -0.038312
35                    

In [9]:
def feature_selection_inRFE(model,name):
    
    rfe=RFE(estimator=model, n_features_to_select=10)
    fit=rfe.fit(x_train, y_train)
#     print("Num Features: %d"% fit.n_features_)
#     print("Selected Features: %s" % fit.support_)
#     print("Features Ranking: %s" %fit.ranking_)
#     for index, value in enumerate(fit.ranking_):
#         print(f"{value}: {x_train.columns[index]}")
    sorted_columns = [x for _, x in sorted(zip(fit.ranking_, x_train.columns))]
    sort=sorted(fit.ranking_)

#     for i,j in zip(sort,sorted_columns):
#         print("Columns sorted by RFE ranking",i,":",j)

    return [j for i,j in zip(sort,sorted_columns) if i==1]

In [10]:
models=[]
models.append(LogisticRegression())
models.append(LinearDiscriminantAnalysis())
models.append(DecisionTreeClassifier())
models.append(RandomForestClassifier())
feature_RFE={}

names=['LR','LDA','DTC','RD']
for model,name in zip(models,names):
    print("RFE")
    print(f"Kiểm tra mô hình {name}")
    model.fit(x_train,y_train)
    print(feature_selection_inRFE(model,name))
    feature_RFE[name]=feature_selection_inRFE(model,name)

RFE
Kiểm tra mô hình LR
['incident_hour_of_the_day', 'incident_severity', 'injury_claim', 'insured_sex', 'insured_zip', 'policy_number', 'property_claim', 'property_damage', 'total_claim_amount', 'vehicle_claim']
RFE
Kiểm tra mô hình LDA
['incident_severity', 'injury_claim', 'insured_sex', 'insured_zip', 'policy_annual_premium', 'policy_number', 'property_claim', 'property_damage', 'total_claim_amount', 'vehicle_claim']
RFE
Kiểm tra mô hình DTC
['auto_model', 'incident_location', 'incident_severity', 'injury_claim', 'insured_hobbies', 'insured_zip', 'month_policy_bind_date', 'policy_annual_premium', 'policy_number', 'total_claim_amount']
RFE
Kiểm tra mô hình RD
['day_incident_date', 'incident_location', 'incident_severity', 'injury_claim', 'insured_hobbies', 'insured_zip', 'policy_annual_premium', 'policy_number', 'property_claim', 'vehicle_claim']


In [12]:
special_models=[KNeighborsClassifier(),GaussianNB(),MLPClassifier(),GradientBoostingClassifier(),MultinomialNB(),SVC()]
names=['KNC','GNB','NN','BG','NB','SVM']
def feature_selection_inKBest(model, name):
    k_best = SelectKBest(score_func=f_classif, k=11)
    fit = k_best.fit(x_train, y_train)
    selected_features = fit.get_support()
    selected_column_names = x_train.columns[selected_features]
    return selected_column_names
feature_Kbest={}
for model,name in zip(special_models,names):
    print(f'Lua chon dac trung {name}')
    print(feature_selection_inKBest(model, name))
    feature_Kbest[name]=list(feature_selection_inKBest(model, name))

Lua chon dac trung KNC
Index(['age', 'policy_number', 'insured_sex', 'insured_occupation',
       'incident_severity', 'incident_city', 'property_damage',
       'total_claim_amount', 'property_claim', 'vehicle_claim',
       'day_incident_date'],
      dtype='object')
Lua chon dac trung GNB
Index(['age', 'policy_number', 'insured_sex', 'insured_occupation',
       'incident_severity', 'incident_city', 'property_damage',
       'total_claim_amount', 'property_claim', 'vehicle_claim',
       'day_incident_date'],
      dtype='object')
Lua chon dac trung NN
Index(['age', 'policy_number', 'insured_sex', 'insured_occupation',
       'incident_severity', 'incident_city', 'property_damage',
       'total_claim_amount', 'property_claim', 'vehicle_claim',
       'day_incident_date'],
      dtype='object')
Lua chon dac trung BG
Index(['age', 'policy_number', 'insured_sex', 'insured_occupation',
       'incident_severity', 'incident_city', 'property_damage',
       'total_claim_amount', 'propert

In [13]:
print(feature_Kbest)
print(feature_SelectFromModel)
print(feature_RFE)

{'KNC': ['age', 'policy_number', 'insured_sex', 'insured_occupation', 'incident_severity', 'incident_city', 'property_damage', 'total_claim_amount', 'property_claim', 'vehicle_claim', 'day_incident_date'], 'GNB': ['age', 'policy_number', 'insured_sex', 'insured_occupation', 'incident_severity', 'incident_city', 'property_damage', 'total_claim_amount', 'property_claim', 'vehicle_claim', 'day_incident_date'], 'NN': ['age', 'policy_number', 'insured_sex', 'insured_occupation', 'incident_severity', 'incident_city', 'property_damage', 'total_claim_amount', 'property_claim', 'vehicle_claim', 'day_incident_date'], 'BG': ['age', 'policy_number', 'insured_sex', 'insured_occupation', 'incident_severity', 'incident_city', 'property_damage', 'total_claim_amount', 'property_claim', 'vehicle_claim', 'day_incident_date'], 'NB': ['age', 'policy_number', 'insured_sex', 'insured_occupation', 'incident_severity', 'incident_city', 'property_damage', 'total_claim_amount', 'property_claim', 'vehicle_claim',

In [18]:
feature_model_selection={**feature_RFE,**feature_Kbest}
np.savez(f'{save_dir}/feature_model_selection.npz', feature_model_selection = feature_model_selection)
feature_model_selection

{'LR': ['incident_hour_of_the_day',
  'incident_severity',
  'injury_claim',
  'insured_sex',
  'insured_zip',
  'policy_number',
  'property_claim',
  'property_damage',
  'total_claim_amount',
  'vehicle_claim'],
 'LDA': ['incident_severity',
  'injury_claim',
  'insured_sex',
  'insured_zip',
  'policy_annual_premium',
  'policy_number',
  'property_claim',
  'property_damage',
  'total_claim_amount',
  'vehicle_claim'],
 'DTC': ['auto_model',
  'incident_location',
  'incident_severity',
  'injury_claim',
  'insured_hobbies',
  'insured_zip',
  'month_policy_bind_date',
  'policy_annual_premium',
  'policy_number',
  'total_claim_amount'],
 'RD': ['day_incident_date',
  'incident_location',
  'incident_severity',
  'injury_claim',
  'insured_hobbies',
  'insured_zip',
  'policy_annual_premium',
  'policy_number',
  'property_claim',
  'total_claim_amount'],
 'KNC': ['age',
  'policy_number',
  'insured_sex',
  'insured_occupation',
  'incident_severity',
  'incident_city',
  'prope

In [19]:
dict(np.load(f'{exps_dir}/feature1/feature_model_selection.npz',allow_pickle=True))['feature_model_selection']

array({'LR': ['incident_hour_of_the_day', 'incident_severity', 'injury_claim', 'insured_sex', 'insured_zip', 'policy_number', 'property_claim', 'property_damage', 'total_claim_amount', 'vehicle_claim'], 'LDA': ['incident_severity', 'injury_claim', 'insured_sex', 'insured_zip', 'policy_annual_premium', 'policy_number', 'property_claim', 'property_damage', 'total_claim_amount', 'vehicle_claim'], 'DTC': ['auto_model', 'incident_location', 'incident_severity', 'injury_claim', 'insured_hobbies', 'insured_zip', 'month_policy_bind_date', 'policy_annual_premium', 'policy_number', 'total_claim_amount'], 'RD': ['day_incident_date', 'incident_location', 'incident_severity', 'injury_claim', 'insured_hobbies', 'insured_zip', 'policy_annual_premium', 'policy_number', 'property_claim', 'total_claim_amount'], 'KNC': ['age', 'policy_number', 'insured_sex', 'insured_occupation', 'incident_severity', 'incident_city', 'property_damage', 'total_claim_amount', 'property_claim', 'vehicle_claim', 'day_inciden