In [1]:
import os, sys
import pandas as pd
import numpy as np
import random as rd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.go_offline()
from pandas_profiling import ProfileReport

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,OneHotEncoder,StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, LearningRateScheduler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import RFE

from numpy import set_printoptions
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB



In [2]:
exps_dir = "../../exps"
if os.path.exists(exps_dir) == False: # tạo thư mục (nếu chưa có)
  os.makedirs(exps_dir, exist_ok=True)

save_dir = f"{exps_dir}/feature1"
os.makedirs(save_dir, exist_ok=True)
test_size=0.33
seed=42

In [3]:
x_train=pd.read_excel(f'{save_dir}/x_train.xlsx')
y_train=pd.read_excel(f'{save_dir}/y_train.xlsx')
x_test=pd.read_excel(f'{save_dir}/x_test.xlsx')
y_test=pd.read_excel(f'{save_dir}/y_test.xlsx')


In [4]:
def modelcheck_info(model,name):
    print(name)
    try:
        coefficients = model.coef_.flatten()
    except:
        coefficients = model.feature_importances_.flatten()
    feature_importance = pd.DataFrame({
        'Feature': x_train.columns,
        'Coefficient': coefficients
    })

    print('bi loai bo',feature_importance[feature_importance['Coefficient'] == 0])
    feature_importance = feature_importance[feature_importance['Coefficient'] != 0]
    feature_importance['Absolute Coefficient'] = feature_importance['Coefficient'].abs()
    feature_importance.sort_values(by='Absolute Coefficient', ascending=False, inplace=True)

    print("Xếp hạng các đặc trưng theo mức độ quan trọng:")
    print(feature_importance[['Feature', 'Coefficient']])

In [5]:
models=[]
models.append(LogisticRegression())
models.append(LinearDiscriminantAnalysis())
models.append(DecisionTreeClassifier())
names=['LR','LDA','DTC']
feature_SelectFromModel={}
for model,name in zip(models,names):
    check = model.fit(x_train, y_train)
    selector = SelectFromModel(check, prefit=True)

    selected_features = x_train.columns[(selector.get_support())]
    modelcheck_info(model,name)
    print("Các đặc trưng được chọn:")
    feature_SelectFromModel[name]=list(selected_features)
    print(selected_features)

LR
bi loai bo                Feature  Coefficient
38  year_incident_date          0.0
Xếp hạng các đặc trưng theo mức độ quan trọng:
                        Feature  Coefficient
16            incident_severity    -1.482669
7                   insured_sex     0.421324
27                 injury_claim    -0.283289
22              property_damage     0.261911
6                umbrella_limit     0.207163
36            day_incident_date    -0.155976
29                vehicle_claim     0.152268
19                incident_city    -0.124086
28               property_claim     0.111541
15               collision_type    -0.110525
37          month_incident_date     0.109878
23              bodily_injuries     0.109132
21  number_of_vehicles_involved    -0.104776
0            months_as_customer    -0.104593
25      police_report_available    -0.101391
2                  policy_state     0.086036
14                incident_type     0.085589
17        authorities_contacted     0.080308
26          

In [6]:
def feature_selection_inRFE(model,name):
    
    rfe=RFE(estimator=model, n_features_to_select=10)
    fit=rfe.fit(x_train, y_train)
#     print("Num Features: %d"% fit.n_features_)
#     print("Selected Features: %s" % fit.support_)
#     print("Features Ranking: %s" %fit.ranking_)
#     for index, value in enumerate(fit.ranking_):
#         print(f"{value}: {x_train.columns[index]}")
    sorted_columns = [x for _, x in sorted(zip(fit.ranking_, x_train.columns))]
    sort=sorted(fit.ranking_)

#     for i,j in zip(sort,sorted_columns):
#         print("Columns sorted by RFE ranking",i,":",j)

    return [j for i,j in zip(sort,sorted_columns) if i==1]

In [7]:
models=[]
models.append(LogisticRegression())
models.append(LinearDiscriminantAnalysis())
models.append(DecisionTreeClassifier())
models.append(RandomForestClassifier())
feature_RFE={}

names=['LR','LDA','DTC','RD']
for model,name in zip(models,names):
    print("RFE")
    print(f"Kiểm tra mô hình {name}")
    model.fit(x_train,y_train)
    print(feature_selection_inRFE(model,name))
    feature_RFE[name]=feature_selection_inRFE(model,name)

RFE
Kiểm tra mô hình LR
['day_incident_date', 'incident_severity', 'injury_claim', 'insured_sex', 'month_incident_date', 'number_of_vehicles_involved', 'property_claim', 'property_damage', 'umbrella_limit', 'vehicle_claim']
RFE
Kiểm tra mô hình LDA
['collision_type', 'day_incident_date', 'incident_severity', 'incident_type', 'injury_claim', 'insured_sex', 'month_incident_date', 'property_claim', 'property_damage', 'umbrella_limit']
RFE
Kiểm tra mô hình DTC
['age', 'auto_year', 'capital-loss', 'incident_severity', 'injury_claim', 'insured_hobbies', 'policy_annual_premium', 'property_claim', 'vehicle_claim', 'year_policy_bind_date']
RFE
Kiểm tra mô hình RD
['auto_model', 'day_policy_bind_date', 'incident_severity', 'injury_claim', 'insured_hobbies', 'months_as_customer', 'policy_annual_premium', 'property_claim', 'total_claim_amount', 'vehicle_claim']


In [8]:
special_models=[KNeighborsClassifier(),GaussianNB(),MLPClassifier(),GradientBoostingClassifier(),MultinomialNB(),SVC()]
names=['KNC','GNB','NN','BG','NB','SVM']
def feature_selection_inKBest(model, name):
    k_best = SelectKBest(score_func=f_classif, k=11)
    fit = k_best.fit(x_train, y_train)
    selected_features = fit.get_support()
    selected_column_names = x_train.columns[selected_features]
    return selected_column_names
feature_Kbest={}
for model,name in zip(special_models,names):
    print(f'Lua chon dac trung {name}')
    print(feature_selection_inKBest(model, name))
    feature_Kbest[name]=list(feature_selection_inKBest(model, name))

Lua chon dac trung KNC
Index(['policy_state', 'umbrella_limit', 'insured_sex', 'incident_severity',
       'incident_city', 'property_damage', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'day_incident_date'],
      dtype='object')
Lua chon dac trung GNB
Index(['policy_state', 'umbrella_limit', 'insured_sex', 'incident_severity',
       'incident_city', 'property_damage', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'day_incident_date'],
      dtype='object')
Lua chon dac trung NN
Index(['policy_state', 'umbrella_limit', 'insured_sex', 'incident_severity',
       'incident_city', 'property_damage', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'day_incident_date'],
      dtype='object')
Lua chon dac trung BG
Index(['policy_state', 'umbrella_limit', 'insured_sex', 'incident_severity',
       'incident_city', 'property_damage', 'total_claim_amount',
       'injury_claim', 'property_cla

In [9]:
print(feature_Kbest)
print(feature_SelectFromModel)
print(feature_RFE)

{'KNC': ['policy_state', 'umbrella_limit', 'insured_sex', 'incident_severity', 'incident_city', 'property_damage', 'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim', 'day_incident_date'], 'GNB': ['policy_state', 'umbrella_limit', 'insured_sex', 'incident_severity', 'incident_city', 'property_damage', 'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim', 'day_incident_date'], 'NN': ['policy_state', 'umbrella_limit', 'insured_sex', 'incident_severity', 'incident_city', 'property_damage', 'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim', 'day_incident_date'], 'BG': ['policy_state', 'umbrella_limit', 'insured_sex', 'incident_severity', 'incident_city', 'property_damage', 'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim', 'day_incident_date'], 'NB': ['policy_state', 'umbrella_limit', 'insured_sex', 'incident_severity', 'incident_city', 'property_damage', 'total_claim_amount', 'injury_claim', 'property_clai

In [10]:
feature_model_selection={**feature_RFE,**feature_Kbest}
np.savez(f'{save_dir}/feature_model_selection.npz', feature_model_selection = feature_model_selection)
feature_model_selection

{'LR': ['day_incident_date',
  'incident_severity',
  'injury_claim',
  'insured_sex',
  'month_incident_date',
  'number_of_vehicles_involved',
  'property_claim',
  'property_damage',
  'umbrella_limit',
  'vehicle_claim'],
 'LDA': ['collision_type',
  'day_incident_date',
  'incident_severity',
  'incident_type',
  'injury_claim',
  'insured_sex',
  'month_incident_date',
  'property_claim',
  'property_damage',
  'umbrella_limit'],
 'DTC': ['age',
  'auto_model',
  'capital-loss',
  'incident_severity',
  'injury_claim',
  'insured_hobbies',
  'policy_annual_premium',
  'property_claim',
  'vehicle_claim',
  'year_policy_bind_date'],
 'RD': ['auto_year',
  'incident_severity',
  'injury_claim',
  'insured_hobbies',
  'insured_occupation',
  'months_as_customer',
  'policy_annual_premium',
  'property_claim',
  'total_claim_amount',
  'vehicle_claim'],
 'KNC': ['policy_state',
  'umbrella_limit',
  'insured_sex',
  'incident_severity',
  'incident_city',
  'property_damage',
  'tota

In [11]:
dict(np.load(f'{exps_dir}/feature1/feature_model_selection.npz',allow_pickle=True))['feature_model_selection']

array({'LR': ['day_incident_date', 'incident_severity', 'injury_claim', 'insured_sex', 'month_incident_date', 'number_of_vehicles_involved', 'property_claim', 'property_damage', 'umbrella_limit', 'vehicle_claim'], 'LDA': ['collision_type', 'day_incident_date', 'incident_severity', 'incident_type', 'injury_claim', 'insured_sex', 'month_incident_date', 'property_claim', 'property_damage', 'umbrella_limit'], 'DTC': ['age', 'auto_model', 'capital-loss', 'incident_severity', 'injury_claim', 'insured_hobbies', 'policy_annual_premium', 'property_claim', 'vehicle_claim', 'year_policy_bind_date'], 'RD': ['auto_year', 'incident_severity', 'injury_claim', 'insured_hobbies', 'insured_occupation', 'months_as_customer', 'policy_annual_premium', 'property_claim', 'total_claim_amount', 'vehicle_claim'], 'KNC': ['policy_state', 'umbrella_limit', 'insured_sex', 'incident_severity', 'incident_city', 'property_damage', 'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim', 'day_incident_