In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random 
import os
import pypdf
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score,make_scorer,precision_score, recall_score,accuracy_score
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam
from kerastuner.tuners import RandomSearch
import tensorflow as tf
import keras_tuner as kt
from tensorflow.keras import backend as K
# Single seed shared by every stochastic step in the notebook.
seed = 42
# Seed both NumPy's and the standard library's global generators.
for seed_rng in (np.random.seed, random.seed):
    seed_rng(seed)
# Silence library warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from kerastuner.tuners import RandomSearch


## 1. Conventional Classification Models

In [2]:
# --- Load the raw complaint decisions and keep only clean, usable rows ---

# Columns dropped before modelling.
unused_columns = ['Unnamed: 0', 'location', 'title', 'company', 'extras', 'tag', 'decision_id']

# Accepted values for the categorical columns; any other value discards the row.
insurance_types = [
    'Animal',
    'Legal Expenses and Monetary Loss',
    'Health',
    'Household and Commercial Property',
    'Travel',
    'Motor',
    'Personal Belongings',
    'Payment Protection',
    'Others',
]
genders = ['Male', 'Female', 'Male and Female']
yes_no_columns = [
    'premium/payment',
    'service attitude/communication',
    "fail to meet customer's expectations",
    'claims processing delays',
    'policy terms',
]

df = pd.read_csv('data_all.csv').drop(columns=unused_columns)

# One combined mask is equivalent to filtering column by column, and avoids
# building a chain of intermediate slices.
valid_rows = (
    df['type of insurance'].isin(insurance_types)
    & df[yes_no_columns].isin(['Yes', 'No']).all(axis=1)
    & df['gender'].isin(genders)
)
df = df[valid_rows]

# Discard rows where any cell mentions "not specified" (case-insensitive).
# The check runs column by column with vectorised string ops instead of
# calling a Python lambda once per row.
mentions_not_specified = df.astype(str).apply(
    lambda column: column.str.contains('not specified', case=False)
).any(axis=1)
df = df[~mentions_not_specified]

# Encode the flags and the decision as 0/1. `replace` returns a new frame, so
# no filtered slice is mutated in place. `infer_objects()` makes the
# object -> integer conversion explicit instead of relying on the implicit
# downcast inside `replace`, which pandas 2.2 deprecates.
df = df.replace({'Yes': 1, 'No': 0, 'Upheld': 1, 'Not upheld': 0}).infer_objects()

# Reduce the decision date to monthly resolution.
df['date'] = pd.to_datetime(df['date']).dt.to_period('M')

In [3]:
# The conventional models work directly on the cleaned frame.
conventional_model_df = df

# Target is the encoded decision; the date is excluded from the features.
target_column = 'decision'
y = conventional_model_df[target_column]
X = conventional_model_df.drop(columns=[target_column, 'date'])

# Hold out 20% of the rows for the final test-set metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

# Columns that need one-hot encoding; everything else is passed through as-is.
categorical_features = ['type of insurance', 'gender']
features = X_train.columns

### Logistic Regression with Lasso Regularization


In [4]:
# Model selection metric: F-beta with beta=2.
f2_scorer = make_scorer(fbeta_score, beta=2)

# One-hot encode the categorical columns (dropping the first level of each)
# and pass the remaining columns through unchanged.
log_reg_lasso_encoder = ColumnTransformer(
    transformers=[('one_hot', OneHotEncoder(drop='first'), categorical_features)],
    remainder='passthrough',
)

# Encode -> interaction terms -> standardise -> L1-penalised logistic regression.
# The polynomial degree is left at its default here and tuned by the grid search.
log_reg_lasso_pipe = Pipeline(steps=[
    ('pre_processing', log_reg_lasso_encoder),
    ('poly', PolynomialFeatures(interaction_only=True, include_bias=False)),
    ('scaler', StandardScaler()),
    ('log_reg_lasso', LogisticRegression(
        penalty='l1', solver='liblinear', max_iter=10000, random_state=seed
    )),
])

# Search over the interaction degree and the inverse regularisation strength.
log_reg_lasso_param_grid = dict(
    poly__degree=[2, 3, 4],
    log_reg_lasso__C=[0.01, 0.1, 1, 10],
)

log_reg_lasso_grid_search = GridSearchCV(
    estimator=log_reg_lasso_pipe,
    param_grid=log_reg_lasso_param_grid,
    scoring=f2_scorer,
    cv=5,
)
log_reg_lasso_grid_search.fit(X_train, y_train)


In [5]:
# Best pipeline found by the grid search, and the parameters that produced it.
best_model = log_reg_lasso_grid_search.best_estimator_
log_reg_lasso_best_params = log_reg_lasso_grid_search.best_params_

# Fitted steps of the winning pipeline.
preprocessor, poly, scaler, log_reg = (
    best_model.named_steps[step_name]
    for step_name in ('pre_processing', 'poly', 'scaler', 'log_reg_lasso')
)
coefficients = log_reg.coef_[0]

# Rebuild the column names in the order the ColumnTransformer emits them:
# one-hot encoded columns first, then the passed-through columns.
categorical_feature_names = preprocessor.transformers_[0][1].get_feature_names_out(categorical_features)
numeric_feature_names = X_train.drop(columns=categorical_features).columns
all_feature_names = np.concatenate([categorical_feature_names, numeric_feature_names])

# Names of the interaction terms, aligned with the coefficient vector.
poly_feature_names = poly.get_feature_names_out(all_feature_names)

# Coefficient table ordered by magnitude, largest first.
feature_coef_df = pd.DataFrame(
    {'Feature': poly_feature_names, 'Coefficient': coefficients}
).sort_values(by='Coefficient', key=np.abs, ascending=False)

In [6]:
# Predict on the held-out set with the refitted best logistic-regression pipeline.
y_pred = log_reg_lasso_grid_search.predict(X_test)

# Compute every metric first ...
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f2 = fbeta_score(y_test, y_pred, beta=2)

# ... then report them together, one per line.
print(
    f"For Logistic Regression, the accuracy on the test set is: {accuracy}",
    f"For Logistic Regression, the recall on the test set is: {recall}",
    f"For Logistic Regression, the precision on the test set is: {precision}",
    f"For Logistic Regression, the F-2 score on the test set is: {f2}",
    sep="\n",
)

For Logistic Regression, the accuracy on the test set is: 0.6597222222222222
For Logistic Regression, the recall on the test set is: 0.8587786259541985
For Logistic Regression, the precision on the test set is: 0.6119673617407072
For Logistic Regression, the F-2 score on the test set is: 0.7946785966564633


### Random Forest

In [7]:
# Same preprocessing as the logistic-regression pipeline: one-hot encode the
# categorical columns and pass the remaining columns through unchanged.
rf_encoder = ColumnTransformer(
    transformers=[('one_hot', OneHotEncoder(drop='first'), categorical_features)],
    remainder='passthrough',
)

# Encode -> interaction terms -> standardise -> random forest.
# The polynomial degree is left at its default here and tuned by the grid search.
rf_pipe = Pipeline(steps=[
    ('pre_processing', rf_encoder),
    ('poly', PolynomialFeatures(interaction_only=True, include_bias=False)),
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=seed)),
])

# Search over the interaction degree, the forest size and the tree depth.
rf_param_grid = dict(
    poly__degree=[2, 3, 4],
    rf__n_estimators=[100, 200, 300],
    rf__max_depth=[4, 6, 8, 10, 12],
    rf__criterion=['gini'],
)

rf_grid_search = GridSearchCV(
    estimator=rf_pipe,
    param_grid=rf_param_grid,
    scoring=f2_scorer,
    cv=5,
    n_jobs=-1,
)
rf_grid_search.fit(X_train, y_train)

In [8]:
# Predict on the held-out set with the refitted best random-forest pipeline.
y_pred = rf_grid_search.predict(X_test)

# Compute every metric first ...
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f2 = fbeta_score(y_test, y_pred, beta=2)

# ... then report them together, one per line.
print(
    f"For Random Forest, the accuracy on the test set is: {accuracy}",
    f"For Random Forest, the recall on the test set is: {recall}",
    f"For Random Forest, the precision on the test set is: {precision}",
    f"For Random Forest, the F-2 score on the test set is: {f2}",
    sep="\n",
)

For Random Forest, the accuracy on the test set is: 0.6628787878787878
For Random Forest, the recall on the test set is: 0.8015267175572519
For Random Forest, the precision on the test set is: 0.625
For Random Forest, the F-2 score on the test set is: 0.7586705202312138


## 2. Deep Learning Model (Neural Network)

In [9]:
# Load the data for the neural network and discard incomplete rows.
nn_df = pd.read_csv('embeddings_num_df.csv').dropna()

# Separate the target from the inputs.
y = nn_df['decision']
X = nn_df.drop(columns=['decision'])

# Both splits hold out 20% with the same seed: first a test set, then a
# validation set carved out of the remaining training rows for tuning.
split_options = dict(test_size=0.2, random_state=seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train_nn, X_val_nn, y_train_nn, y_val_nn = train_test_split(X_train, y_train, **split_options)

In [10]:
def build(hp):
    """Build and compile the binary classifier for one hyperparameter set.

    The network has three hidden Dense layers followed by a single sigmoid
    output unit. For hidden layer ``i`` (1-3) the tuner chooses:

    * ``units_i``: layer width, 16 to 256 in steps of 16;
    * ``activation_i``: ``'sigmoid'`` or ``'relu'``.

    The input width is read from the notebook-level ``X_train_nn``.

    Args:
        hp: hyperparameter container supplying ``Int`` and ``Choice``.

    Returns:
        The model compiled with Adam (learning rate 0.001) and binary
        cross-entropy loss.
    """
    inputs = Input(shape=(X_train_nn.shape[1],))

    # Stack the hidden layers; width is registered before activation for each
    # layer, in layer order.
    hidden = inputs
    for layer_no in (1, 2, 3):
        n_units = hp.Int(f'units_{layer_no}', min_value=16, max_value=256, step=16)
        activation = hp.Choice(f'activation_{layer_no}', values=['sigmoid', 'relu'])
        hidden = Dense(units=n_units, activation=activation)(hidden)

    prediction = Dense(1, activation='sigmoid')(hidden)

    model_nn = Model(inputs=inputs, outputs=prediction)
    model_nn.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy')
    return model_nn

# Random search over the space defined in `build`, scored on validation loss.
# Results are stored under nn_tuning/nn_tuning and reloaded from there when
# the tuner is re-created.
tuner = RandomSearch(
    build,
    objective='val_loss',
    max_trials=20,
    executions_per_trial=2,
    directory='nn_tuning',
    project_name='nn_tuning'
)

# Reuse stored tuning results when they exist. On a fresh checkout there are
# no trials to read, so run the search instead of failing on an empty result
# list.
best_candidates = tuner.get_best_hyperparameters(num_trials=1)
if not best_candidates:
    tuner.search(X_train_nn,
                 y_train_nn,
                 epochs=20,
                 batch_size=128,
                 validation_data=(X_val_nn, y_val_nn),
                 verbose=1)
    best_candidates = tuner.get_best_hyperparameters(num_trials=1)

best_hyperparameters = best_candidates[0]
print("Best hyperparameters: ", best_hyperparameters.values)

# Re-seed every RNG immediately before building the final model.
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# Refit the winning architecture on the full training split (train + validation).
best_nn_model = build(best_hyperparameters)
best_nn_model.fit(X_train, y_train, epochs=20, batch_size=128, verbose=1)

Reloading Tuner from nn_tuning\nn_tuning\tuner0.json
Best hyperparameters:  {'units_1': 16, 'activation_1': 'sigmoid', 'units_2': 256, 'activation_2': 'relu', 'units_3': 96, 'activation_3': 'relu'}
Epoch 1/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.6935  
Epoch 2/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.6916
Epoch 3/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.6915  
Epoch 4/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.6914
Epoch 5/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.6912
Epoch 6/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.6910
Epoch 7/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.6907
Epoch 8/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.6902
Epoch 

<keras.src.callbacks.history.History at 0x26f57173850>

In [11]:
# Turn the predicted probabilities into hard labels at the 0.5 threshold.
y_pred = np.where(best_nn_model.predict(X_test) >= 0.5, 1, 0)

# Compute every metric first ...
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f2 = fbeta_score(y_test, y_pred, beta=2)

# ... then report them together, one per line.
print(
    f"For Neural Network, the accuracy on the test set is: {accuracy}",
    f"For Neural Network, the recall on the test set is: {recall}",
    f"For Neural Network, the precision on the test set is: {precision}",
    f"For Neural Network, the F-2 score on the test set is: {f2}",
    sep="\n",
)

[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
For Neural Network, the accuracy on the test set is: 0.5134831460674157
For Neural Network, the recall on the test set is: 0.42531356898517675
For Neural Network, the precision on the test set is: 0.507482993197279
For Neural Network, the F-2 score on the test set is: 0.43954748998350224
