# In [72]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from imblearn.datasets import make_imbalance
from imblearn.over_sampling import RandomOverSampler
from keras.layers import Dense
from keras.models import Sequential
from sklearn import svm, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, roc_auc_score, recall_score, roc_curve, auc, cohen_kappa_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


# Load the dataset from the Excel workbook. read_excel already returns a
# DataFrame for a single sheet, so no extra pd.DataFrame(...) wrapping is needed.
df3 = pd.read_excel('/Users/dawidarakelyan/Desktop/Capstone final/testing_findal.xlsx')

# Drop columns that are not needed for the plots or the models below
df3 = df3.drop(columns=['Unnamed: 0', 'id', 'user id', 'Cont_start'])

# Frequency tables behind the three bar charts
car_make_counts = df3['car'].value_counts().head(10)  # ten most frequent makes only
car_class_counts = df3['car_class'].value_counts()
group_counts = df3['group'].value_counts()

# One row per bar chart: (frequency table, figure title, x-axis label).
# The charts are drawn in this order, each in its own figure.
bar_chart_specs = [
    (car_make_counts, 'Top 10 Car Makes', 'Car Make'),
    (car_class_counts, 'Car Class Distribution', 'Car Class'),
    (group_counts, 'Group Distribution', 'Group'),
]
for counts, chart_title, x_label in bar_chart_specs:
    plt.figure(figsize=(10, 5))
    sns.barplot(x=counts.index, y=counts.values, palette='Blues_r')
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel('Count')
    plt.xticks(rotation=45)  # tilt the category labels
    plt.show()

# Histogram of 'd_age' with a KDE overlay, coloured by the 'Target' column
plt.figure(figsize=(10, 5))
sns.histplot(data=df3, x='d_age', hue='Target', kde=True, palette='Blues_r')
plt.title('Age Distribution for Targets')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

# Label vector: the 'Target' column on its own
y = df3['Target']

# Feature matrix: every other column, with the categorical ones expanded into
# one-hot indicator columns.
# NOTE(review): 'gender ' has a trailing space; presumably it mirrors the
# spreadsheet header exactly - verify against the data before "fixing" it.
categorical_columns = ['car', 'year', 'month', 'Region', 'Root', 'gender ', 'car_class']
X = pd.get_dummies(df3.drop(columns='Target'), columns=categorical_columns)

# Any value still missing after encoding is replaced by 0
X = X.fillna(0)

# Split data into training and testing sets: one stratified, shuffled 80/20
# split with a fixed seed so the partition is reproducible.
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=0)

# n_splits=1 produces exactly one (train, test) pair of positional indices,
# so take it directly instead of looping.
train_index, test_index = next(stratified_split.split(X, y))
strat_train_X, strat_test_X = X.iloc[train_index], X.iloc[test_index]
strat_train_y, strat_test_y = y.iloc[train_index], y.iloc[test_index]

# Standardise the features. The scaler is fitted on the training rows only and
# then re-used for the test rows, so no test-set statistics enter the fit.
scaler = StandardScaler()
X_train_st = scaler.fit_transform(strat_train_X)
X_test_st = scaler.transform(strat_test_X)

#XGBooster

# Parameter grid for XGBoost. Every entry holds a single value, so the "search"
# cross-validates exactly one configuration.
param_grid_xgb = {
    "learning_rate": [0.2],
    "max_depth": [50],
    "n_estimators": [20],
    "subsample": [0.5],
    "colsample_bytree": [1.0],
    "gamma": [0.1],
    "scale_pos_weight": [5],
}

# Create a XGBoost classifier and run a 5-fold grid search scored by recall
model = XGBClassifier()
grid_search = GridSearchCV(model, param_grid_xgb, cv=5, scoring='recall')
grid_search.fit(X_train_st, strat_train_y)

# Print the fitted search object (its configuration, not the CV scores)
print(grid_search)

expected_y = strat_test_y
predicted_y = grid_search.predict(X_test_st)
# Positive-class probabilities. A ROC curve needs a continuous score; feeding it
# the thresholded 0/1 predictions collapses the curve to a single operating point.
proba_y_xgb = grid_search.predict_proba(X_test_st)[:, 1]

# Test-set accuracy in percent (printed: a bare expression shows nothing in a script)
acc_xgb = accuracy_score(expected_y, predicted_y) * 100
print('XGB accuracy: %0.2f%%' % acc_xgb)

print(metrics.classification_report(expected_y, predicted_y))
print(metrics.confusion_matrix(expected_y, predicted_y))

fpr, tpr, thresholds = roc_curve(expected_y, proba_y_xgb)
# Kappa measures agreement between hard labels, so it keeps using predicted_y
kappa = cohen_kappa_score(expected_y, predicted_y)
# Compute area under the curve (AUC)
roc_auc = auc(fpr, tpr)

# Plot ROC curve and kappa score
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('XGB (ROC) Curve\nCohen\'s kappa = %0.2f' % kappa)
plt.legend(loc="lower right")
plt.show()

#DT

# Parameter grid for the decision tree (a single candidate configuration).
# max_features: 'auto' was an alias for 'sqrt' on classifiers and has been
# removed from recent scikit-learn releases, so the explicit name is used.
param_grid_dtc = {'criterion': ['gini'],
                  'max_depth': [1000],
                  'min_samples_split': [200],
                  'min_samples_leaf': [100],
                  'max_features': ['sqrt'],
                  'splitter': ['best'],
                  'max_leaf_nodes': [1000],
                  'min_impurity_decrease': [0.0],
                  'class_weight': ['balanced']}

# Create Decision Tree model and run a 10-fold grid search scored by recall
clf = DecisionTreeClassifier()
grid_search_dtc = GridSearchCV(estimator=clf, param_grid=param_grid_dtc, cv=10, scoring='recall')
grid_search_dtc.fit(X_train_st, strat_train_y)
print(grid_search_dtc)

expected_y_dtc = strat_test_y
predicted_y_dtc = grid_search_dtc.predict(X_test_st)
# Positive-class probabilities. A ROC curve needs a continuous score; feeding it
# the thresholded 0/1 predictions collapses the curve to a single operating point.
proba_y_dtc = grid_search_dtc.predict_proba(X_test_st)[:, 1]

# Test-set accuracy in percent (printed: a bare expression shows nothing in a script)
acc_dtc = accuracy_score(expected_y_dtc, predicted_y_dtc) * 100
print('DT accuracy: %0.2f%%' % acc_dtc)

print(metrics.classification_report(expected_y_dtc, predicted_y_dtc))
print(metrics.confusion_matrix(expected_y_dtc, predicted_y_dtc))

fpr1, tpr1, thresholds1 = roc_curve(expected_y_dtc, proba_y_dtc)
# Kappa measures agreement between hard labels, so it keeps using the predictions
kappa1 = cohen_kappa_score(expected_y_dtc, predicted_y_dtc)
# Compute area under the curve (AUC)
roc_auc1 = auc(fpr1, tpr1)

# Plot ROC curve and kappa score
plt.figure()
plt.plot(fpr1, tpr1, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc1)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('DT (ROC) Curve\nCohen\'s kappa = %0.2f' % kappa1)
plt.legend(loc="lower right")
plt.show()

#RF

# Parameter grid for the random forest (a single candidate configuration)
param_grid_rf = {'n_estimators': [100],
                 'max_depth': [45],
                 'min_samples_split': [50],
                 'min_samples_leaf': [20],
                 'max_features': ['sqrt']}

# Create Random Forest model and run a 10-fold grid search scored by recall
rf_model = RandomForestClassifier()
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=10, scoring='recall')
grid_search_rf.fit(X_train_st, strat_train_y)
print(grid_search_rf)

expected_y_rf = strat_test_y
predicted_y_rf = grid_search_rf.predict(X_test_st)
# Positive-class probabilities. A ROC curve needs a continuous score; feeding it
# the thresholded 0/1 predictions collapses the curve to a single operating point.
proba_y_rf = grid_search_rf.predict_proba(X_test_st)[:, 1]

# Test-set accuracy in percent (printed: a bare expression shows nothing in a script)
acc_rf = accuracy_score(expected_y_rf, predicted_y_rf) * 100
print('RF accuracy: %0.2f%%' % acc_rf)

print(metrics.classification_report(expected_y_rf, predicted_y_rf))
print(metrics.confusion_matrix(expected_y_rf, predicted_y_rf))

fpr2, tpr2, thresholds2 = roc_curve(expected_y_rf, proba_y_rf)
# Kappa measures agreement between hard labels, so it keeps using the predictions
kappa2 = cohen_kappa_score(expected_y_rf, predicted_y_rf)
# Compute area under the curve (AUC)
roc_auc2 = auc(fpr2, tpr2)

# Plot ROC curve and kappa score
plt.figure()
plt.plot(fpr2, tpr2, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc2)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('RF (ROC) Curve\nCohen\'s kappa = %0.2f' % kappa2)
plt.legend(loc="lower right")
plt.show()

#LR

# Parameter grid for logistic regression (a single candidate configuration)
hyperparameters = {
    'penalty': ['l2'],
    'C': [0.1],
    'solver': ['sag'],
    'max_iter': [200],
}

# Create Logistic Regression model and run a 5-fold grid search scored by recall
lr_model = LogisticRegression()
grid_search_lr = GridSearchCV(lr_model, hyperparameters, cv=5, verbose=0, scoring='recall')
grid_search_lr.fit(X_train_st, strat_train_y)
print(grid_search_lr)

expected_y_lr = strat_test_y
predicted_y_lr = grid_search_lr.predict(X_test_st)
# Positive-class probabilities. A ROC curve needs a continuous score; feeding it
# the thresholded 0/1 predictions collapses the curve to a single operating point.
proba_y_lr = grid_search_lr.predict_proba(X_test_st)[:, 1]

# Test-set accuracy in percent (printed: a bare expression shows nothing in a script)
acc_lr = accuracy_score(expected_y_lr, predicted_y_lr) * 100
print('LR accuracy: %0.2f%%' % acc_lr)

print(metrics.classification_report(expected_y_lr, predicted_y_lr))
print(metrics.confusion_matrix(expected_y_lr, predicted_y_lr))

fpr3, tpr3, thresholds3 = roc_curve(expected_y_lr, proba_y_lr)
# Kappa measures agreement between hard labels, so it keeps using the predictions
kappa3 = cohen_kappa_score(expected_y_lr, predicted_y_lr)
# Compute area under the curve (AUC)
roc_auc3 = auc(fpr3, tpr3)

# Plot ROC curve and kappa score
plt.figure()
plt.plot(fpr3, tpr3, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc3)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('LR (ROC) Curve\nCohen\'s kappa = %0.2f' % kappa3)
plt.legend(loc="lower right")
plt.show()

#ANN

# define the neural network model architecture: two ReLU hidden layers
# (350 and 160 units) followed by a single sigmoid output unit.
# The input width is read from the scaled training matrix rather than being
# hard-coded, so the network keeps working if the number of one-hot columns changes.
n_features = X_train_st.shape[1]
classifier = Sequential()
classifier.add(Dense(units=350, kernel_initializer="uniform", activation="relu", input_dim=n_features))
classifier.add(Dense(units=160, kernel_initializer="uniform", activation="relu"))
classifier.add(Dense(units=1, kernel_initializer="uniform", activation="sigmoid"))

# compile the model
classifier.compile(optimizer="Adagrad", loss="binary_crossentropy", metrics=['Recall'])

# train the model
classifier.fit(X_train_st, strat_train_y, batch_size=64, epochs=45)

# make predictions on the test set.
# The raw sigmoid outputs are kept for the ROC curve; the 0.5-thresholded
# version is what the label-based metrics use.
Y_proba = classifier.predict(X_test_st)
expected_y_ann = strat_test_y
Y_predict = (Y_proba > 0.5)

print(metrics.classification_report(expected_y_ann, Y_predict))
print(metrics.confusion_matrix(expected_y_ann, Y_predict))

# ROC needs the continuous score; ravel() flattens the single output column
fpr4, tpr4, thresholds4 = roc_curve(expected_y_ann, Y_proba.ravel())
kappa4 = cohen_kappa_score(expected_y_ann, Y_predict)
# Compute area under the curve (AUC)
roc_auc4 = auc(fpr4, tpr4)

# Plot ROC curve and kappa score
plt.figure()
plt.plot(fpr4, tpr4, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc4)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ANN (ROC) Curve\nCohen\'s kappa = %0.2f' % kappa4)
plt.legend(loc="lower right")
plt.show()




# Per-model classification reports as nested dicts (label-based metrics)
xgb_report = metrics.classification_report(expected_y, predicted_y, output_dict=True)
dtc_report = metrics.classification_report(expected_y_dtc, predicted_y_dtc, output_dict=True)
rf_report = metrics.classification_report(expected_y_rf, predicted_y_rf, output_dict=True)
lr_report = metrics.classification_report(expected_y_lr, predicted_y_lr, output_dict=True)
ann_report = metrics.classification_report(expected_y_ann, Y_predict, output_dict=True)

# Cohen's kappa between true and predicted labels for each model
kappa = cohen_kappa_score(expected_y, predicted_y)
kappa1 = cohen_kappa_score(expected_y_dtc, predicted_y_dtc)
kappa2 = cohen_kappa_score(expected_y_rf, predicted_y_rf)
kappa3 = cohen_kappa_score(expected_y_lr, predicted_y_lr)
kappa4 = cohen_kappa_score(expected_y_ann, Y_predict)

# Continuous positive-class scores from the fitted models. AUC-ROC is a ranking
# metric: computing it from thresholded 0/1 predictions only measures a single
# operating point, so the scores are taken from the estimators directly.
xgb_scores = grid_search.predict_proba(X_test_st)[:, 1]
dtc_scores = grid_search_dtc.predict_proba(X_test_st)[:, 1]
rf_scores = grid_search_rf.predict_proba(X_test_st)[:, 1]
lr_scores = grid_search_lr.predict_proba(X_test_st)[:, 1]
ann_scores = classifier.predict(X_test_st).ravel()  # sigmoid output, flattened to 1-D

# calculate AUC-ROC score for each model
xgb_auc = metrics.roc_auc_score(expected_y, xgb_scores)
dtc_auc = metrics.roc_auc_score(expected_y_dtc, dtc_scores)
rf_auc = metrics.roc_auc_score(expected_y_rf, rf_scores)
lr_auc = metrics.roc_auc_score(expected_y_lr, lr_scores)
ann_auc = metrics.roc_auc_score(expected_y_ann, ann_scores)
#svm_auc = metrics.roc_auc_score(expected_y_svm, predicted_y_svm)

# create a table with one row per model; precision/recall/F1 are the
# macro averages taken from each classification report
reports = [xgb_report, dtc_report, rf_report, lr_report, ann_report]
table_data = {
    'Model': ['XGBoost', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'ANN'],
    'Accuracy': [report['accuracy'] for report in reports],
    'AUC': [xgb_auc, dtc_auc, rf_auc, lr_auc, ann_auc],
    'Kappa': [kappa, kappa1, kappa2, kappa3, kappa4],
    'Precision': [report['macro avg']['precision'] for report in reports],
    'Recall': [report['macro avg']['recall'] for report in reports],
    'F1-score': [report['macro avg']['f1-score'] for report in reports],
}

# create Pandas DataFrame from the table data
table_df = pd.DataFrame(table_data)

# print the table
print(table_df)


# Define data for each model.
# 'predicted' holds the hard labels (used for kappa); 'scores' holds continuous
# positive-class scores from the fitted estimators (used for the ROC curve,
# which degenerates to a single point when given 0/1 predictions).
models = [
    {'name': 'XGB', 'expected': expected_y, 'predicted': predicted_y,
     'scores': grid_search.predict_proba(X_test_st)[:, 1]},
    {'name': 'DT', 'expected': expected_y_dtc, 'predicted': predicted_y_dtc,
     'scores': grid_search_dtc.predict_proba(X_test_st)[:, 1]},
    {'name': 'RF', 'expected': expected_y_rf, 'predicted': predicted_y_rf,
     'scores': grid_search_rf.predict_proba(X_test_st)[:, 1]},
    {'name': 'LR', 'expected': expected_y_lr, 'predicted': predicted_y_lr,
     'scores': grid_search_lr.predict_proba(X_test_st)[:, 1]},
    {'name': 'ANN', 'expected': expected_y_ann, 'predicted': Y_predict,
     'scores': classifier.predict(X_test_st).ravel()},
]

# Define subplots: one panel per model, side by side
fig, axes = plt.subplots(nrows=1, ncols=len(models), figsize=(20, 5))

# Loop through each model and plot its ROC curve.
# Loop-local names are deliberately distinct from the module-level `model`,
# `fpr`, `tpr`, `thresholds`, `roc_auc` and `kappa` so the XGBoost estimator
# and its metrics are not overwritten by this plotting loop.
for ax, entry in zip(axes, models):
    # Calculate fpr and tpr (the thresholds are not needed for the plot)
    entry_fpr, entry_tpr, _ = roc_curve(entry['expected'], entry['scores'])
    # Calculate AUC and kappa score
    entry_auc = auc(entry_fpr, entry_tpr)
    entry_kappa = cohen_kappa_score(entry['expected'], entry['predicted'])
    # Plot ROC curve and kappa score
    ax.plot(entry_fpr, entry_tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % entry_auc)
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('%s (ROC) Curve\nCohen\'s kappa = %0.2f' % (entry['name'], entry_kappa))
    ax.legend(loc="lower right")

# Display the figure
plt.tight_layout()
plt.show()
