In [54]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

df = pd.read_csv("df_cleaned.csv")


y = df["BRCAstatesNew"]
X = df.drop(["BRCAstatesNew"], axis =1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [2]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [3]:
mlpc_tuned =  MLPClassifier(solver = "adam", activation= "logistic", alpha=0.001, hidden_layer_sizes=(100,100,100)).fit(X_train_scaled, y_train)

In [4]:
from catboost import CatBoostClassifier
catb_tuned = CatBoostClassifier(iterations = 100, learning_rate = 0.1, depth= 8).fit(X_train, y_train)

0:	learn: 1.4341892	total: 201ms	remaining: 19.9s
1:	learn: 1.1731409	total: 243ms	remaining: 11.9s
2:	learn: 0.9957901	total: 279ms	remaining: 9.02s
3:	learn: 0.8520169	total: 315ms	remaining: 7.55s
4:	learn: 0.7460226	total: 351ms	remaining: 6.67s
5:	learn: 0.6527086	total: 387ms	remaining: 6.06s
6:	learn: 0.5771133	total: 423ms	remaining: 5.62s
7:	learn: 0.5093634	total: 461ms	remaining: 5.3s
8:	learn: 0.4554925	total: 494ms	remaining: 5s
9:	learn: 0.4070857	total: 527ms	remaining: 4.74s
10:	learn: 0.3638837	total: 560ms	remaining: 4.53s
11:	learn: 0.3249048	total: 571ms	remaining: 4.19s
12:	learn: 0.2920385	total: 605ms	remaining: 4.05s
13:	learn: 0.2630651	total: 639ms	remaining: 3.92s
14:	learn: 0.2372707	total: 672ms	remaining: 3.81s
15:	learn: 0.2146019	total: 707ms	remaining: 3.71s
16:	learn: 0.1958251	total: 741ms	remaining: 3.62s
17:	learn: 0.1777140	total: 773ms	remaining: 3.52s
18:	learn: 0.1612204	total: 806ms	remaining: 3.44s
19:	learn: 0.1461168	total: 818ms	remaining: 

In [5]:
cart_tuned = DecisionTreeClassifier(max_depth=5, min_samples_split=3).fit(X_train, y_train)

In [6]:
gbm_tuned = GradientBoostingClassifier(learning_rate = 0.1, n_estimators= 100, max_depth=3 ).fit(X_train, y_train)

In [7]:
knn_tuned = KNeighborsClassifier(n_neighbors= 1).fit(X_train_scaled, y_train)

In [8]:
from lightgbm import LGBMClassifier
lgbm_tuned = LGBMClassifier(learning_rate= 0.1, max_depth= 2, n_estimators= 200).fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1322
[LightGBM] [Info] Number of data points in the train set: 2325, number of used features: 102
[LightGBM] [Info] Start training from score -0.186200
[LightGBM] [Info] Start training from score -2.682571
[LightGBM] [Info] Start training from score -2.947454
[LightGBM] [Info] Start training from score -7.751475
[LightGBM] [Info] Start training from score -3.097515
[LightGBM] [Info] Start training from score -6.142037
[LightGBM] [Info] Start training from score -6.652863


In [9]:
loj_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter = 1000).fit(X_train_scaled,y_train)


In [12]:
rf_tuned = RandomForestClassifier(max_features=48, min_samples_split=2,n_estimators=100).fit(X_train, y_train)

In [13]:
svm_tuned = SVC(kernel="linear", C = 1).fit(X_train_scaled,  y_train)

In [14]:
from xgboost import XGBClassifier
xgb_tuned = XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators= 100, subsample= 0.6).fit(X_train, y_train)

In [56]:
models = [knn_tuned,
         loj_model,
         svm_tuned,
         mlpc_tuned,
         cart_tuned,
         rf_tuned,
         gbm_tuned,
         catb_tuned,
         lgbm_tuned,
         xgb_tuned]
result = []
results = pd.DataFrame(columns = ["Models", "Train Accuracy", "Test Accuracy", "F1-score" ])

if 'X_test_scaled' in globals():
    X_test_used = X_test_scaled
else:
    X_test_used = X_test

if 'X_train_scaled' in globals():
    X_train_used = X_train_scaled
else:
    X_train_used = X_train
              
for model in models:
    names = model.__class__.__name__
    y_train_pred = model.predict(X_train_used)
    y_test_pred = model.predict(X_test_used)
    train_acc = accuracy_score(y_train, y_train_pred) * 100
    test_acc = accuracy_score(y_test, y_test_pred) * 100
    f1 = f1_score(y_test, y_test_pred, average='weighted') * 100
    result = pd.DataFrame([[names, train_acc, test_acc, f1 ]], columns = ["Models", "Train Accuracy", "Test Accuracy", "F1-score" ])
    results = pd.concat([results, result], ignore_index=True)



In [58]:
results

Unnamed: 0,Models,Train Accuracy,Test Accuracy,F1-score
0,KNeighborsClassifier,100.0,95.386158,95.022324
1,LogisticRegression,100.0,99.799398,99.733448
2,SVC,100.0,99.899699,99.850026
3,MLPClassifier,99.827957,99.699097,99.586298
4,DecisionTreeClassifier,100.0,100.0,100.0
5,RandomForestClassifier,99.956989,99.899699,99.849915
6,GradientBoostingClassifier,94.408602,94.48345,94.386797
7,CatBoostClassifier,99.956989,99.899699,99.850026
8,LGBMClassifier,99.870968,99.699097,99.599274
9,XGBClassifier,99.698925,99.598796,99.401541
