In [9]:
import os
import pandas as pd
import seaborn as sns
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import numpy as np

In [18]:
df = pd.read_csv('step_04/dataset_minmax.csv')
response = pd.read_csv('step_04/response.csv')

## Separación en ( train (80) + val (20) )[90] + test [10]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(df, response, test_size=0.1, random_state=255)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=255)

In [25]:
if not os.path.exists("data_separated"):
    os.mkdir("data_separated")

X_train.to_csv("data_separated/train_data.csv", index=False)
X_val.to_csv("data_separated/val_data.csv", index=False)
X_test.to_csv("data_separated/test_data.csv", index=False)
y_train.to_csv("data_separated/y_train.csv", index=False)
y_val.to_csv("data_separated/y_val.csv", index=False)
y_test.to_csv("data_separated/y_test.csv", index=False)

# Model testing

In [20]:
from joblib import dump

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, matthews_corrcoef, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

In [21]:
class Models():
    knn_model = KNeighborsClassifier()
    dt_model = DecisionTreeClassifier()
    svm_model = SVC()
    rf_model = RandomForestClassifier()
    sgdc_model = SGDClassifier()
    ada_model = AdaBoostClassifier()
    
    def train_all(self, data, response):
        self.knn_model.fit(X=data, y=response)
        self.dt_model.fit(X=data, y=response)
        self.svm_model.fit(X=data, y=response)
        self.rf_model.fit(X=data, y=response)
        self.sgdc_model.fit(X=data, y=response)
        self.ada_model.fit(X=data, y=response)

    def save(self, sufix):
        dump(self.rf_model, f"models/rf_model_{sufix}.joblib")
        dump(self.svm_model, f"models/svm_model_{sufix}.joblib")
        dump(self.knn_model, f"models/knn_model_{sufix}.joblib")
        dump(self.dt_model, f"models/dt_model_{sufix}.joblib")
        dump(self.sgdc_model, f"models/sdgc_model_{sufix}.joblib")
        dump(self.ada_model, f"models/ada_model_{sufix}.joblib")
    
    def predict(self, X_val):
        predictions = []
        predictions.append(("knn",self.knn_model.predict(X_val)))
        predictions.append(("dt",self.dt_model.predict(X_val)))
        predictions.append(("svm",self.svm_model.predict(X_val)))
        predictions.append(("rf",self.rf_model.predict(X_val)))
        predictions.append(("sgdc",self.sgdc_model.predict(X_val)))
        predictions.append(("ada",self.ada_model.predict(X_val)))
        
        return predictions

    def get_metrics(self, X_val,y_true):
        predictions = self.predict(X_val)
        for name, y_pred in predictions:
            acc_value = accuracy_score(y_pred=y_pred, y_true=y_true) 
            recall_value = recall_score(y_pred=y_pred, y_true=y_true)
            precision_value = precision_score(y_pred=y_pred, y_true=y_true) 
            f1_value = f1_score(y_pred=y_pred, y_true=y_true)
            mcc_value = matthews_corrcoef(y_pred=y_pred, y_true=y_true)
            cm = confusion_matrix(y_pred=y_pred, y_true=y_true)
            print(f"{name} - acc: {acc_value}, recall: {recall_value}, precision: {precision_value}, f1: {f1_value}, mcc: {mcc_value}")
            print(f"{cm}\n")
    
        #return pd.DataFrame(
        #    [[self.scaler, self.algorithm, acc_value, recall_value, precision_value, f1_value, mcc_value, cm]],
        #    columns=["scaler", "algorithm", "acc", "recall", "precision", "f1", "mcc", "cm"]
        #)

In [22]:
model_instance = Models()

In [23]:
model_instance.train_all(df, response)

  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [24]:
model_instance.get_metrics(X_val, y_val)

knn - acc: 0.7887323943661971, recall: 0.7244897959183674, precision: 0.797752808988764, f1: 0.7593582887700535, mcc: 0.5739578806327564
[[97 18]
 [27 71]]

dt - acc: 1.0, recall: 1.0, precision: 1.0, f1: 1.0, mcc: 1.0
[[115   0]
 [  0  98]]

svm - acc: 0.9154929577464789, recall: 0.8979591836734694, precision: 0.9166666666666666, f1: 0.9072164948453608, mcc: 0.8297940905174076
[[107   8]
 [ 10  88]]

rf - acc: 1.0, recall: 1.0, precision: 1.0, f1: 1.0, mcc: 1.0
[[115   0]
 [  0  98]]

sgdc - acc: 0.8497652582159625, recall: 0.9183673469387755, precision: 0.7894736842105263, f1: 0.8490566037735849, mcc: 0.7091684706670041
[[91 24]
 [ 8 90]]

ada - acc: 0.8544600938967136, recall: 0.8469387755102041, precision: 0.8383838383838383, f1: 0.8426395939086294, mcc: 0.7073064379233173
[[99 16]
 [15 83]]



In [27]:
# Perform cross-validation on the RandomForestClassifier
rf_model = model_instance.rf_model
cv_scores = cross_val_score(rf_model, df, response.values.ravel(), cv=5)

# Print the cross-validation scores
print("Cross-validation scores for RandomForestClassifier:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))

# Analyze feature importance
feature_importances = rf_model.feature_importances_
importance_df = pd.DataFrame({
    'Feature': df.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print("Top 10 important features:")
print(importance_df.head(10))

Cross-validation scores for RandomForestClassifier: [0.52742616 0.62869198 0.59322034 0.52966102 0.61864407]
Mean cross-validation score: 0.5795287134377458
Top 10 important features:
      Feature  Importance
632     palb2    0.016306
636      cdh1    0.007198
1092   hsd3b2    0.006508
823     mapk1    0.006201
1078     hes6    0.005730
635       atm    0.005715
964      ctcf    0.005186
781      dph1    0.005081
640     stk11    0.004900
810   izumo1r    0.004686
