In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# HIP_CER_df = pd.read_pickle('data/HIP_CER.pkl')
# ICC_df = pd.read_pickle('data/ICC_rms.pkl')
# SIMS_df = pd.read_pickle('data/SIMS.pkl')
# BIG_df = pd.read_pickle('data/30k_data.pkl')

In [84]:
class Object:
    """Container that loads a pickled DataFrame and prepares it for classification.

    If the frame has a ``type`` column it is treated as the target: ``X`` holds
    the remaining columns, ``y`` the raw target and ``labels`` the integer-encoded
    target. Otherwise ``X`` is a copy of the whole frame and ``y``/``labels``
    stay ``None``.

    Attributes:
        data: The DataFrame read from ``file_path`` (feature columns are
            standardized in place when ``normalize=True``).
        X, y, labels: Features, raw target and encoded target (see above).
        X_train, X_test, y_train, y_test: Filled by ``split_dataset``; ``None``
            until then.
        model, y_pred: Filled by ``tune_xgboost``; ``None`` until then.
    """

    # Name of the column holding the class to predict.
    _TARGET = 'type'

    def __init__(self, file_path, normalize=False):
        """Load the dataset and derive features / labels.

        Args:
            file_path: Path of a pickle file readable by ``pd.read_pickle``.
            normalize: When True, standardize the numeric feature columns.
        """
        # NOTE(review): unpickling can execute arbitrary code; only load
        # pickle files that come from a trusted source.
        self.data = pd.read_pickle(file_path)

        if normalize:
            self._normalize_data()

        # Create every attribute up front so they exist whichever branch runs
        # below (previously the unlabelled branch left several undefined).
        self.labels = None
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        self.model = None
        self.y_pred = None

        if self._TARGET in self.data.columns:
            self.X = self.data.drop(self._TARGET, axis=1).copy()
            self.y = self.data[self._TARGET].copy()
            self._create_class_labels()
        else:
            self.X = self.data.copy()
            self.y = None

    def _normalize_data(self):
        """Standardize the float64/int64 feature columns of ``self.data`` in place.

        The target column is never scaled, even when it is numeric, so the
        class values survive normalization.

        NOTE(review): the scaler is fitted on the full dataset before any
        train/test split, so test-set statistics influence the training
        features.
        """
        numerical_columns = (
            self.data.select_dtypes(include=['float64', 'int64'])
            .columns
            .drop(self._TARGET, errors='ignore')
        )
        print(f'Normalizing {len(numerical_columns)} columns...')
        if len(numerical_columns) == 0:
            # Nothing to scale; StandardScaler rejects a zero-feature input.
            return
        scl = StandardScaler()
        self.data[numerical_columns] = scl.fit_transform(self.data[numerical_columns])

    def _create_class_labels(self):
        """Encode ``self.y`` as integer class labels stored in ``self.labels``."""
        self.labels = LabelEncoder().fit_transform(self.y)

    def split_dataset(self, test_size=0.20, random_state=42):
        """Create a stratified train/test split of ``X`` and ``labels``.

        Args:
            test_size: Fraction (or count) of samples placed in the test set.
            random_state: Seed forwarded to ``train_test_split``.

        Raises:
            ValueError: If the dataset has no target column to split on.
        """
        if self.labels is None:
            raise ValueError(
                f"Cannot split: the dataset has no '{self._TARGET}' column to use as labels."
            )
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.labels,
            stratify=self.labels,
            test_size=test_size,
            random_state=random_state,
        )
        print(self.X_train.shape, self.X_test.shape)

    def tune_xgboost(self, n_iter=10, cv=3, random_state=42):
        """Randomized hyper-parameter search for an XGBoost classifier.

        Stores the best estimator in ``self.model`` and its test-set
        predictions in ``self.y_pred``, then prints the test accuracy.

        Args:
            n_iter: Number of parameter settings that are sampled.
            cv: Number of cross-validation folds.
            random_state: Seed for the parameter sampling.

        Raises:
            RuntimeError: If ``split_dataset`` has not been called yet.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError('Call split_dataset() before tune_xgboost().')

        # Define the parameter grid for RandomizedSearchCV
        param_grid = {
            'n_estimators': [100, 300, 600, 800],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5, 8, 12],
            'subsample': np.arange(0.5, 1.0, 0.1),
            'colsample_bytree': np.arange(0.5, 1.0, 0.1),
            'gamma': [0.01, 0.1, 0, 1, 2]
        }

        # Create a RandomizedSearchCV object
        random_search = RandomizedSearchCV(
            estimator=XGBClassifier(),
            param_distributions=param_grid,
            n_iter=n_iter,
            scoring='accuracy',
            n_jobs=-1,  # Use all available cores
            cv=cv,
            verbose=999,
            random_state=random_state
        )

        # Fit the RandomizedSearchCV object to the training data
        random_search.fit(self.X_train, self.y_train)
        # With the default refit=True the search has already retrained the
        # best configuration on the whole training set, so reuse that model
        # instead of fitting an identical one a second time.
        self.model = random_search.best_estimator_
        # Make predictions on the test data
        self.y_pred = self.model.predict(self.X_test)
        # Evaluate the accuracy of the classifier
        accuracy = accuracy_score(self.y_test, self.y_pred)
        print("Accuracy:", accuracy)

    def __str__(self) -> str:
        """Return a one-line summary with the shape of the loaded frame."""
        return f'Shape of dataset: {self.data.shape}'

# Load 'data/ICC_rms.pkl' without normalization, then hold out 20% of the
# rows as a stratified test set.
MyData = Object(file_path='data/ICC_rms.pkl')
MyData.split_dataset(test_size=0.2)

(1235, 511) (309, 511)


## ANN: feed-forward binary classifier (PyTorch)

In [85]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the neural network model
class BinaryClassifier(nn.Module):
    """Feed-forward network with one hidden layer and a sigmoid output.

    Layers, in order: Linear(input_size -> hidden_size), ReLU,
    Linear(hidden_size -> 1), Sigmoid.
    """

    def __init__(self, input_size, hidden_size):
        """Build the layers.

        Args:
            input_size: Number of input features per sample.
            hidden_size: Number of units in the hidden layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(in_features=hidden_size, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid-activated output, one value per input row."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.linear(hidden))

In [86]:
# Build float32 feature tensors from the train/test split held by MyData.
X_train = torch.tensor(MyData.X_train.values, dtype=torch.float32)
X_test = torch.tensor(MyData.X_test.values, dtype=torch.float32)

# Labels are converted to float32 and reshaped into column vectors
# (one row per sample, a single column).
y_train = torch.tensor(MyData.y_train, dtype=torch.float32).reshape(-1, 1)
y_test = torch.tensor(MyData.y_test, dtype=torch.float32).reshape(-1, 1)

In [91]:
# Seed PyTorch so the weight initialisation (and therefore the whole run)
# is reproducible after Restart & Run All.
torch.manual_seed(42)

# Create an instance of the model
input_size = X_train.shape[1]  # Number of features in the input data
hidden_size = 500  # Number of units in the hidden layer
model = BinaryClassifier(input_size, hidden_size)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop: full-batch gradient descent, i.e. every epoch is a single
# optimizer step over the entire training set.
num_epochs = 3000
model.train()
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# Test the model: switch to evaluation mode and disable gradient tracking,
# since no backward pass is needed for inference.
model.eval()
with torch.no_grad():
    predictions = model(X_test)

# Round the sigmoid outputs to hard 0/1 class predictions.
y_pred = predictions.round().numpy()

Epoch 100/3000, Loss: 0.11698872596025467
Epoch 200/3000, Loss: 0.020172789692878723
Epoch 300/3000, Loss: 0.006545350421220064
Epoch 400/3000, Loss: 0.0031328287441283464
Epoch 500/3000, Loss: 0.0018331216415390372
Epoch 600/3000, Loss: 0.0012096018763259053
Epoch 700/3000, Loss: 0.0008581866277381778
Epoch 800/3000, Loss: 0.0006412052898667753
Epoch 900/3000, Loss: 0.0004973167669959366
Epoch 1000/3000, Loss: 0.00039650476537644863
Epoch 1100/3000, Loss: 0.00032333636772818863
Epoch 1200/3000, Loss: 0.00026832229923456907
Epoch 1300/3000, Loss: 0.0002258243621326983
Epoch 1400/3000, Loss: 0.0001924664684338495
Epoch 1500/3000, Loss: 0.00016573506582062691
Epoch 1600/3000, Loss: 0.00014400333748199046
Epoch 1700/3000, Loss: 0.00012603589857462794
Epoch 1800/3000, Loss: 0.0001110424636863172
Epoch 1900/3000, Loss: 9.835592936724424e-05
Epoch 2000/3000, Loss: 8.762104698689654e-05
Epoch 2100/3000, Loss: 7.841119804652408e-05
Epoch 2200/3000, Loss: 7.047499093459919e-05
Epoch 2300/3000, 

In [92]:
# Fraction of test samples whose rounded prediction matches the true label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.7184466019417476
