# Smote Method

This notebook will apply the SMOTE balancing method to the training data.

In [8]:
import pandas as pd

df_train = pd.read_csv('train.csv')
X_train = df_train.drop('Diabetes_binary', axis=1)
y_train = df_train['Diabetes_binary']

val_df = pd.read_csv('val.csv')
X_val = val_df.drop('Diabetes_binary', axis=1)
y_val = val_df['Diabetes_binary']

print("Pre-sampling class imbalance:")
print(y_train.value_counts(normalize=True))

Pre-sampling class imbalance:
0.0    0.5
1.0    0.5
Name: Diabetes_binary, dtype: float64


In [9]:
import pandas as pd
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=92)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

print("Class distribution after SMOTE:")
print(pd.Series(y_smote).value_counts(normalize=True))

columns = X_train.columns 
df_smote = pd.DataFrame(X_smote, columns=columns)
df_smote['Diabetes_binary'] = y_smote  

df_smote.to_csv('train.csv', index=False)

print("New training set saved as 'train.csv'.")


Class distribution after SMOTE:
0.0    0.5
1.0    0.5
Name: Diabetes_binary, dtype: float64
New training set saved as 'train.csv'.


In [3]:
df = pd.read_csv('train.csv')
target_variable = 'Diabetes_binary'  # Change 'target' to your target column name if different
class_distribution = df[target_variable].value_counts(normalize=True)
X_train = df.drop('Diabetes_binary', axis=1)
y_train = df['Diabetes_binary']

print("Class distribution in the updated training set:")
print(class_distribution)

Class distribution in the updated training set:
0.0    0.5
1.0    0.5
Name: Diabetes_binary, dtype: float64


In [7]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Initialize the MLPClassifier without class_weight (since it's not supported)
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),  # More layers and neurons
    max_iter=1000,
    activation='relu',
    solver='adam',
    random_state=42,
    learning_rate_init=0.001,  # Default value, but you can tweak this
    early_stopping=True,  # To prevent overfitting
    validation_fraction=0.1,  # Fraction of training data to use as validation set for early stopping
    n_iter_no_change=10  # Number of iterations with no improvement to wait before stopping
)

# Train the MLP on the resampled, scaled training data
mlp.fit(X_train, y_train)

# Predict labels for the test set
y_pred = mlp.predict(X_val)

# Evaluate the performance
print(classification_report(y_val, y_pred))



              precision    recall  f1-score   support

         0.0       0.95      0.67      0.79     43667
         1.0       0.28      0.80      0.42      7069

    accuracy                           0.69     50736
   macro avg       0.62      0.73      0.60     50736
weighted avg       0.86      0.69      0.74     50736



In [11]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Convert continuous values to binary class labels
#y_train_binary = (y_train > 0.5).astype(int)

# Perform PCA with 2 components
#pca = PCA(n_components=4)
#X_train_pca = pca.fit_transform(X_train)
#X_val_pca = pca.transform(X_val)

# Visualize the data
#plt.figure(figsize=(8, 6))
#plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=y_train_binary, cmap='viridis')
#plt.xlabel('Principal Component 1')
#plt.ylabel('Principal Component 2')
#plt.title('PCA of Training Data')
#plt.colorbar(label='Class')
#plt.show()


In [12]:
# Convert continuous values to binary class labels
y_train_binary = (y_train > 0.5).astype(int)

# Train SVM with linear kernel
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train_binary)

# Test on validation set
y_pred = svm_model.predict(X_val)

# Calculate evaluation metrics
conf_matrix = confusion_matrix(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Confusion Matrix:
[[29965 13702]
 [ 1580  5489]]
Accuracy: 0.6987937559129612
Precision: 0.2860194883018081
Recall: 0.7764888951761211
F1 Score: 0.418050266565118


<h1>Linear Kernel</h1>

In [10]:
import numpy as np

class LinearSVM:
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        y_ = np.where(y <= 0, -1, 1)
        
        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                    self.w -= self.lr * (2 * self.lambda_param * self.w - np.dot(x_i, y_[idx]))
                    self.b -= self.lr * (-y_[idx])

    def predict(self, X):
        linear_output = np.dot(X, self.w) - self.b
        return np.sign(linear_output)

In [13]:
def print_classification_report(y_true, y_pred):
    """
    Prints a simple classification report including precision, recall, and F1-score for each class.
    """
    classes = np.unique(y_true)
    print("Class\tPrecision\tRecall\t\tF1-Score")
    
    for cls in classes:
        tp = np.sum((y_pred == cls) & (y_true == cls))
        fp = np.sum((y_pred == cls) & (y_true != cls))
        fn = np.sum((y_pred != cls) & (y_true == cls))
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        
        print(f"{cls}\t{precision:.2f}\t\t{recall:.2f}\t\t{f1_score:.2f}")

def load_data(filename):
    data = np.genfromtxt(filename, delimiter=',', skip_header=1)
    X = data[:, 1:]  # Assuming the first column is the target
    y = data[:, 0]
    return X, y

X_train, y_train = load_data('train.csv')
X_val, y_val = load_data('val.csv')

# Adjust these parameters as needed
svm = LinearSVM(learning_rate=0.001, lambda_param=0.01, n_iters=1000)
svm.fit(X_train, y_train)

y_pred_val = svm.predict(X_val)


print("\nValidation Data Classification Report:")
print_classification_report(y_val, y_pred_val)


KeyboardInterrupt: 

<h1>RDF kernel FAILLLL</h1>

In [6]:
import numpy as np
import cvxopt
import cvxopt.solvers
import pandas as pd

cvxopt.solvers.options['show_progress'] = False

class SVM_RBF:
    def __init__(self, C=1.0, gamma=10):
        self.C = C
        self.gamma = float(gamma)
        self._support_vectors = None
        self._alphas = None
        self.intercept = None
        self._support_labels = None

    def rbf_kernel(self, x, y):
        x = np.atleast_2d(x)
        y = np.atleast_2d(y)
        return np.exp(-self.gamma * np.sum((x[:, np.newaxis] - y) ** 2, axis=2))

    def fit(self, data, labels):
        data_np = data.values if isinstance(data, pd.DataFrame) else data
        labels_np = labels.values if isinstance(labels, pd.Series) else labels

        num_data, num_features = data_np.shape
        labels_np = labels_np.astype(np.double)

        K = self.rbf_kernel(data_np, data_np)

        P = cvxopt.matrix(np.outer(labels_np, labels_np) * K)
        q = cvxopt.matrix(np.ones(num_data) * -1)
        A = cvxopt.matrix(labels_np, (1, num_data), 'd')
        b = cvxopt.matrix(0.0)

        G_max = cvxopt.matrix(np.diag(np.ones(num_data) * -1))
        G_min = cvxopt.matrix(np.diag(np.ones(num_data)))
        G = cvxopt.matrix(np.vstack((G_max, G_min)))
        h_max = cvxopt.matrix(np.zeros(num_data))
        h_min = cvxopt.matrix(np.ones(num_data) * self.C)
        h = cvxopt.matrix(np.vstack((h_max, h_min)))

        solution = cvxopt.solvers.qp(P, q, G, h, A, b)
        alphas = np.ravel(solution['x'])
        sv_mask = alphas > 1e-5
        self._support_vectors = data_np[sv_mask]
        self._alphas = alphas[sv_mask]
        self._support_labels = labels_np[sv_mask]

        # Calculate the intercept with corrected indexing
        sv_indices = np.where(sv_mask)[0]  # Indices of support vectors
        self.intercept = np.mean([
            y_k - np.sum(
                self._alphas * self._support_labels * K[i, sv_indices]
            ) for i, y_k in zip(sv_indices, self._support_labels)
        ])

    def predict(self, X):
        if self._alphas is None or self._support_labels is None:
            raise ValueError("SVM model has not been trained. Call the 'fit' method first.")

        X_np = X.values if isinstance(X, pd.DataFrame) else X
        K = self.rbf_kernel(X_np, self._support_vectors)
        score = np.dot(K, self._alphas * self._support_labels) + self.intercept
        return np.sign(score)


In [7]:
def print_classification_report(y_true, y_pred):
    """
    Prints a simple classification report including precision, recall, and F1-score for each class.
    """
    classes = np.unique(y_true)
    print("Class\tPrecision\tRecall\t\tF1-Score")
    
    for cls in classes:
        tp = np.sum((y_pred == cls) & (y_true == cls))
        fp = np.sum((y_pred == cls) & (y_true != cls))
        fn = np.sum((y_pred != cls) & (y_true == cls))
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        
        print(f"{cls}\t{precision:.2f}\t\t{recall:.2f}\t\t{f1_score:.2f}")



subset_size = 10000  # Adjust based on your dataset size and system capabilities
svm_rbf = SVM_RBF(C=1.0, gamma=0.1)
svm_rbf.fit(X_smote[:subset_size], y_smote[:subset_size])
y_pred_rbf = svm_rbf.predict(X_val[:subset_size])
print("SVM with RBF kernel - Classification Report on Validation Set (Subset):")
print_classification_report(y_val[:subset_size], y_pred_rbf)



SVM with RBF kernel - Classification Report on Validation Set (Subset):
Class	Precision	Recall		F1-Score
0.0	0.86		1.00		0.92
1.0	0.00		0.00		0.00
