In [1]:
import warnings
# Suppress every warning for the rest of the session. Note that this is a
# blanket filter: warnings emitted by pandas or scikit-learn in the cells
# below are hidden as well.
warnings.filterwarnings('ignore')

# NOTE(review): `np` is not referenced in the cells visible here.
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.decomposition import FastICA  # Independent Component Analysis, used below for dimensionality reduction
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Read the raw CSV, discard exact duplicate rows, then exclude the records
# whose gender is recorded as 'Other' (the original note quotes 0.00195% for
# this category; the figure is not re-checked here).
df = (
    pd.read_csv("diabetes_prediction_dataset.csv")
    .drop_duplicates()
    .loc[lambda frame: frame['gender'] != 'Other']
)


In [3]:
# Recategorize smoking history
def recategorize_smoking(smoking_status):
    """Collapse a raw smoking-history label into one of three coarser groups.

    The mapping is idempotent: a label that is already one of the grouped
    values (``'non-smoker'``, ``'current'``, ``'past_smoker'``) is returned
    unchanged. This matters because the result is written back into the same
    ``smoking_history`` column, so re-running the cell would otherwise turn
    every already-grouped ``'non-smoker'`` / ``'past_smoker'`` value into
    ``None``.

    Args:
        smoking_status: Label taken from the ``smoking_history`` column.

    Returns:
        ``'non-smoker'`` for ``'never'`` or ``'No Info'``;
        ``'current'`` for ``'current'``;
        ``'past_smoker'`` for ``'ever'``, ``'former'`` or ``'not current'``;
        the input itself when it is already one of those three group names;
        ``None`` for any other value (including missing values).
    """
    # Each group lists its own name first so grouped labels are fixed points.
    label_groups = (
        ('non-smoker', ('non-smoker', 'never', 'No Info')),
        ('current', ('current',)),
        ('past_smoker', ('past_smoker', 'ever', 'former', 'not current')),
    )
    for group_label, accepted_labels in label_groups:
        if smoking_status in accepted_labels:
            return group_label
    # Unrecognised labels keep the historical behaviour of yielding None.
    return None

# Replace the raw labels with their grouped form, producing a fresh frame
# instead of writing into the filtered one.
df = df.assign(
    smoking_history=df['smoking_history'].apply(recategorize_smoking)
)


In [4]:
# One-hot encoding
def perform_one_hot_encoding(df, column_name):
    """Swap one categorical column for its indicator (dummy) columns.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Column to encode; it is also used as the prefix of the
            generated indicator columns.

    Returns:
        A new DataFrame made of the remaining columns of ``df`` followed by
        the indicator columns. ``df`` itself is not modified.
    """
    indicator_columns = pd.get_dummies(df[column_name], prefix=column_name)
    remaining_columns = df.drop(columns=column_name)
    return pd.concat((remaining_columns, indicator_columns), axis=1)

# Encode both categorical columns, starting from a copy so `df` is left as-is.
data = perform_one_hot_encoding(
    perform_one_hot_encoding(df.copy(), 'gender'),
    'smoking_history',
)

In [5]:
# Separate the predictor columns from the 'diabetes' label column.
X, y = data.drop(columns='diabetes'), data['diabetes']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [6]:
# Apply ICA for dimensionality reduction.
# n_components=5 is a tunable choice, not a derived value.
# FastICA starts from a randomly initialised unmixing matrix, so the seed is
# pinned (same value as the train/test split) to make the extracted components
# -- and therefore the downstream SVM metrics -- repeatable across
# Restart & Run All.
ica = FastICA(n_components=5, random_state=42)


In [7]:
# Fit the ICA projection on the training features only, then reuse that
# fitted projection for the held-out features.
X_train_ica = ica.fit_transform(X=X_train)
X_test_ica = ica.transform(X=X_test)

# Train an RBF-kernel SVM classifier on the ICA-reduced training features.
svm_classifier = SVC(
    kernel='rbf',
    C=1,
    gamma=0.1,
    decision_function_shape='ovo',
)
svm_classifier.fit(X_train_ica, y_train)

In [8]:
# Score the held-out ICA features with the fitted SVM.
y_pred_svm_ica = svm_classifier.predict(X_test_ica)

# Report overall accuracy, then the per-class precision / recall / F1 table.
print(
    "SVM Model Accuracy with ICA: ",
    accuracy_score(y_true=y_test, y_pred=y_pred_svm_ica),
)
print(classification_report(y_true=y_test, y_pred=y_pred_svm_ica))

SVM Model Accuracy with ICA:  0.9599500676167689
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     17525
           1       1.00      0.55      0.71      1701

    accuracy                           0.96     19226
   macro avg       0.98      0.77      0.84     19226
weighted avg       0.96      0.96      0.95     19226

