# Implement SVM to classify samples as CVD or not 
## Workflow: 
1. Read in data
1. Prepare SVM
     1. Set parameters
     2. Train model: Compute gradients and update weights and bias
     3. Use model to classify test samples
1. Perform kfold cross-validation to assess performance
1. Attempt to optimize performance by feature selection and changing parameters

In [2]:
#Import libraries 
import opendatasets as od 
import pandas as pd
import numpy as np

#Read dataset. Note that this requires a Kaggle account
od.download("https://www.kaggle.com/datasets/sulianova/cardiovascular-disease-dataset") 

file=('cardiovascular-disease-dataset/cardio_train.csv') 
df = pd.read_csv(file,sep=';') 
  
#Preview the data
df.head() 

#Since 'weight' is float, need to change it to int
df = df.astype(int)

Skipping, found downloaded files in ".\cardiovascular-disease-dataset" (use force=True to force download)


In [7]:
class SVM:
    def __init__(self, C=1.0, max_iter=1000, learning_rate=0.01): #Initialize parameters
        self.C = C  #Regularization parameter (penalty)
        self.max_iter = max_iter #Maximum number of iterations
        self.learning_rate = learning_rate #Step size for gradient descent
        self.w = None 
        self.b = None

    #Train the SVM model using gradient descent
    def fit(self, X, y):
        n_samples, n_features = X.shape
        # Initialize weights and bias
        self.w = np.zeros(n_features)
        self.b = 0
        
        # Gradient descent
        for _ in range(self.max_iter):
            for i in range(n_samples):
                condition = y[i] * (np.dot(X[i], self.w) + self.b) >= 1 #Decision boundary for the i-th point
                if condition: #If classified correctly, the weight remains the same
                    dw = self.w  #No regularization if condition met
                    db = 0
                else: #If point classified incorrectly
                    dw = self.w - self.C * y[i] * X[i] #Penalization steps
                    db = -self.C * y[i] 
                
                #Update weights and bias
                self.w -= self.learning_rate * dw
                self.b -= self.learning_rate * db

    def classify(self, X):
        #Linear decision rule
        return np.sign(np.dot(X, self.w) + self.b)

In [5]:
#K-fold cross-validation 
def kfold_crossvalidation(feats,targets,k=3,lr=0.01,penalty=1.0): #feats is features to train the model on, targets is the target variable (CVD), k is number of folds 
    #Find number of samples
    n_samples = feats.shape[0]

    #Determine fold size
    fold_size= n_samples//k #Use // to return only integer

    #List to store the performance scores
    performance = []

    for i in range(k):
        #Define test and train indices
        test_start = i * fold_size
        test_end = (i + 1) * fold_size if i != k - 1 else n_samples

        # Split data into train and test (features and targets)
        feats_train = np.concatenate([feats[:test_start], feats[test_end:]], axis=0)
        targets_train = np.concatenate([targets[:test_start], targets[test_end:]], axis=0)
        feats_test = feats[test_start:test_end]
        targets_test = targets[test_start:test_end]

        #Train model
        model = SVM(learning_rate=lr)
        model.fit(feats_train,targets_train)
        
        #Test model
        pred = model.classify(feats_test)

        #Calculate performance
        def performance_sys(test_true, prediction):
            # Count correct predictions
            correct_predictions = sum([1 if test_true == prediction else 0 for test_true, prediction in zip(test_true, prediction)])
    
            # Calculate accuracy
            accuracy = correct_predictions / len(test_true)

            return accuracy
            
        #Run performance calculation 
        performance_score = [] #Init performance score
        performance = performance_sys(np.array(targets_test),pred)    
        performance_score.append(performance)

    #Find average performance
    score = np.mean(performance_score)
    return score    

In [8]:
#Run the above (log regression and cross validation)
kfold_crossvalidation(df.drop(columns=['id','cardio'], inplace=False), df.loc[:,'cardio'], k=5) #Set features as all information except id and target, set CVD status as target

0.5005714285714286

##### Here we see that not updating any parameters, not utilizing any EDA, and using all features results in a 0.5006 (50.06%) accuracy score. Below, I will try to improve this. 

In [9]:
df1 = pd.get_dummies(df, columns=['cholesterol', 'gluc'])
df1 = df1*1
df1.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3
0,0,18393,2,168,62,110,80,0,0,1,0,1,0,0,1,0,0
1,1,20228,1,156,85,140,90,0,0,1,1,0,0,1,1,0,0
2,2,18857,1,165,64,130,70,0,0,0,1,0,0,1,1,0,0
3,3,17623,2,169,82,150,100,0,0,1,1,1,0,0,1,0,0
4,4,17474,1,156,56,100,60,0,0,0,0,1,0,0,1,0,0


In [10]:
kfold_crossvalidation(df1.loc[:,['age','cholesterol_1','cholesterol_2','cholesterol_3','weight']], df1.loc[:,'cardio'], k=5) #Removing variables with little predictive power did not increase performance. 

0.5005714285714286

In [11]:
kfold_crossvalidation(df1.loc[:,['age','cholesterol_1','cholesterol_2','cholesterol_3','weight']], df1.loc[:,'cardio'], k=5,lr=0.0001) #Changing learning rate did not affect result

0.5005714285714286

## Future directions:
### Adding a function to monitor the gradient descent could aid in troubleshooting and optimizing parameters. Additionally, further EDA (particularly for feature selection) or exploring scaling numerical features might improve performance. Further exploration of the learning rate, as well as testing other similar datasets, could also inform method optimization. 