In [1]:
! pip install pandas scikit-learn




In [3]:
import pandas as pd
import os
import joblib
import uuid
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

data = pd.read_csv("../data/raw/train_sample_10p.csv")  

target_column = 'Depression'  
X = data.drop(columns=[target_column])  
y = data[target_column]  

# Separate numeric and non-numeric columns
numeric_cols = X.select_dtypes(include=['number']).columns
non_numeric_cols = X.select_dtypes(exclude=['number']).columns

# Fill missing values for numeric columns with the mean
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].mean())

# Fill missing values for non-numeric columns with the mode
for col in non_numeric_cols:
    X[col] = X[col].fillna(X[col].mode()[0])

# One-Hot Encoding
X = pd.get_dummies(X, drop_first=True)  

# Scale Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split Data into Train and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Train the SVM Model
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_model.fit(X_train, y_train)

#Evaluating the Model
y_pred = svm_model.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

run_id = str(uuid.uuid4()) 
print(f"Model Run ID: {run_id}")

# Save the model
model_dir = "../data/model"
os.makedirs(model_dir, exist_ok=True) 
model_path = os.path.join(model_dir, f"svm_model_{run_id}.joblib")
joblib.dump(svm_model, model_path)

print(f"Model saved at: {model_path}")


Confusion Matrix:
[[2255   48]
 [ 147  364]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      2303
           1       0.88      0.71      0.79       511

    accuracy                           0.93      2814
   macro avg       0.91      0.85      0.87      2814
weighted avg       0.93      0.93      0.93      2814


Accuracy Score:
0.9307036247334755
Model Run ID: 08defbff-21ba-49ee-9f77-f36bbdf0cb6e
Model saved at: ../data/model/svm_model_08defbff-21ba-49ee-9f77-f36bbdf0cb6e.joblib
