In [1]:
! pip install pandas scikit-learn




In [4]:
import pandas as pd
import os
import joblib
import uuid
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Load Sampled Data
data = pd.read_csv("../data/raw/train_sample_10p.csv")  # Ensure the sample exists in this path

# Name of the label column; every other column of the CSV is used as a feature.
target_column = 'Depression'
X = data.drop(columns=[target_column])  # Features: All columns except the target
y = data[target_column]  # Target column

# Split Data
# The split is decided first, by row position, so that every statistic learned
# below (fill values, scaling parameters) comes from training rows only. Fitting
# them on the full frame would leak information from the test rows into the
# preprocessing and make the reported metrics optimistic.
train_idx, test_idx = train_test_split(
    list(range(len(X))), test_size=0.2, random_state=42, stratify=y
)

# Step 2: Handle Missing Values
# A column with no observed value among the training rows has no mean/mode to
# impute from and would leave NaNs in the model input, so it is dropped.
has_train_values = X.iloc[train_idx].notna().any()
if not has_train_values.all():
    print(f"Dropping columns with no training values: {list(X.columns[~has_train_values.to_numpy()])}")
    X = X.loc[:, has_train_values.to_numpy()]

numeric_cols = X.select_dtypes(include=['number']).columns
non_numeric_cols = X.select_dtypes(exclude=['number']).columns

# Fill values are computed on the training rows, then applied to every row:
# mean for numeric columns, most frequent value for the others.
X_fit = X.iloc[train_idx]
fill_values = X_fit[numeric_cols].mean().to_dict()
for col in non_numeric_cols:
    fill_values[col] = X_fit[col].mode().iloc[0]
X = X.fillna(value=fill_values)

# Encode Non-Numeric Columns
# Encoding the whole frame at once keeps the dummy columns identical for train
# and test rows; it uses only the category vocabulary, not the target.
# NOTE(review): every non-numeric column is one-hot encoded; if the CSV carries
# identifier-like or free-text columns they expand into many sparse features —
# confirm the column list.
X = pd.get_dummies(X, drop_first=True)

# Scale Features
# The scaler learns its mean/variance from the training rows only and is then
# applied to all rows.
scaler = StandardScaler()
scaler.fit(X.iloc[train_idx])
X_scaled = scaler.transform(X)

# Materialise the train/test partitions chosen above.
X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Train SVM Model
# RBF-kernel support vector classifier; `fit` returns the estimator itself.
svm_model = SVC(C=1.0, kernel='rbf', gamma='scale', random_state=42).fit(X_train, y_train)

# Evaluate the Model
# Predict on the held-out rows, then print each metric under its own heading.
y_pred = svm_model.predict(X_test)
evaluation = (
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:", classification_report(y_test, y_pred)),
    ("\nAccuracy Score:", accuracy_score(y_test, y_pred)),
)
for heading, result in evaluation:
    print(heading)
    print(result)

# Save the Model
run_id = str(uuid.uuid4())  # Generate a unique ID for this run
run_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # Record the current date and time
model_name = "SVM"
model_dir = "../data/model"
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, f"svm_model_{run_id}.joblib")
joblib.dump(svm_model, model_path)
print(f"Model saved at: {model_path}")

# The SVM was trained on scaled features, so it cannot score new data without
# the fitted scaler. Persist it next to the model under the same run ID; the
# model file itself and the registry schema are unchanged.
scaler_path = os.path.join(model_dir, f"svm_scaler_{run_id}.joblib")
joblib.dump(scaler, scaler_path)
print(f"Scaler saved at: {scaler_path}")

# Step 3: Log Model Metadata in model_registry.csv
registry_file = "../data/model/model_registry.csv"
new_entry = pd.DataFrame(
    [{"run_date": run_date, "model_name": model_name, "model_path": model_path}]
)

# Start from the existing registry when there is one. A missing or zero-byte
# file is treated as "no registry yet" (pd.read_csv raises on an empty file).
frames = [new_entry]
if os.path.exists(registry_file) and os.path.getsize(registry_file) > 0:
    existing_registry = pd.read_csv(registry_file)
    if not existing_registry.empty:
        frames.insert(0, existing_registry)

# Append the new model metadata
registry_df = pd.concat(frames, ignore_index=True)

# Save the updated registry
registry_df.to_csv(registry_file, index=False)
print(f"Model metadata logged in {registry_file}")


Confusion Matrix:
[[2255   48]
 [ 147  364]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      2303
           1       0.88      0.71      0.79       511

    accuracy                           0.93      2814
   macro avg       0.91      0.85      0.87      2814
weighted avg       0.93      0.93      0.93      2814


Accuracy Score:
0.9307036247334755
Model saved at: ../data/model/svm_model_ef898153-f837-4e9e-a75f-695a9edacace.joblib
Model metadata logged in ../data/model/model_registry.csv
