# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the dataset
df = pd.read_csv("Healthcare_Support_Occupations.csv")

# Check the first few rows of the dataset
print(df.head())

# Step 1: Build the binary target from 'Automatability' (1 if >= 0.5, else 0).
# Rows with a missing score are dropped first: `NaN >= 0.5` evaluates to False,
# so without this they would silently be labelled 0 instead of being excluded.
df = df.dropna(subset=['Automatability']).copy()
df['Automatibility_Label'] = (df['Automatability'] >= 0.5).astype(int)

# Drop the original 'Automatability' column so the raw score cannot leak into the features
df.drop(columns=['Automatability'], inplace=True)

# Step 2: Shift 'Task Type' from {1, 2} to {0, 1}
# (per the original notes: 1 = Core -> 0, 2 = Supplemental -> 1)
df['Task Type'] = df['Task Type'] - 1

# Step 3: Encode the categorical 'Scale Name' column as integer codes
label_encoder = LabelEncoder()
df['Scale Name'] = label_encoder.fit_transform(df['Scale Name'])

# Step 4: Drop identifier / free-text columns that are not used as features.
# errors='ignore' keeps this working when some of them are absent from the CSV.
columns_to_drop = ["O*NET-SOC Code", "Task ID", "Task_x", "Title", "Category"]
df = df.drop(columns=columns_to_drop, errors='ignore')

# Step 5: Handle missing values (replace with column mean).
# numeric_only=True restricts the means to numeric columns; on pandas >= 2.0 a
# plain df.mean() raises TypeError if any non-numeric column is still present.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split, so test rows influence the imputed values.
df.fillna(df.mean(numeric_only=True), inplace=True)

# Step 6: Separate the target column (y) from the feature matrix (X)
y = df["Automatibility_Label"]
X = df.drop(columns=["Automatibility_Label"])

# Step 7: Hold out 20% of the rows for testing; stratifying on y keeps the
# class proportions the same in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 8: Standardize the features; the scaler is fitted on the training
# partition only and then applied to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 9: Initialize and train the Logistic Regression model on the scaled training data
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Step 10: Perform cross-validation (5-fold cross-validation).
# The scaler is placed inside a pipeline and fed the *unscaled* training data so
# that it is re-fitted on the training part of every fold. Cross-validating on
# X_train_scaled would let each validation fold contribute to the scaling
# statistics and make the reported accuracy slightly optimistic.
cv_pipeline = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Step 11: Predict on the test set
y_pred = model.predict(X_test_scaled)
# Second column of predict_proba = probability of class 1; used for the ROC curve
y_prob = model.predict_proba(X_test_scaled)[:, 1]

# Step 12: Report per-class metrics and the confusion matrix for the test set
print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

print("\nConfusion Matrix:")
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

# Step 13: Compute the ROC curve from the class-1 probabilities and the area under it
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Step 14: Draw the ROC curve together with the diagonal reference line
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Receiver Operating Characteristic (ROC) Curve")
ax.legend(loc="lower right")
plt.show()

# Print AUC score
print(f"AUC Score: {roc_auc:.4f}")
