# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
import joblib  

# Load the pre-trained logistic regression model used for the predictions below.
# NOTE(review): only the model is loaded here, no scaler. If the model was
# fitted on standardized features, the matching fitted scaler must also be
# loaded and applied to X before predicting -- confirm against the training code.
model = joblib.load('logreg_model.pkl')

# Load the dataset to evaluate on (Community and Social Service Occupations)
df = pd.read_csv("Community_and_Social_Service_Occupations.csv")

# Display the first few rows of the dataset to understand its structure
print(df.head())

# Binarize 'Automatability': values >= 0.5 become 1, everything else becomes 0
df['Automatibility_Label'] = (df['Automatability'] >= 0.5).astype(int)

# The continuous score is replaced by the label, so drop it from the features
df.drop(columns=['Automatability'], inplace=True)

# Shift 'Task Type' down by one: 1 becomes 0 (Core), 2 becomes 1 (Supplemental)
df['Task Type'] = df['Task Type'] - 1

# Encode the 'Scale Name' categorical variable as integer labels.
# NOTE(review): the encoder is fitted on this dataset, so the integer assigned
# to each category may differ from the mapping used when the model was
# trained -- confirm, or reuse the encoder from training.
label_encoder = LabelEncoder()
df['Scale Name'] = label_encoder.fit_transform(df['Scale Name'])

# Remove identifier/text columns; errors='ignore' skips any that are absent
columns_to_drop = ["O*NET-SOC Code", "Task ID", "Task_x", "Title", "Category"]
df = df.drop(columns=columns_to_drop, errors='ignore')

# Fill missing values with the mean of the respective column.
# numeric_only=True restricts the means to numeric columns: without it,
# pandas 2.x raises TypeError as soon as one non-numeric column is left.
df = df.fillna(df.mean(numeric_only=True))

# Define feature matrix (X) and target variable (y)
X = df.drop(columns=["Automatibility_Label"])
y = df["Automatibility_Label"]

# Hold out a stratified 20% of the rows (fixed seed 42) as the evaluation set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Score the held-out rows with the loaded model: hard class labels, plus the
# probability column at index 1 that feeds the ROC curve
y_pred = model.predict(X_test)
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

# Text summary of the predictions: classification report first, then the
# confusion matrix
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

print("\nConfusion Matrix:")
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

# ROC points from the predicted probabilities, and the area under that curve
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve together with the diagonal of a random classifier
curve_label = f"ROC curve (AUC = {roc_auc:.2f})"
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(fpr, tpr, color="darkorange", lw=2, label=curve_label)
ax.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Receiver Operating Characteristic (ROC) Curve")
ax.legend(loc="lower right")
plt.show()

# Final AUC readout with four decimals
print(f"AUC Score: {roc_auc:.4f}")
