In [55]:
!pip install xgboost



In [57]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [59]:
# Load the datasets
train_df = pd.read_csv(r"C:\Users\disha\Downloads\employee_train.csv")
test_df = pd.read_csv(r"C:\Users\disha\Downloads\employee_test.csv")

In [61]:
# Keep a copy of the original test dataset for submission
test_original = test_df.copy()

In [63]:
train_df.columns

Index(['S.No', 'Timestamp', 'Age', 'Gender', 'Country', 'state',
       'self_employed', 'family_history', 'treatment', 'work_interfere',
       'no_employees', 'remote_work', 'tech_company', 'benefits',
       'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments'],
      dtype='object')

In [65]:
# Drop unnecessary columns
drop_cols = ["S.No", "Timestamp", "comments","Country","state"]
train_df.drop(columns=drop_cols, inplace=True, errors="ignore")
test_df.drop(columns=drop_cols, inplace=True, errors="ignore")

In [67]:
def clean_gender(value):
    value = str(value).strip().lower()  # Convert to lowercase and remove spaces
    if value in ["male", "m","M"]:
        return "Male"
    elif value in ["female", "f","F"]:
        return "Female"
    else:
        return "Other"

In [69]:
# Apply cleaning to train and test data
train_df["Gender"] = train_df["Gender"].apply(clean_gender)
test_df["Gender"] = test_df["Gender"].apply(clean_gender)

In [71]:
# Handle missing values by filling with 'Unknown'
train_df.fillna("Unknown", inplace=True)
test_df.fillna("Unknown", inplace=True)

In [73]:
# Convert target variable to binary
y = train_df["treatment"].map({"Yes": 1, "No": 0})
X = train_df.drop(columns=["treatment"])

In [75]:
# One-Hot Encode categorical features
X = pd.get_dummies(X, drop_first=True)
test_df = pd.get_dummies(test_df, drop_first=True)

In [77]:
X

Unnamed: 0,Age,Gender_Male,Gender_Other,self_employed_Unknown,self_employed_Yes,family_history_Yes,work_interfere_Often,work_interfere_Rarely,work_interfere_Sometimes,work_interfere_Unknown,...,coworkers_Yes,supervisor_Some of them,supervisor_Yes,mental_health_interview_No,mental_health_interview_Yes,phys_health_interview_No,phys_health_interview_Yes,mental_vs_physical_No,mental_vs_physical_Yes,obs_consequence_Yes
0,37,False,False,True,False,False,True,False,False,False,...,False,False,True,True,False,False,False,False,True,False
1,44,True,False,True,False,False,False,True,False,False,...,False,False,False,True,False,True,False,False,False,False
2,32,True,False,True,False,False,False,True,False,False,...,True,False,True,False,True,False,True,True,False,False
3,31,True,False,True,False,True,True,False,False,False,...,False,False,False,False,False,False,False,True,False,True
4,31,True,False,True,False,False,False,False,False,False,...,False,False,True,False,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043,26,True,False,False,False,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
1044,29,False,False,False,False,False,True,False,False,False,...,False,False,False,True,False,False,True,True,False,True
1045,26,False,False,False,False,True,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
1046,33,False,True,False,False,True,False,False,True,False,...,False,False,True,True,False,True,False,False,False,False


In [79]:
# Align test data columns with train data
test_df = test_df.reindex(columns=X.columns, fill_value=0)

In [81]:
# Split data for model validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [83]:
# Scale features (for models like SVM, Logistic Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_df_scaled = scaler.transform(test_df)

In [85]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "SVM": SVC(probability=True),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100)
}

In [87]:
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    if name in ["SVM", "Logistic Regression"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_val_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
    
    accuracy = accuracy_score(y_val, y_pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_val, y_pred))

Training Logistic Regression...
Logistic Regression Accuracy: 0.7762
              precision    recall  f1-score   support

           0       0.83      0.72      0.77       109
           1       0.73      0.84      0.78       101

    accuracy                           0.78       210
   macro avg       0.78      0.78      0.78       210
weighted avg       0.78      0.78      0.78       210

Training Random Forest...
Random Forest Accuracy: 0.7810
              precision    recall  f1-score   support

           0       0.84      0.72      0.77       109
           1       0.74      0.85      0.79       101

    accuracy                           0.78       210
   macro avg       0.79      0.78      0.78       210
weighted avg       0.79      0.78      0.78       210

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.7524
              precision    recall  f1-score   support

           0       0.77      0.74      0.76       109
           1       0.73      0.76      0.75       101

    accuracy                           0.75       210
   macro avg       0.75      0.75      0.75       210
weighted avg       0.75      0.75      0.75       210

Training SVM...
SVM Accuracy: 0.8000
              precision    recall  f1-score   support

           0       0.88      0.72      0.79       109
           1       0.74      0.89      0.81       101

    accuracy                           0.80       210
   macro avg       0.81      0.80      0.80       210
weighted avg       0.81      0.80      0.80       210

Training Gradient Boosting...
Gradient Boosting Accuracy: 0.7905
              precision    recall  f1-score   support

           0       0.84      0.74      0.79       109
           1       0.75      0.84      0.79       101

    accuracy                           0.79       210
 

In [91]:
# Choose the best model (highest accuracy)
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name} with Accuracy: {results[best_model_name]:.4f}")


Best Model: SVM with Accuracy: 0.8000


In [93]:
# Make final predictions with the best model
if best_model_name in ["SVM", "Logistic Regression"]:
    test_predictions = best_model.predict(test_df_scaled)
else:
    test_predictions = best_model.predict(test_df)

In [95]:
# Convert predictions back to labels
test_predictions = ["Yes" if pred == 1 else "No" for pred in test_predictions]

In [97]:
# Add predictions to the original test dataset
test_original["treatment"] = test_predictions

In [99]:
# Generate submission file
test_original.to_csv("final_output.csv", index=False)

In [101]:
print("\nSubmission file generated: final_output.csv")


Submission file generated: final_output.csv
