# In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


# Download the VADER lexicon resource before the analyzer is constructed.
nltk.download('vader_lexicon')
# Module-level analyzer; get_sentiment_score() below reads it as a global.
sia = SentimentIntensityAnalyzer()

# Load HR data (employee feedback + HR metrics). The path is relative to the
# current working directory. Later statements in this file read the columns
# Feedback, Department, Job_Role, Salary, Work_Hours_Per_Week and Attrition.
data = pd.read_csv("employee_feedback.csv")


def get_sentiment_score(text):
    """Return the sentiment score for one feedback entry.

    Args:
        text: A single feedback value. Values for which ``pd.isna`` is true
            (``None``, NaN, ...) are treated as missing.

    Returns:
        The ``"compound"`` entry of ``sia.polarity_scores(text)`` for
        non-missing input, or ``0`` (neutral) when the feedback is missing.
    """
    if not pd.isna(text):
        return sia.polarity_scores(text)["compound"]
    # No feedback was given: fall back to a neutral score.
    return 0

# Score every feedback entry element-wise. The scoring function is also
# called for missing values, which it maps to a neutral 0.
data["Sentiment_Score"] = data["Feedback"].map(get_sentiment_score)

# Encode each categorical column with its OWN LabelEncoder. Refitting a single
# shared encoder on the second column would discard the Department mapping,
# making it impossible to decode Department codes or encode new rows the same
# way later on.
department_encoder = LabelEncoder()
data["Department"] = department_encoder.fit_transform(data["Department"])

job_role_encoder = LabelEncoder()
data["Job_Role"] = job_role_encoder.fit_transform(data["Job_Role"])

# Backward compatibility: `encoder` used to end up fitted on Job_Role, so keep
# that name pointing at the Job_Role encoder.
encoder = job_role_encoder

# Standardize the two numeric HR metrics in place.
# NOTE(review): the scaler is fit on every row of `data`, i.e. before the
# train/test split performed further down, so held-out rows contribute to the
# scaling statistics. Consider fitting on the training rows only.
scaler = StandardScaler().fit(data[["Salary", "Work_Hours_Per_Week"]])
data[["Salary", "Work_Hours_Per_Week"]] = scaler.transform(
    data[["Salary", "Work_Hours_Per_Week"]]
)

# Target: convert Attrition to binary (1 = Yes, 0 = No).
# NOTE(review): any label other than exactly "Yes"/"No" is mapped to NaN by
# Series.map -- confirm the column holds only these two values.
y = data["Attrition"].map({"Yes": 1, "No": 0})

# Feature matrix: encoded categoricals, scaled numerics and the sentiment score.
X = data[
    [
        "Department",
        "Job_Role",
        "Salary",
        "Work_Hours_Per_Week",
        "Sentiment_Score",
    ]
]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train a random forest of 100 trees on the training split; the fixed
# random_state makes the fitted model reproducible between runs.
model = RandomForestClassifier(n_estimators=100, random_state=42)
# Fixed: the target argument previously contained a stray quote (`y_t"rain`),
# which was a syntax error that prevented the file from running at all.
model.fit(X_train, y_train)

# Predict attrition for the held-out rows.
y_pred = model.predict(X_test)

# Report overall accuracy followed by the per-class metrics.
print(
    "Model Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred),
)
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)
