### 📌 Step 1: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

### 📌 Step 2: Load the CSV

In [None]:
# Read the raw employee records; the path is relative to the working directory.
csv_path = "employee_performance_data.csv"
df = pd.read_csv(csv_path)

print("✅ Data Loaded Successfully")
print("Dataset shape:", df.shape)

# Last expression of the cell, so a notebook front end displays the first rows.
df.head()

### 📌 Step 3: Preprocessing

In [None]:
# Count missing values per column (measured before de-duplication).
missing_per_column = df.isna().sum()
print("\nMissing Values:\n", missing_per_column)

# Drop duplicate rows, mutating df itself; the index is not reset afterwards.
df.drop_duplicates(inplace=True)

# Show each column's dtype as it stands after de-duplication.
column_dtypes = df.dtypes
print("\nData Types:\n", column_dtypes)

In [None]:
# One-hot encode the two categorical columns; drop_first=True omits the first
# level of each so the resulting dummy columns are not perfectly collinear.
# NOTE(review): no later cell visible here reads df_encoded — confirm whether
# it is still needed.
categorical_columns = ["Department", "Team"]
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [None]:
# Rescale the numeric features with MinMaxScaler so they share a common range
# before being fed to the distance-based clustering below.
# NOTE(review): missing values are printed earlier but never imputed or
# dropped — confirm these columns are NaN-free before clustering.
numeric = [
    "Task_Completion_Efficiency",
    "Attendance_Regularity",
    "Feedback_Rating",
    "Length_of_Service_Years",
]

scaler = MinMaxScaler()
scaler.fit(df[numeric])
df_scaled = scaler.transform(df[numeric])

### 📌 Step 4: Elbow Method & KMeans Clustering

In [None]:
# Elbow method: fit KMeans once per candidate k and record the model's inertia.
# The k at which the curve stops dropping sharply is the usual pick.
k_values = range(2, 11)

inertia = []
for k in k_values:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # "auto" in 1.4 (with a FutureWarning on 1.2/1.3), so leaving it implicit
    # makes the curve depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(df_scaled)
    inertia.append(kmeans.inertia_)

# Plot against the same k_values used for fitting so the axes cannot drift apart.
plt.plot(k_values, inertia, marker='o')
plt.title("Elbow Method")
plt.xlabel("k (Number of Clusters)")
plt.ylabel("Inertia")
plt.grid(True)
plt.show()

In [None]:
# Apply KMeans with k=4
# NOTE(review): presumably k=4 was read off the elbow plot — confirm.
# n_init is pinned explicitly because scikit-learn changed its default from 10
# to "auto" in 1.4; an implicit value makes cluster labels version-dependent.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# fit_predict returns one label per row of df_scaled, which was built from the
# rows of df, so the labels line up positionally with df.
df['Performance_Cluster'] = kmeans.fit_predict(df_scaled)

# Per-cluster means of the unscaled numeric features, to characterise each
# cluster; as the cell's last expression it is displayed by the notebook.
df.groupby("Performance_Cluster")[numeric].mean()

### 📌 Step 5: EDA (Visual Insights)

In [None]:
# One histogram (with a KDE overlay) per numeric feature, each shown separately.
for feature in numeric:
    values = df[feature]
    sns.histplot(values, kde=True)
    plt.title(f"Distribution of {feature}")
    plt.show()

In [None]:
# Pairwise correlation of the numeric features, drawn as an annotated heatmap.
correlation_matrix = df[numeric].corr()
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Feature Correlation")
plt.show()

In [None]:
# Bar chart of how many rows were assigned to each cluster label.
sns.countplot(data=df, x='Performance_Cluster')
plt.title("Employees per Cluster")
plt.show()

### 📌 Step 6: Feature Engineering

In [None]:
# Composite score: the mean of three components.
# NOTE(review): the x20 factor presumably lifts a 1-5 rating onto the same
# 0-100 scale as the other two columns — confirm the columns' actual ranges.
feedback_scaled = df["Feedback_Rating"] * 20
component_total = (
    df["Task_Completion_Efficiency"]
    + df["Attendance_Regularity"]
    + feedback_scaled
)
df["Avg_Performance_Score"] = component_total / 3

In [None]:
# Label as high performers (1) the rows whose composite score is at or above
# the 80th percentile; everyone else gets 0.
avg_score = df["Avg_Performance_Score"]
threshold = avg_score.quantile(0.80)
df["High_Performer"] = avg_score.ge(threshold).astype(int)

### 📌 Step 7: Build ML Model – Classification

In [None]:
# Features and target for the high-performer classifier.
# NOTE(review): High_Performer is thresholded from Avg_Performance_Score, which
# is itself computed from three of these four features, so the model is largely
# re-learning that formula — treat the reported accuracy as optimistic.
X = df[["Task_Completion_Efficiency", "Attendance_Regularity", "Feedback_Rating", "Length_of_Service_Years"]]
y = df["High_Performer"]

# The label is imbalanced by construction (an 80th-percentile cut), so stratify
# the split to keep the class ratio the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fixed random_state keeps the forest reproducible between runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions on the held-out 20%, consumed by the evaluation cell.
y_pred = model.predict(X_test)

In [None]:
# Score the held-out predictions: overall accuracy, the confusion matrix, and
# the per-class precision/recall/F1 report.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

### 📌 Step 8: Final Export for Power BI

In [None]:
# Write the enriched frame (including every column added above) to CSV;
# index=False keeps the DataFrame index out of the file.
output_path = "employee_data_with_clusters_and_labels.csv"
df.to_csv(output_path, index=False)
print("✅ Final dataset exported for Power BI.")