In [None]:
import pandas as pd
# Location of the HR employee-attrition CSV.
# NOTE(review): absolute path (presumably a Colab upload, with a "(1)" duplicate
# suffix) — adjust it when running in another environment.
DATA_PATH = "/content/WA_Fn-UseC_-HR-Employee-Attrition (1).csv"

# Load the raw data and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Dimensions of the raw frame as (rows, columns).
df.shape

In [None]:
# Print a per-column overview (dtype and non-null count) of the raw frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Summary statistics (count, mean, std, quantiles, ...) of the raw frame.
df.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Use seaborn's 'whitegrid' style for the plots that follow.
sns.set(style='whitegrid')

In [None]:
# Bar chart of the number of rows in each Attrition class.
# NOTE(review): recent seaborn releases appear to warn when `palette` is passed
# without `hue` — confirm against the installed version.
ax = sns.countplot(x='Attrition', data=df, palette='pastel')
ax.set_title("Attrition Distribution")
plt.show()

In [None]:
# Row counts per Department, split by Attrition.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='Department', hue='Attrition', data=df, palette='Set2', ax=ax)
ax.set_title("attrition by Department")

# Tilt the department names slightly so long labels do not overlap.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(15)

plt.show()


In [None]:
# Row counts per OverTime value, split by Attrition.
ax = sns.countplot(x='OverTime', hue='Attrition', data=df, palette='Set1')
ax.set_title("Attrition by OverTime")
plt.show()

In [None]:
# Distribution of MonthlyIncome within each Attrition class.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x='Attrition', y='MonthlyIncome', data=df, palette='coolwarm', ax=ax)
ax.set_title("Monthly Income by Attrition")
plt.show()

In [None]:
# Independent copy of the raw frame with Attrition recoded as Yes -> 1 / No -> 0,
# so the target can take part in the numeric correlation analysis.
attrition_codes = {'Yes': 1, 'No': 0}
df_corr = df.assign(Attrition=df['Attrition'].map(attrition_codes))


In [None]:
# Restrict the correlation analysis to numeric columns.
numeric_all = df_corr.select_dtypes(include='number')

# Drop columns holding a single distinct value: they have zero variance, so their
# Pearson correlation is undefined (NaN) and they would only add empty rows and
# columns to the correlation matrix and heatmap.
numeric_df = numeric_all.loc[:, numeric_all.nunique() > 1]

In [None]:
# Pairwise correlations between all columns of numeric_df.
corr_matrix = numeric_df.corr()

# Correlation of each column with Attrition, largest value first.
attrition_corr = corr_matrix.loc[:, 'Attrition'].sort_values(ascending=False)
print(attrition_corr)

In [None]:
# Distinct values present in StandardHours (a single value would mean the column
# carries no information). The statement was duplicated; one call is enough.
df_corr['StandardHours'].unique()

In [None]:
# Annotated heatmap of the full correlation matrix (values shown to 2 decimals).
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Modelling frame: a copy of the raw data without the columns that are not used
# as features. The raw `df` is left untouched.
df_clean = (
    df.copy()
      .drop(columns=['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber'])
)

In [None]:
# Encode the target as Yes -> 1 / No -> 0 (surrounding whitespace is stripped first).
# The labels are read from the raw `df` instead of from df_clean itself so that the
# cell is idempotent: re-running it no longer calls `.str` on an already-numeric
# column. df_clean shares its index with df, so the assignment aligns row by row.
df_clean['Attrition'] = df['Attrition'].str.strip().map({'Yes': 1, 'No': 0})

# `.map` silently turns any unexpected label into NaN; fail loudly instead.
assert df_clean['Attrition'].notna().all(), "Unexpected Attrition label(s) could not be encoded"

In [None]:
# One-hot encode the categorical columns of df_clean; drop_first=True omits the
# first level of each category so the dummy columns are not redundant.
df_encoded=pd.get_dummies(df_clean,drop_first=True)

In [None]:
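# Preprocessing summary:
# - Cleaned the dataset.
# - Converted all categorical columns to numeric (one-hot encoding).
# - Removed unnecessary columns (EmployeeCount, StandardHours, Over18, EmployeeNumber).
# - The data is now ready for training ML models.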

In [None]:
# Sanity check of the encoded frame: print its (rows, columns) and preview the first rows.
print(df_encoded.shape)
df_encoded.head()

In [None]:
# Split the data into features (X) and target (y).

In [None]:
# Target vector: the encoded Attrition column.
target_col = 'Attrition'
y = df_encoded[target_col]

# Feature matrix: every other column of the encoded frame.
X = df_encoded.drop(columns=[target_col])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation, with a fixed seed for reproducibility.
# stratify=y keeps the Attrition class proportions equal in the train and test sets;
# without it a random split can under- or over-represent the (presumably minority)
# positive class in the small test set and distort the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Baseline classifier: logistic regression on standardized features.
# The features mix very different magnitudes (e.g. MonthlyIncome next to 0/1 dummy
# columns); fitting on the raw values can keep the solver from converging and lets
# the regularization penalty weigh features unevenly. The scaler lives inside the
# pipeline, so it is fitted on the training data only and re-applied automatically
# by model.predict(...). The fitted estimator itself is available as model[-1].
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on the held-out rows with the fitted baseline model.
y_pred = model.predict(X_test)

# Compute the evaluation metrics first, then report them.
lr_accuracy = accuracy_score(y_test, y_pred)
lr_confusion = confusion_matrix(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)

print("Accuracy:", lr_accuracy)
print("\nConfusion Matrix:\n", lr_confusion)
print("\nClassification Report:\n", lr_report)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a random forest with default hyper-parameters and a fixed seed.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report:\n", rf_report)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Gradient-boosted trees, evaluated with log-loss during training.
# `use_label_encoder` is intentionally not passed: the argument is deprecated and
# ignored (with a warning) or rejected by current XGBoost releases, and the target
# is already encoded as 0/1. The seed is fixed for consistency with the other models.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Pair each feature name with the importance score learned by the random forest,
# ordered from most to least important.
importances = rf_model.feature_importances_
features = X.columns
feat_imp = pd.Series(data=importances, index=features).sort_values(ascending=False)

# Bar chart of the ten highest-scoring features.
fig, ax = plt.subplots(figsize=(10, 6))
feat_imp.head(10).plot.bar(ax=ax)
ax.set_title('Top 10 Feature Importances (Random Forest)')
ax.set_ylabel('Importance Score')
ax.set_xlabel('Features')
plt.show()