In [None]:
%pip install scikit-learn
%pip install pandas

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Setting up directory/data

# Read the HR attrition CSV into a DataFrame
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

# Encode the target 'Attrition': 'Yes' becomes 1, any other value becomes 0
df['Attrition'] = (df['Attrition'] == 'Yes').astype('int64')

# Remove columns that are useless for prediction
# (e.g., constant values or unique identifiers)
df = df.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'Over18'],
)

# Step 2: Setting up the features/changing the data for the model to correctly interpret

# Split the frame into the feature matrix (X) and the target vector (y)
X = df.drop(columns='Attrition')
y = df['Attrition']

# One-hot encode the categorical text columns so every feature is numeric;
# drop_first=True omits the first level of each encoded column
X_encoded = pd.get_dummies(X, drop_first=True)

# Remember the column names produced by the encoding for later reporting
feature_names = list(X_encoded.columns)

# Hold out 25% of the rows for testing. stratify=y keeps the class
# proportions the same in both splits; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

print(f"Training model on {len(X_train)} samples...")

# Step 3: Training the model

# Build a 100-tree RandomForestClassifier (a great "off-the-shelf" model) and
# fit it on the training split in one go; fit() hands back the fitted
# estimator. random_state=42 makes repeated runs give the same results.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Step 4: making predictions to evaluate the model

# Make predictions on the test set
y_pred = model.predict(X_test)

# Check the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Accuracy on its own can flatter a model when one class dominates: a model
# that always predicts the most common class scores exactly that class's
# share of the test set. Print that share so the accuracy above has a
# reference point to be compared against.
baseline_accuracy = y_test.value_counts(normalize=True).max()
print(f"Majority-Class Baseline: {baseline_accuracy * 100:.2f}%")

# Recall for the positive class (Attrition == 1): of the test rows whose true
# label is 1, the fraction the model also predicted as 1.
y_true = y_test.to_numpy()
n_positive = int((y_true == 1).sum())
if n_positive:
    n_caught = int(((y_pred == 1) & (y_true == 1)).sum())
    attrition_recall = n_caught / n_positive
    print(f"Attrition Recall: {attrition_recall * 100:.2f}%")
else:
    # Guard against dividing by zero when the test split has no positives
    print("Attrition Recall: n/a (no positive samples in the test set)")
print("-" * 30)

# Step 5: Find out the most important factors.

print("Top 10 Most Important Factors for Attrition:")

# Per-feature importance scores exposed by the fitted model
importances = model.feature_importances_

# Pair each encoded feature name with its importance score
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# Rank from most to least important, then keep the first ten rows
ranked = importance_df.sort_values('Importance', ascending=False)
top_10_features = ranked.iloc[:10]
print(top_10_features.to_string(index=False))

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.
Training model on 1102 samples...
Model Accuracy: 83.42%
------------------------------
Top 10 Most Important Factors for Attrition:
           Feature  Importance
     MonthlyIncome    0.075142
               Age    0.061101
         DailyRate    0.054607
 TotalWorkingYears    0.052489
  DistanceFromHome    0.047524
    YearsAtCompany    0.047290
        HourlyRate    0.045831
       MonthlyRate    0.045651
      OverTime_Yes    0.040172
NumCompaniesWorked    0.036898
