# If-Else Logic – HR Risk Classification

In [3]:
import os
from pathlib import Path

import pandas as pd

# Location of the input CSV. Set the AUG_TEST_CSV environment variable to run
# this notebook on another machine; when it is unset, the original local path
# is used so existing runs keep working.
DATA_PATH = Path(os.environ.get("AUG_TEST_CSV", r"C:\Users\DHRUV\Downloads\aug_test.csv"))

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,32403,city_41,0.827,Male,Has relevent experience,Full time course,Graduate,STEM,9,<10,,1,21
1,9858,city_103,0.92,Female,Has relevent experience,no_enrollment,Graduate,STEM,5,,Pvt Ltd,1,98
2,31806,city_21,0.624,Male,No relevent experience,no_enrollment,High School,,<1,,Pvt Ltd,never,15
3,27385,city_13,0.827,Male,Has relevent experience,no_enrollment,Masters,STEM,11,10/49,Pvt Ltd,1,39
4,27724,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,10000+,Pvt Ltd,>4,72


In [5]:
# Show each column's dtype and non-null count to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             2129 non-null   int64  
 1   city                    2129 non-null   object 
 2   city_development_index  2129 non-null   float64
 3   gender                  1621 non-null   object 
 4   relevent_experience     2129 non-null   object 
 5   enrolled_university     2098 non-null   object 
 6   education_level         2077 non-null   object 
 7   major_discipline        1817 non-null   object 
 8   experience              2124 non-null   object 
 9   company_size            1507 non-null   object 
 10  company_type            1495 non-null   object 
 11  last_new_job            2089 non-null   object 
 12  training_hours          2129 non-null   int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 216.4+ KB


In [7]:
# Count missing values per column.
df.isna().sum()

enrollee_id                 0
city                        0
city_development_index      0
gender                    508
relevent_experience         0
enrolled_university        31
education_level            52
major_discipline          312
experience                  5
company_size              622
company_type              634
last_new_job               40
training_hours              0
dtype: int64

#  Base rules on experience, training hours, and company size.

In [10]:
# Convert experience to numeric
import pandas as pd
import numpy as np

def exp_to_num(x):
    """Convert a raw ``experience`` entry into a number of years.

    Missing values are returned as ``np.nan``. The open-ended labels
    ``">20"`` and ``"<1"`` become 21 and 0 respectively. Any other value is
    passed to ``int``.
    """
    if pd.isna(x):
        return np.nan

    # The open-ended labels have no direct numeric form, so map them by hand.
    open_ended = {">20": 21, "<1": 0}
    if x in open_ended:
        return open_ended[x]
    return int(x)
# Run the converter over every entry and overwrite the column with the result.
df["experience"] = df["experience"].map(exp_to_num)


In [15]:
# Convert company_size to numeric
def company_size_num(x):
    """Map a ``company_size`` bucket label to a representative head-count.

    Parameters
    ----------
    x : str, number, or missing
        A raw bucket label such as ``"10/49"``, or a value this function has
        already produced.

    Returns
    -------
    int, float, or np.nan
        The representative size for a known label; ``x`` unchanged when it is
        already one of the representative sizes; ``np.nan`` for missing or
        unrecognized input.
    """
    size_by_label = {
        "<10": 5,
        "10/49": 30,
        "50-99": 75,
        "100-500": 300,
        "500-999": 750,
        "1000-4999": 3000,
        "5000-9999": 7000,
        "10000+": 12000,
    }

    if isinstance(x, str):
        return size_by_label.get(x, np.nan)

    # Already-converted values pass through unchanged, so re-running the
    # conversion cell no longer turns the whole column into NaN.
    is_number = isinstance(x, (int, float, np.integer, np.floating))
    if is_number and x in size_by_label.values():
        return x

    return np.nan

# Replace each bucket label with its representative numeric size.
df["company_size"] = df["company_size"].map(company_size_num)


# Create if-else rules to classify employees into High Risk, Medium Risk, and Low Risk.

In [17]:
def risk_rule(row):
    """Classify one row as "High Risk", "Medium Risk", or "Low Risk".

    High Risk:   experience < 3 and training_hours < 30 and company_size < 50.
    Medium Risk: not High Risk, and experience < 7 or training_hours < 50.
    Low Risk:    everything else.

    Comparisons against NaN are False, so a missing value in a compared field
    never satisfies that condition.
    """
    years = row["experience"]

    is_high = years < 3 and row["training_hours"] < 30 and row["company_size"] < 50
    if is_high:
        return "High Risk"

    is_medium = years < 7 or row["training_hours"] < 50
    return "Medium Risk" if is_medium else "Low Risk"

# Evaluate the rule once per row and store the label as a new column.
df["risk_rule_based"] = df.apply(risk_rule, axis="columns")


In [19]:
# Preview the frame, now including the risk_rule_based column.
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,risk_rule_based
0,32403,city_41,0.827,Male,Has relevent experience,Full time course,Graduate,STEM,9.0,5.0,,1,21,Medium Risk
1,9858,city_103,0.92,Female,Has relevent experience,no_enrollment,Graduate,STEM,5.0,,Pvt Ltd,1,98,Medium Risk
2,31806,city_21,0.624,Male,No relevent experience,no_enrollment,High School,,0.0,,Pvt Ltd,never,15,Medium Risk
3,27385,city_13,0.827,Male,Has relevent experience,no_enrollment,Masters,STEM,11.0,30.0,Pvt Ltd,1,39,Medium Risk
4,27724,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,21.0,12000.0,Pvt Ltd,>4,72,Low Risk


# Compare rule-based predictions with a Decision Tree model.

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Features are the three inputs of the rule; the target is the rule's label.
X = df.loc[:, ["experience", "training_hours", "company_size"]]
y = df["risk_rule_based"]

# Hold out 20% of the rows, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Depth-limited tree with a minimum leaf size and a fixed seed.
dt = DecisionTreeClassifier(random_state=42, min_samples_leaf=30, max_depth=5)

dt.fit(X_train, y_train)


0,1,2
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`.",'gini'
,"splitter  splitter: {""best"", ""random""}, default=""best"" The strategy used to choose the split at each node. Supported strategies are ""best"" to choose the best split and ""random"" to choose the best random split.",'best'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",5
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",30
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: int, float or {""sqrt"", ""log2""}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at  each split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. note::  The search for a split does not stop until at least one  valid partition of the node samples is found, even if it requires to  effectively inspect more than ``max_features`` features.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``""best""``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary ` for details.",42
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0


In [25]:
# Score the fitted tree on the held-out split.
y_pred = dt.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
# When a class is never predicted, its precision is undefined.
# zero_division=0 reports it as 0.0 explicitly instead of emitting one
# undefined-metric warning per averaged metric.
print(classification_report(y_test, y_pred, zero_division=0))


Accuracy: 0.9929577464788732
              precision    recall  f1-score   support

   High Risk       0.00      0.00      0.00         3
    Low Risk       1.00      1.00      1.00       125
 Medium Risk       0.99      1.00      0.99       298

    accuracy                           0.99       426
   macro avg       0.66      0.67      0.66       426
weighted avg       0.99      0.99      0.99       426



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Important Note on Model Accuracy

The initial Decision Tree accuracy of about 0.99 indicated data leakage: the
target `risk_rule_based` is computed from the very features the model was trained
on (`experience`, `training_hours`, `company_size`), so the tree only has to re-learn the if-else rules.

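To fix this:
- Keep the rule-based risk classification only for explainability.
- Train the model using actual employee attrition as the target.
- Exclude rule-generated columns from model training.

This would ensure a fair and realistic evaluation of model performance.
Note: the cell below still uses `risk_rule_based` as the target, because the loaded `aug_test.csv` has no attrition column, so its accuracy of 1.0 still reflects the leakage described above.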


In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = df[["experience", "training_hours", "company_size"]]
# NOTE: the target is still the rule-derived label, so this model re-learns
# the if-else rules rather than predicting real attrition.
y = df["risk_rule_based"]

# stratify=y matches the earlier split and keeps the rare "High Risk" class
# represented in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fix random_state so the fitted tree is reproducible across runs.
model = DecisionTreeClassifier(max_depth=4, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 1.0


# Explain when rule-based systems are preferred over machine learning.

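# Rule-Based Systems
Prefer a rule-based system when:

- Business rules are clear and can be written down explicitly.
- Explainability is critical (e.g., HR, banking, healthcare).
- Little historical data is available.
- Regulatory compliance requires auditable decisions.
- The system is small to medium in size, so the rules stay manageable.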

# Machine Learning
Prefer machine learning when:

- Patterns are too complex to capture with hand-written rules.
- Large amounts of historical data exist.
- Rules are not well defined.