In [66]:
!pip -q install -r requirements.txt

In [174]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

# Seed NumPy's legacy global RNG; code that draws from np.random without an
# explicit random_state follows this seed.
np.random.seed(123)
# Seed passed explicitly as random_state to train_test_split and SMOTE below.
RANDOM_STATE = 42

In [175]:
# Load the raw CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('HR_comma_sep.csv')

In [176]:
# Column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [177]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.238083,0.021268
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.425924,0.144281
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


In [178]:
# Preview the raw frame.
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,support,low
14995,0.37,0.48,2,160,3,0,1,0,support,low
14996,0.37,0.53,2,143,3,0,1,0,support,low
14997,0.11,0.96,6,280,4,0,1,0,support,low


In [179]:
# Normalise column names: fix the "montly" typo, lower-case the capitalised
# columns, and use shorter / more consistent names.
df = df.rename(
    columns={
        'average_montly_hours': 'average_monthly_hours',
        'time_spend_company': 'tenure',
        'Work_accident': 'work_accident',
        'Department': 'department',
        'promotion_last_5years': 'promotion_last_five_years',
    }
)

In [180]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# The index is not reset, so the surviving rows keep their original labels.
df = df.drop_duplicates(keep='first')
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,tenure,work_accident,left,promotion_last_five_years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
11995,0.90,0.55,3,259,10,1,0,1,management,high
11996,0.74,0.95,5,266,10,0,0,1,management,high
11997,0.85,0.54,3,185,10,0,0,1,management,high
11998,0.33,0.65,3,172,10,0,0,1,marketing,high


In [181]:
# Encode the categorical columns on a copy of the de-duplicated frame.
df_encoded = df.copy()

# Ordinal encoding for salary (low < medium < high).
salary_levels = {'low': 1, 'medium': 2, 'high': 3}

# Fail loudly on values outside the mapping. This also catches an accidental
# second run of this cell: `df` is rebound to the encoded frame at the end, and
# mapping the already-encoded integers again would silently turn salary into NaN.
unmapped_salary = ~df_encoded['salary'].isin(list(salary_levels))
if unmapped_salary.any():
    raise ValueError(
        "Unexpected salary values (was this cell already run?): "
        f"{df_encoded.loc[unmapped_salary, 'salary'].unique().tolist()}"
    )
df_encoded['salary'] = df_encoded['salary'].map(salary_levels)

# One-hot encode the remaining object column(s); drop_first removes the first
# level of each to avoid a redundant dummy. dtype=int already yields integer
# columns, so no per-column astype(int) is needed afterwards.
df_encoded = pd.get_dummies(df_encoded, drop_first=True, dtype=int)

# Downstream cells read the encoded data through the name `df`.
df = df_encoded
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,tenure,work_accident,left,promotion_last_five_years,salary,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical
0,0.38,0.53,2,157,3,0,1,0,1,0,0,0,0,0,0,1,0,0
1,0.80,0.86,5,262,6,0,1,0,2,0,0,0,0,0,0,1,0,0
2,0.11,0.88,7,272,4,0,1,0,2,0,0,0,0,0,0,1,0,0
3,0.72,0.87,5,223,5,0,1,0,1,0,0,0,0,0,0,1,0,0
4,0.37,0.52,2,159,3,0,1,0,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,0.90,0.55,3,259,10,1,0,1,3,0,0,0,1,0,0,0,0,0
11996,0.74,0.95,5,266,10,0,0,1,3,0,0,0,1,0,0,0,0,0
11997,0.85,0.54,3,185,10,0,0,1,3,0,0,0,1,0,0,0,0,0
11998,0.33,0.65,3,172,10,0,0,1,3,0,0,0,0,1,0,0,0,0


# Train/Test split

In [186]:
# Separate the target column ('left') from the feature columns.
y = df['left']
X = df.drop(columns='left')

In [187]:
# Hold out 30% of the rows as the test split; RANDOM_STATE fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_STATE,
)

In [198]:
# Report the shape of the features and target for each split.
for _header, _features, _target in (
    ("\nTrain", X_train, y_train),
    ("\nTest", X_test, y_test),
):
    print(_header)
    print("Features shape:", _features.shape)
    print("Target shape:", _target.shape)


Train
Features shape: (8393, 17)
Target shape: (8393,)

Test
Features shape: (3598, 17)
Target shape: (3598,)


# Balancing the data

In [199]:
def balance_data(X_train, y_train, method):
    """Oversample the training data with the requested balancing strategy.

    Parameters
    ----------
    X_train
        Training features, passed to the sampler's ``fit_resample``.
    y_train
        Training labels aligned with ``X_train``.
    method : str
        Name of the balancing strategy. Only ``'smote'`` is supported.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as produced by ``fit_resample``.

    Raises
    ------
    ValueError
        If ``method`` is not a supported strategy. Previously this case
        returned ``(None, None)`` and only failed later, far from the cause.
    """
    if method == 'smote':
        sampler = SMOTE(random_state=RANDOM_STATE)
        X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
        return X_resampled, y_resampled

    raise ValueError(
        f"Unsupported balancing method: {method!r} (expected 'smote')"
    )

In [200]:
# Oversample the training split only; the test split is not passed to the sampler.
X_train_resampled, y_train_resampled = balance_data(X_train, y_train, 'smote')

In [201]:
# Shape of the training data after oversampling.
print("\nTrain")
print("Features shape:", X_train_resampled.shape)
print("Target shape:", y_train_resampled.shape)


Train
Features shape: (14010, 17)
Target shape: (14010,)


## Scaling Data

In [202]:
def scale_data(X_train, X_test):
    """Standardise both splits with a scaler fitted on the training split only.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; the scaler is fitted on these.
    X_test : pandas.DataFrame
        Test features; transformed with the training-fitted scaler.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_scaled, X_test_scaled)``, each keeping the columns and
        index of its input frame.
    """
    def _as_frame(values, like):
        # Re-wrap the scaler's array output with the labels of the source frame.
        return pd.DataFrame(values, columns=like.columns, index=like.index)

    standardizer = StandardScaler()
    train_values = standardizer.fit_transform(X_train)
    test_values = standardizer.transform(X_test)
    return _as_frame(train_values, X_train), _as_frame(test_values, X_test)

In [204]:
# Fit the scaler on the resampled training data and apply it to the test split.
X_train_resampled_scaled, X_test_scaled = scale_data(X_train_resampled, X_test)
# NOTE: rebinds X_train / y_train to the resampled training data (features also
# scaled). Re-running the balancing cell after this one would therefore resample
# already-resampled, already-scaled data.
X_train, y_train = X_train_resampled_scaled, y_train_resampled
X_train

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,tenure,work_accident,promotion_last_five_years,salary,department_RandD,department_accounting,department_hr,department_management,department_marketing,department_product_mng,department_sales,department_support,department_technical
0,1.527062,-1.964844,-0.568853,-1.720939,-1.295248,-0.324001,-0.103328,2.561918,-0.197312,-0.193969,-0.195155,-0.158657,-0.209305,-0.204048,1.74819,-0.379781,-0.429975
1,-1.345498,1.567982,0.133255,-0.301157,0.386846,3.086409,-0.103328,0.880063,-0.197312,-0.193969,-0.195155,-0.158657,4.777712,-0.204048,-0.57202,-0.379781,-0.429975
2,-0.375543,-0.843312,-1.270962,-1.038706,-0.454201,-0.324001,-0.103328,2.561918,-0.197312,-0.193969,-0.195155,-0.158657,4.777712,-0.204048,-0.57202,-0.379781,-0.429975
3,0.631719,0.390373,0.133255,-0.928074,-0.454201,-0.324001,-0.103328,0.880063,-0.197312,-0.193969,-0.195155,-0.158657,-0.209305,-0.204048,-0.57202,-0.379781,-0.429975
4,-0.039789,0.558603,-1.270962,1.376768,-0.454201,3.086409,-0.103328,-0.801792,-0.197312,-0.193969,-0.195155,-0.158657,4.777712,-0.204048,-0.57202,-0.379781,-0.429975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14005,-0.421015,-1.289276,-1.270962,-1.407481,-0.454201,-0.324001,-0.103328,-0.801792,-0.197312,-0.193969,-0.195155,-0.158657,-0.209305,-0.204048,1.74819,-0.379781,-0.429975
14006,1.228614,1.083281,0.133255,0.528586,2.068940,-0.324001,-0.103328,-0.801792,-0.197312,-0.193969,-0.195155,-0.158657,-0.209305,-0.204048,-0.57202,-0.379781,2.325718
14007,0.745728,0.868597,0.835364,0.676096,1.227893,-0.324001,-0.103328,-0.801792,-0.197312,-0.193969,-0.195155,-0.158657,-0.209305,-0.204048,-0.57202,2.633094,-0.429975
14008,-0.616825,-1.003227,-1.270962,-1.370603,-0.454201,-0.324001,-0.103328,-0.801792,-0.197312,-0.193969,-0.195155,-0.158657,-0.209305,-0.204048,-0.57202,-0.379781,2.325718


# Logistic Regression

In [205]:
# Setup the pipeline steps. The step name 'logreg' is the prefix used by the
# parameter-grid keys and by named_steps lookups.
# random_state pins the data shuffling done by the stochastic 'sag'/'saga'
# solvers, so the fit does not depend on the state of NumPy's global RNG
# (and therefore not on which cells were executed before this one).
steps_logistic = [
    ('logreg', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)),
]

# Create the pipeline
pipeline_logistic = Pipeline(steps_logistic)

# Define the parameter grid
param_grid_logistic = {
    'logreg__C': np.logspace(-4, 4, 20),  # 20 log-spaced values from 1e-4 to 1e4
    'logreg__penalty': ['l2'],
    'logreg__solver': ['sag', 'saga'],  # stochastic average gradient solvers
}

# Setup the GridSearchCV
# NOTE(review): X_train here is the SMOTE-resampled, scaled training set, so the
# validation folds contain synthetic samples and the scaler has already seen
# every fold. The CV score may be optimistic relative to the held-out test set.
grid_logistic = GridSearchCV(pipeline_logistic, param_grid_logistic, cv=5, scoring='accuracy')
grid_logistic.fit(X_train, y_train)
best_logistic_model = grid_logistic.best_estimator_


# Best model
print("Best parameters (Logistic Regression):", grid_logistic.best_params_)
print("Best score (Logistic Regression):", grid_logistic.best_score_)


Best parameters (Logistic Regression): {'logreg__C': np.float64(0.012742749857031334), 'logreg__penalty': 'l2', 'logreg__solver': 'saga'}
Best score (Logistic Regression): 0.8214132762312634


In [206]:
# Score the held-out (scaled) test split with the tuned model.
y_pred = best_logistic_model.predict(X_test_scaled)
y_prob = best_logistic_model.predict_proba(X_test_scaled)  # one probability column per class

# Gather labels, predictions and class probabilities in one frame, indexed like y_test.
df_test = pd.DataFrame({
    'True_Y': y_test,
    'Predicted_Y': y_pred,
    'Prob_Class_0': y_prob[:, 0],  # Probability of class 0
    'Prob_Class_1': y_prob[:, 1],  # Probability of class 1
})
# Row-wise flag: did the predicted label match the true label?
df_test['Correct_Prediction'] = df_test['True_Y'].eq(df_test['Predicted_Y'])
df_test

Unnamed: 0,True_Y,Predicted_Y,Prob_Class_0,Prob_Class_1,Correct_Prediction
397,1,1,0.070833,0.929167,True
3107,0,0,0.780317,0.219683,True
9331,0,0,0.786991,0.213009,True
357,1,1,0.221455,0.778545,True
1326,1,1,0.233452,0.766548,True
...,...,...,...,...,...
10403,0,0,0.803642,0.196358,True
9326,0,1,0.475137,0.524863,False
3555,0,0,0.706047,0.293953,True
3536,0,1,0.044581,0.955419,False


In [207]:
# Compute every metric from the same pair of label columns, then report them.
labels_true = df_test['True_Y']
labels_pred = df_test['Predicted_Y']

acc = accuracy_score(labels_true, labels_pred)
conf_matrix = confusion_matrix(labels_true, labels_pred)
f1_per_class = f1_score(labels_true, labels_pred, average=None)  # one score per class
f1_macro = f1_score(labels_true, labels_pred, average='macro')
f1_micro = f1_score(labels_true, labels_pred, average='micro')
f1_weighted = f1_score(labels_true, labels_pred, average='weighted')

print("Accuracy score is:", acc)
print("Confusion Matrix:")
print(conf_matrix)
print("F1-score per class is:", f1_per_class)
print("Macro F1-score is:", f1_macro)
print("Micro F1-score is:", f1_micro)
print("Weighted F1-score is:", f1_weighted)

Accuracy score is: 0.7837687604224569
Confusion Matrix:
[[2417  578]
 [ 200  403]]
F1-score per class is: [0.8613685  0.50883838]
Macro F1-score is: 0.6851034399591064
Micro F1-score is: 0.7837687604224569
Weighted F1-score is: 0.8022868791588753


# Coefficients

In [209]:
# Pull the fitted estimator out of the pipeline by its step name.
logreg_model = best_logistic_model.named_steps['logreg']

# The model was fitted on the scaled training features, so the coefficients
# are expressed in scaled-feature units.
coefficients, intercept = logreg_model.coef_, logreg_model.intercept_

print("Coefficients:", coefficients)
print("Intercept:", intercept)

Coefficients: [[-1.17073184  0.21506618 -0.72984295  0.21296115  0.6579482  -0.63303477
  -0.19419866 -0.57281958 -0.5226519  -0.39386214 -0.34519563 -0.44684428
  -0.39053433 -0.41194703 -0.61439253 -0.529527   -0.58427962]]
Intercept: [-0.10220395]
