In [24]:
# import our modules
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2, f_classif, SelectFromModel
from sklearn.ensemble import RandomForestClassifier

In [31]:
# displaying all columns in the dataset
pd.set_option('display.max_columns', None)

#reading and displaying the dataset in pandas
data = pd.read_csv('Attrition.csv', low_memory=False)

# Sample and display 5 rows of our dataset
data.sample(5)

Unnamed: 0,id,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
38,39,36,No,Travel_Rarely,852,Research & Development,5,4,Life Sciences,1,51,2,Female,82,2,1,Research Scientist,1,Married,3419,13072,9,Y,Yes,14,3,4,80,1,6,3,4,1,1,0,0
1351,1352,48,No,Travel_Frequently,117,Research & Development,22,3,Medical,1,1900,4,Female,58,3,4,Manager,4,Divorced,17174,2437,3,Y,No,11,3,2,80,1,24,3,3,22,17,4,7
1103,1104,48,No,Travel_Rarely,492,Sales,16,4,Life Sciences,1,1557,3,Female,96,3,2,Sales Executive,3,Divorced,6439,13693,8,Y,No,14,3,3,80,1,18,2,3,8,7,7,7
1339,1340,22,Yes,Travel_Rarely,391,Research & Development,7,1,Life Sciences,1,1878,4,Male,75,3,1,Research Scientist,2,Single,2472,26092,1,Y,Yes,23,4,1,80,0,1,2,3,1,0,0,0
436,437,33,Yes,Travel_Rarely,587,Research & Development,10,1,Medical,1,584,1,Male,38,1,1,Laboratory Technician,4,Divorced,3408,6705,7,Y,No,13,3,1,80,3,8,2,3,4,3,1,3


# Converting categorical range to binary

# We have to convert our age range to binary, in other to avoid errors when working on our model

In [32]:
# we make use of the map function to reassign the data in the column
data['Attrition'] = data.Attrition.map({'Yes' : 1,
                                    'No': 0})

# Sample and display 5 rows after the transformation
data.Attrition.sample(5)

309     0
27      0
204     1
1288    0
212     0
Name: Attrition, dtype: int64

In [33]:
# we make use of the map function to reassign the data in the column
data['OverTime'] = data.OverTime.map({'Yes' : 1,
                                    'No': 0})

# Sample and display 5 rows after the transformation
data.OverTime.sample(5)

1370    0
1209    0
550     0
319     1
109     0
Name: OverTime, dtype: int64

In [34]:
data.drop(columns=['id', 'EmployeeCount', 'EmployeeNumber'], inplace=True, axis=1)
data.sample(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1404,42,0,Non-Travel,335,Research & Development,23,2,Life Sciences,4,Male,37,2,2,Research Scientist,3,Single,4332,14811,1,Y,0,12,3,4,80,0,20,2,3,20,9,3,7
67,45,0,Travel_Rarely,1339,Research & Development,7,3,Life Sciences,2,Male,59,3,3,Research Scientist,1,Divorced,9724,18787,2,Y,0,17,3,3,80,1,25,2,3,1,0,0,0
1353,34,1,Non-Travel,967,Research & Development,16,4,Technical Degree,4,Male,85,1,1,Research Scientist,1,Married,2307,14460,1,Y,1,23,4,2,80,1,5,2,3,5,2,3,0
1443,42,0,Travel_Rarely,300,Research & Development,2,3,Life Sciences,1,Male,56,3,5,Manager,3,Married,18880,17312,5,Y,0,11,3,1,80,0,24,2,2,22,6,4,14
1238,23,0,Travel_Rarely,160,Research & Development,4,1,Medical,3,Female,51,3,1,Laboratory Technician,2,Single,3295,12862,1,Y,0,13,3,3,80,0,3,3,1,3,2,1,2


In [35]:
data.describe()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,0.161224,802.485714,9.192517,2.912925,2.721769,65.891156,2.729932,2.063946,2.728571,6502.931293,14313.103401,2.693197,0.282993,15.209524,3.153741,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,0.367863,403.5091,8.106864,1.024165,1.093082,20.329428,0.711561,1.10694,1.102846,4707.956783,7117.786044,2.498009,0.450606,3.659938,0.360824,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,0.0,102.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,1009.0,2094.0,0.0,0.0,11.0,3.0,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,0.0,465.0,2.0,2.0,2.0,48.0,2.0,1.0,2.0,2911.0,8047.0,1.0,0.0,12.0,3.0,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,0.0,802.0,7.0,3.0,3.0,66.0,3.0,2.0,3.0,4919.0,14235.5,2.0,0.0,14.0,3.0,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,0.0,1157.0,14.0,4.0,4.0,83.75,3.0,3.0,4.0,8379.0,20461.5,4.0,1.0,18.0,3.0,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1.0,1499.0,29.0,5.0,4.0,100.0,4.0,5.0,4.0,19999.0,26999.0,9.0,1.0,25.0,4.0,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [38]:
data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [42]:
columns = ['Age', 'Attrition', 'DailyRate',
       'DistanceFromHome', 'Education',
       'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobSatisfaction',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager']
data[columns].sample(5)

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
312,31,0,192,2,4,3,32,3,1,4,2695,7747,0,1,18,3,2,80,1,3,2,1,2,2,2,2
1343,29,0,592,7,3,4,59,3,1,1,2062,19384,3,0,14,3,2,80,0,11,2,3,3,2,1,2
605,38,0,471,12,3,1,45,2,2,1,6288,4284,2,0,15,3,3,80,1,13,3,2,4,3,1,2
79,46,0,945,5,2,2,80,3,2,2,5021,10425,8,1,22,4,4,80,1,16,2,3,4,2,0,2
835,35,0,528,8,4,3,100,3,1,3,4323,7108,1,0,17,3,2,80,0,6,2,1,5,4,1,4


In [43]:
# Print the correlation matrix
data[columns].corr()


Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,-0.159205,0.010661,-0.001686,0.208034,0.010146,0.024287,0.02982,0.509604,-0.004892,0.497855,0.028051,0.299635,0.028062,0.003634,0.001904,0.053535,,0.03751,0.680381,-0.019621,-0.02149,0.311309,0.212901,0.216513,0.202089
Attrition,-0.159205,1.0,-0.056652,0.077924,-0.031373,-0.103369,-0.006846,-0.130016,-0.169105,-0.103481,-0.15984,0.01517,0.043494,0.246118,-0.013478,0.002889,-0.045872,,-0.137145,-0.171063,-0.059478,-0.063939,-0.134392,-0.160545,-0.033019,-0.156199
DailyRate,0.010661,-0.056652,1.0,-0.004985,-0.016806,0.018355,0.023381,0.046135,0.002966,0.030571,0.007707,-0.032182,0.038153,0.009135,0.022704,0.000473,0.007846,,0.042143,0.014515,0.002453,-0.037848,-0.034055,0.009932,-0.033229,-0.026363
DistanceFromHome,-0.001686,0.077924,-0.004985,1.0,0.021042,-0.016075,0.031131,0.008783,0.005303,-0.003669,-0.017014,0.027473,-0.029251,0.025514,0.040235,0.02711,0.006557,,0.044872,0.004628,-0.036942,-0.026556,0.009508,0.018845,0.010029,0.014406
Education,0.208034,-0.031373,-0.016806,0.021042,1.0,-0.027128,0.016775,0.042438,0.101589,-0.011296,0.094961,-0.026084,0.126317,-0.020322,-0.011111,-0.024539,-0.009118,,0.018422,0.14828,-0.0251,0.009819,0.069114,0.060236,0.054254,0.069065
EnvironmentSatisfaction,0.010146,-0.103369,0.018355,-0.016075,-0.027128,1.0,-0.049857,-0.008278,0.001212,-0.006784,-0.006259,0.0376,0.012594,0.070132,-0.031701,-0.029548,0.007665,,0.003432,-0.002693,-0.019359,0.027627,0.001458,0.018007,0.016194,-0.004999
HourlyRate,0.024287,-0.006846,0.023381,0.031131,0.016775,-0.049857,1.0,0.042861,-0.027853,-0.071335,-0.015794,-0.015297,0.022157,-0.007782,-0.009062,-0.002172,0.00133,,0.050263,-0.002334,-0.008548,-0.004607,-0.019582,-0.024106,-0.026716,-0.020123
JobInvolvement,0.02982,-0.130016,0.046135,0.008783,0.042438,-0.008278,0.042861,1.0,-0.01263,-0.021476,-0.015271,-0.016322,0.015012,-0.003507,-0.017205,-0.029071,0.034297,,0.021523,-0.005533,-0.015338,-0.014617,-0.021355,0.008717,-0.024184,0.025976
JobLevel,0.509604,-0.169105,0.002966,0.005303,0.101589,0.001212,-0.027853,-0.01263,1.0,-0.001944,0.9503,0.039563,0.142501,0.000544,-0.03473,-0.021222,0.021642,,0.013984,0.782208,-0.018191,0.037818,0.534739,0.389447,0.353885,0.375281
JobSatisfaction,-0.004892,-0.103481,0.030571,-0.003669,-0.011296,-0.006784,-0.071335,-0.021476,-0.001944,1.0,-0.007157,0.000644,-0.055699,0.024539,0.020002,0.002297,-0.012454,,0.01069,-0.020185,-0.005779,-0.019459,-0.003803,-0.002305,-0.018214,-0.027656


In [48]:
df = data[columns]
# Separate features (X) and target variable (y)
X = df.drop('Attrition', axis=1)
y = df['Attrition']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [62]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [63]:
lr_predictions = lr.predict(X_test)
print("Logistic Regression Accuracy:", lr_predictions)

Logistic Regression Accuracy: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]


In [67]:
lr_accuracy = accuracy_score(lr_predictions, y_test)
print("Logistic Regression Accuracy:", lr_accuracy)

Logistic Regression Accuracy: 0.8707482993197279


In [49]:
# Method 1: Correlation Analysis
correlation_matrix = df.corr()
correlation_with_target = correlation_matrix['Attrition'].abs().sort_values(ascending=False)
selected_features_corr = correlation_with_target[1:].index.tolist()  # Select top 5 correlated features
print("Selected features using correlation analysis:", selected_features_corr)


Selected features using correlation analysis: ['OverTime', 'TotalWorkingYears', 'JobLevel', 'YearsInCurrentRole', 'MonthlyIncome', 'Age', 'YearsWithCurrManager', 'StockOptionLevel', 'YearsAtCompany', 'JobInvolvement', 'JobSatisfaction', 'EnvironmentSatisfaction', 'DistanceFromHome', 'WorkLifeBalance', 'TrainingTimesLastYear', 'DailyRate', 'RelationshipSatisfaction', 'NumCompaniesWorked', 'YearsSinceLastPromotion', 'Education', 'MonthlyRate', 'PercentSalaryHike', 'HourlyRate', 'PerformanceRating', 'StandardHours']


In [61]:
# Method 2: Feature Importance with Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
feature_importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
selected_features_rf = feature_importances[1:].index.tolist()  # Select top 5 important features
print("Selected features using feature importance (Random Forest):", selected_features_rf)

Selected features using feature importance (Random Forest): ['OverTime', 'DailyRate', 'Age', 'MonthlyRate', 'TotalWorkingYears', 'HourlyRate', 'DistanceFromHome', 'YearsAtCompany', 'PercentSalaryHike', 'StockOptionLevel', 'NumCompaniesWorked', 'YearsWithCurrManager', 'YearsSinceLastPromotion', 'JobSatisfaction', 'EnvironmentSatisfaction', 'TrainingTimesLastYear', 'JobInvolvement', 'RelationshipSatisfaction', 'YearsInCurrentRole', 'WorkLifeBalance', 'JobLevel', 'Education', 'PerformanceRating', 'StandardHours']


In [60]:
sfm_selector = SelectFromModel(rf, threshold='median')
X_train_sfm = sfm_selector.fit_transform(X_train, y_train)
X_test_sfm = sfm_selector.transform(X_test)