In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [2]:
# Load dataset
data = pd.read_csv("HR-EmployeeAttrition.csv")
features = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
            'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender',
            'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
            'MaritalStatus', 'MonthlyIncome',
            'OverTime','RelationshipSatisfaction',
            'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
            'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

# Separate dataset into features and target variable
X = data[features]
y = data['Attrition']

print(X.head())

   Age     BusinessTravel              Department  DistanceFromHome  \
0   30      Travel_Rarely  Research & Development                 1   
1   48  Travel_Frequently  Research & Development                 4   
2   27      Travel_Rarely  Research & Development                 1   
3   34      Travel_Rarely                   Sales                28   
4   23  Travel_Frequently                   Sales                 9   

   Education EducationField  EnvironmentSatisfaction  Gender  JobInvolvement  \
0          2        Medical                        4    Male               3   
1          5        Medical                        3    Male               2   
2          2        Medical                        4    Male               3   
3          3      Marketing                        4  Female               2   
4          3      Marketing                        4    Male               3   

   JobLevel  ... OverTime  RelationshipSatisfaction StockOptionLevel  \
0         1  ...    

In [3]:
import plotly.express as px

def cat_summary_plotly(dataframe, col_name):
    summary_df = dataframe[col_name].value_counts().reset_index()
    summary_df.columns = ['value', 'count']
    summary_df['percentage'] = 100 * summary_df['count'] / len(dataframe)
    summary_df['percentage'] = summary_df['percentage'].round(2)  # 2 decimal places

    fig = px.bar(summary_df, x='value', y='count', text='percentage',
                 title=f'{col_name} Count and Percentage',
                 labels={'value': col_name, 'count': 'Count', 'percentage': 'Percentage'})
    fig.show()

In [28]:
# Display descriptive statistics of 'Age' column
print("Descriptive statistics of 'Age' column:")
print(data['Age'].describe())

# Categorize 'Age' column into 4 categories
data['Age_Category'] = pd.cut(data['Age'], bins=[-1, 25, 37, 47, data['Age'].max()], labels=['0-25', '26-37', '38-47', f'48-{data["Age"].max()}'])

# Display data with 'Age' and 'Age_Category' columns
print("\nData with 'Age' and 'Age_Category' columns:")
print(data[['Age', 'Age_Category']].head(10))

# Display count and percentage of each category in 'Age_Category' column
cat_summary_plotly(data, 'Age_Category')

Descriptive statistics of 'Age' column:
count    1323.000000
mean       36.867725
std         9.130830
min        18.000000
25%        30.000000
50%        36.000000
75%        43.000000
max        60.000000
Name: Age, dtype: float64

Data with 'Age' and 'Age_Category' columns:
   Age Age_Category
0   30        26-37
1   48        48-60
2   27        26-37
3   34        26-37
4   23         0-25
5   29        26-37
6   45        38-47
7   29        26-37
8   28        26-37
9   36        26-37


In [40]:
# Display descriptive statistics of 'StockOptionLevel' column
print("Descriptive statistics of 'StockOptionLevel' column:")
print(data['StockOptionLevel'].describe())

# Categorize stock option level based on custom bins
data['StockOptionLevel_Category'] = pd.cut(data['StockOptionLevel'], bins=[0, 1.5, 3.5], labels=['Group1-2', 'Group3-4'], right=False)


# Display data with 'StockOptionLevel' and 'StockOptionLevelGroup' columns
print("\nData with 'StockOptionLevel' and 'StockOptionLevel_Category' columns:")
print(data[['StockOptionLevel', 'StockOptionLevel_Category']].head(10))

# Display count and percentage of each category in 'StockOptionLevelGroup' column
cat_summary_plotly(data, 'StockOptionLevel_Category')


Descriptive statistics of 'StockOptionLevel' column:
count    1323.000000
mean        0.803477
std         0.859627
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         3.000000
Name: StockOptionLevel, dtype: float64

Data with 'StockOptionLevel' and 'StockOptionLevel_Category' columns:
   StockOptionLevel StockOptionLevel_Category
0                 0                  Group1-2
1                 1                  Group1-2
2                 1                  Group1-2
3                 2                  Group3-4
4                 1                  Group1-2
5                 2                  Group3-4
6                 2                  Group3-4
7                 1                  Group1-2
8                 0                  Group1-2
9                 1                  Group1-2


In [30]:
# Display descriptive statistics of 'DistancefromHome' column
print("Descriptive statistics of 'DistancefromHome' column:")
print(data['DistanceFromHome'].describe())

# Categorize 'DistanceFromHome' into 'Near' and 'Far' categories
data['Distance_Category'] = pd.cut(data['DistanceFromHome'], bins=[-1, 10, data['DistanceFromHome'].max()], labels=['Near', 'Far'])

# Display data with 'DistanceFromHome' and 'Distance_Category' columns
print("\nData with 'DistanceFromHome' and 'Distance_Category' columns:")
print(data[['DistanceFromHome', 'Distance_Category']])

# Display count and percentage of each category in 'Distance_Category' column
cat_summary_plotly(data, 'Distance_Category')


Descriptive statistics of 'DistancefromHome' column:
count    1323.000000
mean        9.195011
std         8.110139
min         1.000000
25%         2.000000
50%         7.000000
75%        14.000000
max        29.000000
Name: DistanceFromHome, dtype: float64

Data with 'DistanceFromHome' and 'Distance_Category' columns:
      DistanceFromHome Distance_Category
0                    1              Near
1                    4              Near
2                    1              Near
3                   28               Far
4                    9              Near
...                ...               ...
1318                15               Far
1319                10              Near
1320                24               Far
1321                 2              Near
1322                 8              Near

[1323 rows x 2 columns]


In [41]:
# Display descriptive statistics of 'MonthlyIncome' column
print("Descriptive statistics of 'MonthlyIncome' column:")
print(data['MonthlyIncome'].describe())

# Categorize 'MonthlyIncome' into 5 categories
data['MonthlyIncome_Category'] = pd.cut(data['MonthlyIncome'], bins=[-1, 2000, 3000, 7000, 11000, data['MonthlyIncome'].max()], labels=['0-2000', '2001-3000', '3001-7000', '7001-11000', f'11001-{data["MonthlyIncome"].max()}'])

# Display data with 'MonthlyIncome' and 'Income_Category' columns
print("\nData with 'MonthlyIncome' and 'MonthlyIncome_Category' columns:")
print(data[['MonthlyIncome', 'MonthlyIncome_Category']].head(10))

# Display count and percentage of each category in 'Income_Category' column
cat_summary_plotly(data, 'MonthlyIncome_Category')

Descriptive statistics of 'MonthlyIncome' column:
count     1323.000000
mean      6517.230537
std       4722.151172
min       1009.000000
25%       2918.500000
50%       4898.000000
75%       8487.000000
max      19999.000000
Name: MonthlyIncome, dtype: float64

Data with 'MonthlyIncome' and 'MonthlyIncome_Category' columns:
   MonthlyIncome MonthlyIncome_Category
0           3748              3001-7000
1          15202            11001-19999
2           3816              3001-7000
3           6712              3001-7000
4           1790                 0-2000
5           6384              3001-7000
6           9380             7001-11000
7           2703              2001-3000
8           3485              3001-7000
9           5562              3001-7000


In [32]:

# Display descriptive statistics of 'TotalWorkingYears' column
print("Descriptive statistics of 'TotalWorkingYears' column:")
print(data['TotalWorkingYears'].describe())

# Categorize 'TotalWorkingYears' into '0-3 Years', '4-10 Years', '11-25 Years', '26+ Years' categories
max_working_years = data['TotalWorkingYears'].max()
data['TotalWorkingYears_Category'] = pd.cut(data['TotalWorkingYears'], bins=[-1, 3, 10, 25, max_working_years], labels=['0-3 Years', '4-10 Years', '11-25 Years',  f'26-{max_working_years} Years'])

# Display data with 'TotalWorkingYears' and 'TotalWorkingYears_Category' columns
print("\nData with 'TotalWorkingYears' and 'TotalWorkingYears_Category' columns:")
print(data[['TotalWorkingYears', 'TotalWorkingYears_Category']].head(10))

# Display count and percentage of each category in 'TotalWorkingYears_Category' column
cat_summary_plotly(data, 'TotalWorkingYears_Category')


Descriptive statistics of 'TotalWorkingYears' column:
count    1323.000000
mean       11.331066
std         7.827035
min         0.000000
25%         6.000000
50%        10.000000
75%        15.000000
max        40.000000
Name: TotalWorkingYears, dtype: float64

Data with 'TotalWorkingYears' and 'TotalWorkingYears_Category' columns:
   TotalWorkingYears TotalWorkingYears_Category
0                 12                11-25 Years
1                 23                11-25 Years
2                  5                 4-10 Years
3                  8                 4-10 Years
4                  1                  0-3 Years
5                 11                11-25 Years
6                 10                 4-10 Years
7                  6                 4-10 Years
8                  5                 4-10 Years
9                  9                 4-10 Years


In [33]:
# Display descriptive statistics of 'YearsAtCompany' column
print("Descriptive statistics of 'YearsAtCompany' column:")
print(data['YearsAtCompany'].describe())

# Categorize 'YearsAtCompany' into '0-5', '6-11', '11-max' categories
data['YearsAtCompany_Category'] = pd.cut(data['YearsAtCompany'], bins=[-1, 5, 11, data['YearsAtCompany'].max()], labels=['0-5', '6-11', f'11-{data["YearsAtCompany"].max()}'])

# Display data with 'YearsAtCompany' and 'YearsAtCompany_Category' columns
print("\nData with 'YearsAtCompany' and 'YearsAtCompany_Category' columns:")
print(data[['YearsAtCompany', 'YearsAtCompany_Category']].head(10))

# Display count and percentage of each category in 'YearsAtCompany_Category' column
cat_summary_plotly(data, 'YearsAtCompany_Category')


Descriptive statistics of 'YearsAtCompany' column:
count    1323.000000
mean        7.048375
std         6.154153
min         0.000000
25%         3.000000
50%         5.000000
75%        10.000000
max        40.000000
Name: YearsAtCompany, dtype: float64

Data with 'YearsAtCompany' and 'YearsAtCompany_Category' columns:
   YearsAtCompany YearsAtCompany_Category
0              12                   11-40
1               2                     0-5
2               5                     0-5
3               8                    6-11
4               1                     0-5
5               7                    6-11
6               3                     0-5
7               5                     0-5
8               0                     0-5
9               3                     0-5


In [34]:
# Display descriptive statistics of 'YearsInCurrentRole' column
print("Descriptive statistics of 'YearsInCurrentRole' column:")
print(data['YearsInCurrentRole'].describe())

# Categorize 'YearsInCurrentRole' into '0-2', '3-4', '5-6', '7-9', '9-max' categories
data['YearsInCurrentRole_Category'] = pd.cut(data['YearsInCurrentRole'], bins=[-1, 2, 4, 6, 9, data['YearsInCurrentRole'].max()], labels=['0-2', '3-4', '5-6', '7-9', f'9-{data["YearsInCurrentRole"].max()}'])

# Display data with 'YearsInCurrentRole' and 'YearsInCurrentRole_Category' columns
print("\nData with 'YearsInCurrentRole' and 'YearsInCurrentRole_Category' columns:")
print(data[['YearsInCurrentRole', 'YearsInCurrentRole_Category']].head(10))

# Display count and percentage of each category in 'YearsInCurrentRole_Category' column
cat_summary_plotly(data, 'YearsInCurrentRole_Category')


Descriptive statistics of 'YearsInCurrentRole' column:
count    1323.000000
mean        4.256992
std         3.639860
min         0.000000
25%         2.000000
50%         3.000000
75%         7.000000
max        18.000000
Name: YearsInCurrentRole, dtype: float64

Data with 'YearsInCurrentRole' and 'YearsInCurrentRole_Category' columns:
   YearsInCurrentRole YearsInCurrentRole_Category
0                   8                         7-9
1                   2                         0-2
2                   2                         0-2
3                   7                         7-9
4                   0                         0-2
5                   0                         0-2
6                   1                         0-2
7                   4                         3-4
8                   0                         0-2
9                   2                         0-2


In [35]:
# # Years Since Last Promotion

# # Display descriptive statistics of 'YearsSinceLastPromotion' column
# print("Descriptive statistics of 'YearsSinceLastPromotion' column:")
# print(data['YearsSinceLastPromotion'].describe())

# # Categorize 'YearsSinceLastPromotion' into 'Low', 'Medium', 'High' categories
# data['YearsSinceLastPromotion_Category'] = pd.qcut(data['YearsSinceLastPromotion'], q=[0, 0.33, 0.66, 1.0], labels=['Low', 'Medium', 'High'])

# # Display data with 'YearsSinceLastPromotion' and 'YearsSinceLastPromotion_Category' columns
# print("\nData with 'YearsSinceLastPromotion' and 'YearsSinceLastPromotion_Category' columns:")
# print(data[['YearsSinceLastPromotion', 'YearsSinceLastPromotion_Category']].head(10))


In [36]:
# Years With Current Manager

# Display descriptive statistics of 'YearsWithCurrManager' column
print("Descriptive statistics of 'YearsWithCurrManager' column:")
print(data['YearsWithCurrManager'].describe())

# Categorize 'YearsWithCurrManager' into '-1-1', '2-2', '3-4', '5-7', '8-max' categories
data['YearsWithCurrManager_Category'] = pd.cut(data['YearsWithCurrManager'], bins=[-1, 2, 7, data['YearsWithCurrManager'].max()], labels=['0-2', '3-7', f'8-{data["YearsWithCurrManager"].max()}'])

# Display data with 'YearsWithCurrManager' and 'YearsWithCurrManager_Category' columns
print("\nData with 'YearsWithCurrManager' and 'YearsWithCurrManager_Category' columns:")
print(data[['YearsWithCurrManager', 'YearsWithCurrManager_Category']].head(10))

# Display count and percentage of each category in 'YearsWithCurrManager_Category' column
cat_summary_plotly(data, 'YearsWithCurrManager_Category')

Descriptive statistics of 'YearsWithCurrManager' column:
count    1323.000000
mean        4.164777
std         3.596317
min         0.000000
25%         2.000000
50%         3.000000
75%         7.000000
max        17.000000
Name: YearsWithCurrManager, dtype: float64

Data with 'YearsWithCurrManager' and 'YearsWithCurrManager_Category' columns:
   YearsWithCurrManager YearsWithCurrManager_Category
0                     7                           3-7
1                     2                           0-2
2                     4                           3-7
3                     7                           3-7
4                     0                           0-2
5                     6                           3-7
6                     2                           0-2
7                     4                           3-7
8                     0                           0-2
9                     2                           0-2


In [38]:
# # Display original categorical columns
# print("Orjinal categorical variables:")
# print(X.select_dtypes(include=['object']).head())

# Process categorical variables
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    X.loc[:, col] = label_encoders[col].fit_transform(X[col])

# # Display transformed categorical columns
# print("\nTransformed categorical variables:")
# print(X[categorical_cols].head())

In [39]:
attributes_to_print = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
            'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender',
            'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
            'MaritalStatus', 'MonthlyIncome',
            'OverTime','RelationshipSatisfaction',
            'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
            'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition']


def extract_data(point):
    return {attr: getattr(point, attr) for attr in attributes_to_print}

len_data = len(data)
for i in range(5):
    rand = random.randint(0, len_data - 1)
    point = data.iloc[rand]
    print(", ".join([f"{attr}: {point[attr]}" for attr in attributes_to_print]))


Age: 42, BusinessTravel: Travel_Frequently, Department: Research & Development, DistanceFromHome: 5, Education: 2, EducationField: Other, EnvironmentSatisfaction: 2, Gender: Male, JobInvolvement: 3, JobLevel: 1, JobRole: Laboratory Technician, JobSatisfaction: 3, MaritalStatus: Married, MonthlyIncome: 2093, OverTime: No, RelationshipSatisfaction: 4, StockOptionLevel: 1, TotalWorkingYears: 8, TrainingTimesLastYear: 4, WorkLifeBalance: 3, YearsAtCompany: 2, YearsInCurrentRole: 2, YearsSinceLastPromotion: 2, YearsWithCurrManager: 0, Attrition: No
Age: 55, BusinessTravel: Travel_Rarely, Department: Research & Development, DistanceFromHome: 1, Education: 3, EducationField: Medical, EnvironmentSatisfaction: 4, Gender: Male, JobInvolvement: 3, JobLevel: 5, JobRole: Manager, JobSatisfaction: 1, MaritalStatus: Single, MonthlyIncome: 19045, OverTime: Yes, RelationshipSatisfaction: 3, StockOptionLevel: 0, TotalWorkingYears: 37, TrainingTimesLastYear: 2, WorkLifeBalance: 3, YearsAtCompany: 36, Yea

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

In [19]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Logistic Regression model
model = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=5000)

# Train the model
model.fit(X_train, y_train)

# Predict the target attribute
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate the accuracy of the model
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Print the accuracy of the model
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

Train Accuracy: 0.8686200378071833
Test Accuracy: 0.8679245283018868
