In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [8]:
df = pd.read_csv("D:\IBM_Employee_Attrition.csv")

df["Attrition"] = df.apply(lambda row: 1 if row["Attrition"] == "Yes" else 0, axis=1)
df["BusinessTravel"] = df.apply(lambda row:1 if row["BusinessTravel"] == "Travel_Frequently" \
                                else 0, axis=1)
df["Department_Sales"] = df.apply(lambda row:1 if row["Department"] == "Sales" else 0, axis=1)
df["Department_RD"] = df.apply(lambda row:1 if row["Department"] == "Research & Development" \
                               else 0, axis=1)
df["Education"] = df.apply(lambda row: 1 if row["Education"] >= 3 else 0, axis=1)
df["EducationField"] = df.apply(lambda row: 1 if row["EducationField"] == "Life Sciences" \
                                or row["EducationField"] == "Medical" \
                                or row["EducationField"] == "Technical Degree" else 0, axis = 1)
df["Gender"] = df.apply(lambda row: 1 if row["Gender"] == "Male" else 0, axis=1)
df["JobInvolvement"] = df.apply(lambda row: 1 if row["JobInvolvement"] >= 3 else 0, axis=1)
df["JobLevel"] = df.apply(lambda row: 1 if row["JobLevel"] >= 3 else 0, axis=1)
df["JobRole"] = df.apply(lambda row:1 if row["JobRole"] == "Laboratory Technician" \
                         or row["JobRole"] == "Research Scientist" else 0, axis=1)
df["MaritalStatus"] = df.apply(lambda row:1 if row["MaritalStatus"] == "Married" else 0, axis=1)
df["OverTime"] = df.apply(lambda row:1 if row["OverTime"] == "Yes" else 0, axis=1)
df["StockOptionLevel"] = df.apply(lambda row:1 if row["StockOptionLevel"] != 0 else 0, axis=1)

In [9]:
features = df[["Age", "BusinessTravel", "Department_Sales", "Department_RD", \
               "DistanceFromHome", "Education", "EducationField", "EnvironmentSatisfaction", \
               "Gender", "JobInvolvement", "JobLevel", "JobRole", "JobSatisfaction", \
               "MaritalStatus", "MonthlyIncome", "NumCompaniesWorked", "OverTime", \
               "PercentSalaryHike", "PerformanceRating", "RelationshipSatisfaction", \
               "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear", "WorkLifeBalance", \
               "YearsAtCompany", "YearsInCurrentRole", "YearsSinceLastPromotion", \
               "YearsWithCurrManager"]]
attrition = df["Attrition"]
print(features.head(3))
print(attrition.iloc[:3])

   Age  BusinessTravel  Department_Sales  Department_RD  DistanceFromHome  \
0   41               0                 1              0                 1   
1   49               1                 0              1                 8   
2   37               0                 0              1                 2   

   Education  EducationField  EnvironmentSatisfaction  Gender  JobInvolvement  \
0          0               1                        2       0               1   
1          0               1                        3       1               0   
2          0               0                        4       1               0   

   ...  PerformanceRating  RelationshipSatisfaction  StockOptionLevel  \
0  ...                  3                         1                 0   
1  ...                  4                         4                 1   
2  ...                  3                         2                 0   

   TotalWorkingYears  TrainingTimesLastYear  WorkLifeBalance  YearsAtComp

In [10]:
features_train, features_test, attrition_train, attrition_test = train_test_split(
    features, attrition, train_size = 0.8, test_size = 0.2, random_state = 6)

scaler = StandardScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)

model = LogisticRegression()
model.fit(features_train, attrition_train)

print(model.coef_)
print("Score on training data: {:.4f}".format(model.score(features_train, attrition_train)))
print("Score on testing data: {:.4f}".format(model.score(features_test, attrition_test)))

[[-0.30157384  0.3909272   0.1715615  -0.51722404  0.29784888  0.04037221
  -0.10193076 -0.46091497  0.07950266 -0.27880144  0.73481177  0.4646925
  -0.36736596  0.04953749 -0.80740168  0.46660579  0.86897691 -0.04828101
  -0.02460812 -0.25525894 -0.5968087  -0.77609821 -0.24614585 -0.21273811
   0.75640439 -0.56080853  0.43735691 -0.43187875]]
Score on training data: 0.8920
Score on testing data: 0.8741


In [11]:
Alan = np.array([24.0, 0.0, 0.0, 1.0, 10.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 4.0, 0.0, 2500.0, \
                 1.0, 1.0, 0.0, 2.0, 2.0, 0.0, 2.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0])
Ben = np.array([49.0, 1.0, 1.0, 0.0, 15.0, 1.0, 0.0, 4.0, 1.0, 3.0, 1.0, 1.0, 3.0, 0.0, 3500.0, \
                3.0, 1.0, 5.0, 2.0, 4.0, 0.0, 25.0, 2.0, 2.0, 15.0, 15.0, 8.0, 3.0])
Chloe = np.array([60.0, 0.0, 0.0, 0.0, 8.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 4.0, 1.0, 4800.0, \
                  2.0, 1.0, 1.0, 3.0, 4.0, 0.0, 37.0, 3.0, 4.0, 30.0, 20.0, 10.0, 20.0])
Denise = np.array([40.0, 1.0, 1.0, 0.0, 12.0, 1.0, 0.0, 4.0, 0.0, 1.0, 1.0, 0.0, 4.0, 1.0, 8000.0, \
                   0.0, 0.0, 10.0, 4.0, 3.0, 1.0, 18.0, 2.0, 2.0, 18.0, 9.0, 7.0, 3.0])
sample_employees = np.array([Alan, Ben, Chloe, Denise])
sample_employees = scaler.fit_transform(sample_employees)
print(model.predict(sample_employees))
print(model.predict_proba(sample_employees))

[1 1 0 0]
[[0.41917275 0.58082725]
 [0.2470677  0.7529323 ]
 [0.99821686 0.00178314]
 [0.99642046 0.00357954]]
