In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score

In [2]:
df1 = pd.read_csv('https://raw.githubusercontent.com/BasiaSDA/projekt-klasyfikacja/main/df1.csv')
df2 = pd.read_csv('https://raw.githubusercontent.com/BasiaSDA/projekt-klasyfikacja/main/df2.csv')
df=df1.merge(df2)

In [3]:
df.drop(['Over18', 'StandardHours', 'EmployeeCount'], axis=1, inplace = True)
df = df[(df['TotalWorkingYears']<60) & (df['Age']<80) & (df['DistanceFromHome']<100) & (df['YearsAtCompany']<60) & (df['YearsInCurrentRole']<60) & (df['YearsSinceLastPromotion']<60) & (df['YearsWithCurrManager']<60)]
df = df.dropna()
df = df.reset_index(drop=True)
df = df.iloc[:, 1:].drop_duplicates().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1755 entries, 0 to 1754
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1755 non-null   float64
 1   BusinessTravel            1755 non-null   object 
 2   DailyRate                 1755 non-null   float64
 3   Department                1755 non-null   object 
 4   DistanceFromHome          1755 non-null   float64
 5   Education                 1755 non-null   float64
 6   EducationField            1755 non-null   object 
 7   EnvironmentSatisfaction   1755 non-null   float64
 8   Gender                    1755 non-null   object 
 9   HourlyRate                1755 non-null   float64
 10  JobInvolvement            1755 non-null   float64
 11  JobLevel                  1755 non-null   float64
 12  JobRole                   1755 non-null   object 
 13  JobSatisfaction           1755 non-null   float64
 14  MaritalS

In [6]:
df_copy = df[['Age', 'DistanceFromHome', 'EducationField',  'EnvironmentSatisfaction', 'JobInvolvement',  'JobLevel', 
                  'JobSatisfaction','MonthlyIncome', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike', 'StockOptionLevel',
                 'YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager', 'Attrition']]
df_copy = pd.get_dummies(df_copy, columns = ['EducationField', 'OverTime', 'Attrition'])
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1755 entries, 0 to 1754
Data columns (total 23 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              1755 non-null   float64
 1   DistanceFromHome                 1755 non-null   float64
 2   EnvironmentSatisfaction          1755 non-null   float64
 3   JobInvolvement                   1755 non-null   float64
 4   JobLevel                         1755 non-null   float64
 5   JobSatisfaction                  1755 non-null   float64
 6   MonthlyIncome                    1755 non-null   float64
 7   NumCompaniesWorked               1755 non-null   float64
 8   PercentSalaryHike                1755 non-null   float64
 9   StockOptionLevel                 1755 non-null   float64
 10  YearsAtCompany                   1755 non-null   float64
 11  YearsInCurrentRole               1755 non-null   float64
 12  YearsWithCurrManager

In [8]:
# Utworzenie zbioru testowego i treningowego
y = df_copy['Attrition_Yes'].values
df_copy1=df_copy.drop(['Attrition_Yes','Attrition_No'], axis=1)
X = df_copy1.values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

In [9]:
def calculate_metrics (X_train, X_test, y_train, y_test, model):
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)

    F1_train = f1_score(y_train, y_pred_train)
    F1_test = f1_score(y_test, y_pred_test)

    recall_train = recall_score(y_train, y_pred_train)
    recall_test = recall_score(y_test, y_pred_test)
    wyniki = pd.DataFrame({'Accuracy':[acc_train, acc_test],
                      'F1':[F1_train, F1_test],
                      'Recall':[recall_train, recall_test]})
    wyniki.index=['train', 'test']
    return  wyniki

In [10]:
#regresja logistyczna
lr = LogisticRegression(max_iter=1000, random_state=30)
lr.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, lr))

#decision tree
clf = DecisionTreeClassifier(min_samples_split=4)
clf.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, clf))

#knn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='chebyshev')
knn.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, knn))

#svm
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train,y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, svm))

       Accuracy        F1    Recall
train  0.673789  0.041841  0.021598
test   0.666667  0.048780  0.025000
       Accuracy        F1    Recall
train  0.824786  0.652542  0.498920
test   0.447293  0.170940  0.166667
       Accuracy        F1    Recall
train  0.695157  0.372434  0.274298
test   0.555556  0.161290  0.125000
       Accuracy   F1  Recall
train  0.670228  0.0     0.0
test   0.658120  0.0     0.0


In [11]:
# Kroswalidacja stratyfikowana
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=30, n_repeats=20, random_state=42)
for train_index, test_index in cv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
#regresja logistyczna
lr = LogisticRegression(max_iter=100000, random_state=42)
lr.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, lr))

#decision tree
clf = DecisionTreeClassifier(min_samples_split=4)
clf.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, clf))

#knn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='chebyshev')
knn.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, knn))

#svm
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train,y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, svm))

       Accuracy        F1    Recall
train  0.677077  0.118971  0.065603
test   0.689655  0.100000  0.052632
       Accuracy        F1    Recall
train  0.783736  0.531290  0.368794
test   0.500000  0.121212  0.105263
       Accuracy        F1    Recall
train  0.694755  0.314815  0.210993
test   0.534483  0.000000  0.000000
       Accuracy   F1  Recall
train  0.667649  0.0     0.0
test   0.672414  0.0     0.0


In [12]:
# K-fold
from sklearn.model_selection import KFold
kf = KFold(n_splits=20, shuffle=True, random_state=42)
y = df_copy['Attrition_Yes']
X = df_copy.drop(['Attrition_Yes','Attrition_No'], axis=1)
for train, test in kf.split(X, y):
    X_train, y_train = X.iloc[train], y.iloc[train]
    X_test, y_test = X.iloc[test], y.iloc[test]

#regresja logistyczna
lr = LogisticRegression(max_iter=100000, random_state=42)
lr.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, lr))

#decision tree
clf = DecisionTreeClassifier(min_samples_split=4)
clf.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, clf))

#knn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='chebyshev')
knn.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, knn))

#svm
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train,y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, svm))

       Accuracy        F1    Recall
train  0.671463  0.061644  0.032491
test   0.689655  0.129032  0.068966
       Accuracy        F1    Recall
train  0.793765  0.567839  0.407942
test   0.436782  0.109091  0.103448
       Accuracy        F1    Recall
train  0.693645  0.324967  0.222022
test   0.540230  0.047619  0.034483
       Accuracy   F1  Recall
train  0.667866  0.0     0.0
test   0.666667  0.0     0.0


In [13]:
df_copya = df[["Age", 'JobLevel', 'MonthlyIncome', 'OverTime', 'StockOptionLevel', 'Attrition']]
df_copya = pd.get_dummies(df_copya, columns = ['OverTime', 'Attrition'])
df_copya.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1755 entries, 0 to 1754
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               1755 non-null   float64
 1   JobLevel          1755 non-null   float64
 2   MonthlyIncome     1755 non-null   float64
 3   StockOptionLevel  1755 non-null   float64
 4   OverTime_No       1755 non-null   uint8  
 5   OverTime_Yes      1755 non-null   uint8  
 6   Attrition_No      1755 non-null   uint8  
 7   Attrition_Yes     1755 non-null   uint8  
dtypes: float64(4), uint8(4)
memory usage: 61.8 KB


In [14]:
# Utworzenie zbioru testowego i treningowego
y = df_copya['Attrition_Yes'].values
df_copy1=df_copya.drop(['Attrition_Yes','Attrition_No'], axis=1)
X = df_copy1.values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

In [15]:
#regresja logistyczna
lr = LogisticRegression(max_iter=1000, random_state=30)
lr.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, lr))

#decision tree
clf = DecisionTreeClassifier(min_samples_split=4)
clf.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, clf))

#knn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='chebyshev')
knn.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, knn))

#svm
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train,y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, svm))

       Accuracy   F1  Recall
train  0.670228  0.0     0.0
test   0.658120  0.0     0.0
       Accuracy        F1    Recall
train  0.820513  0.652893  0.511879
test   0.467236  0.183406  0.175000
       Accuracy        F1    Recall
train  0.688034  0.352071  0.257019
test   0.572650  0.175824  0.133333
       Accuracy   F1  Recall
train  0.670228  0.0     0.0
test   0.658120  0.0     0.0


In [16]:
# Kroswalidacja stratyfikowana
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=30, n_repeats=20, random_state=42)
for train_index, test_index in cv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
#regresja logistyczna
lr = LogisticRegression(max_iter=100000, random_state=42)
lr.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, lr))

#decision tree
clf = DecisionTreeClassifier(min_samples_split=4)
clf.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, clf))

#knn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='chebyshev')
knn.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, knn))

#svm
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train,y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, svm))

       Accuracy       F1    Recall
train  0.668238  0.00354  0.001773
test   0.672414  0.00000  0.000000
       Accuracy        F1    Recall
train  0.782557  0.543881  0.390071
test   0.500000  0.000000  0.000000
       Accuracy        F1    Recall
train  0.696523  0.348925  0.244681
test   0.568966  0.074074  0.052632
       Accuracy   F1  Recall
train  0.667649  0.0     0.0
test   0.672414  0.0     0.0


In [17]:
# K-fold
from sklearn.model_selection import KFold
kf = KFold(n_splits=20, shuffle=True, random_state=42)
y = df_copy['Attrition_Yes']
X = df_copy.drop(['Attrition_Yes','Attrition_No'], axis=1)
for train, test in kf.split(X, y):
    X_train, y_train = X.iloc[train], y.iloc[train]
    X_test, y_test = X.iloc[test], y.iloc[test]

#regresja logistyczna
lr = LogisticRegression(max_iter=100000, random_state=42)
lr.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, lr))

#decision tree
clf = DecisionTreeClassifier(min_samples_split=4)
clf.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, clf))

#knn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='chebyshev')
knn.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, knn))

#svm
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train,y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, svm))

       Accuracy        F1    Recall
train  0.671463  0.061644  0.032491
test   0.689655  0.129032  0.068966
       Accuracy        F1    Recall
train  0.793765  0.567839  0.407942
test   0.436782  0.109091  0.103448
       Accuracy        F1    Recall
train  0.693645  0.324967  0.222022
test   0.540230  0.047619  0.034483
       Accuracy   F1  Recall
train  0.667866  0.0     0.0
test   0.666667  0.0     0.0


In [18]:
df_copyb = df[['JobLevel', 'OverTime', 'StockOptionLevel', 'Attrition']]
df_copyb = pd.get_dummies(df_copyb, columns = ['OverTime', 'Attrition'])
df_copyb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1755 entries, 0 to 1754
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   JobLevel          1755 non-null   float64
 1   StockOptionLevel  1755 non-null   float64
 2   OverTime_No       1755 non-null   uint8  
 3   OverTime_Yes      1755 non-null   uint8  
 4   Attrition_No      1755 non-null   uint8  
 5   Attrition_Yes     1755 non-null   uint8  
dtypes: float64(2), uint8(4)
memory usage: 34.4 KB


In [19]:
# Utworzenie zbioru testowego i treningowego
y = df_copya['Attrition_Yes'].values
df_copy1=df_copya.drop(['Attrition_Yes','Attrition_No'], axis=1)
X = df_copy1.values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

In [20]:
#regresja logistyczna
lr = LogisticRegression(max_iter=1000, random_state=30)
lr.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, lr))

#decision tree
clf = DecisionTreeClassifier(min_samples_split=4)
clf.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, clf))

#knn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='chebyshev')
knn.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, knn))

#svm
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train,y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, svm))

       Accuracy   F1  Recall
train  0.670228  0.0     0.0
test   0.658120  0.0     0.0
       Accuracy        F1    Recall
train  0.819801  0.654843  0.518359
test   0.455840  0.173160  0.166667
       Accuracy        F1    Recall
train  0.688034  0.352071  0.257019
test   0.572650  0.175824  0.133333
       Accuracy   F1  Recall
train  0.670228  0.0     0.0
test   0.658120  0.0     0.0


In [21]:
# Kroswalidacja stratyfikowana
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=30, n_repeats=20, random_state=42)
for train_index, test_index in cv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
#regresja logistyczna
lr = LogisticRegression(max_iter=100000, random_state=42)
lr.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, lr))

#decision tree
clf = DecisionTreeClassifier(min_samples_split=4)
clf.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, clf))

#knn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='chebyshev')
knn.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, knn))

#svm
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train,y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, svm))

       Accuracy       F1    Recall
train  0.668238  0.00354  0.001773
test   0.672414  0.00000  0.000000
       Accuracy        F1    Recall
train  0.782557  0.543881  0.390071
test   0.465517  0.000000  0.000000
       Accuracy        F1    Recall
train  0.696523  0.348925  0.244681
test   0.568966  0.074074  0.052632
       Accuracy   F1  Recall
train  0.667649  0.0     0.0
test   0.672414  0.0     0.0


In [22]:
# K-fold
from sklearn.model_selection import KFold
kf = KFold(n_splits=20, shuffle=True, random_state=42)
y = df_copy['Attrition_Yes']
X = df_copy.drop(['Attrition_Yes','Attrition_No'], axis=1)
for train, test in kf.split(X, y):
    X_train, y_train = X.iloc[train], y.iloc[train]
    X_test, y_test = X.iloc[test], y.iloc[test]

#regresja logistyczna
lr = LogisticRegression(max_iter=100000, random_state=42)
lr.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, lr))

#decision tree
clf = DecisionTreeClassifier(min_samples_split=4)
clf.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, clf))

#knn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='chebyshev')
knn.fit(X_train, y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, knn))

#svm
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train,y_train)

print(calculate_metrics (X_train, X_test, y_train, y_test, svm))

       Accuracy        F1    Recall
train  0.671463  0.061644  0.032491
test   0.689655  0.129032  0.068966
       Accuracy        F1    Recall
train  0.793765  0.566751  0.406137
test   0.436782  0.109091  0.103448
       Accuracy        F1    Recall
train  0.693645  0.324967  0.222022
test   0.540230  0.047619  0.034483
       Accuracy   F1  Recall
train  0.667866  0.0     0.0
test   0.666667  0.0     0.0
