In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

### Класифікація в scikit-learn

**Мета роботи:** Ознайомитись з побудовою моделей для вирішення задачі
класифікації в scikit-learn, оцінкою та способами налаштування цих моделей.

**Завдання до лабораторної роботи**
Написати програму, яка навчає та тестує модель, що виконує задачу
бінарної класифікації відповідно до варіанту, оцінити модель за допомогою
відповідних метрик та спробувати її покращити.

Варіант 3: employee.csv. Визначити за допомогою класифікації чи піде
працівник з роботи (останній стовпець), використовуючи будь-які
незалежні змінні.

In [2]:
df = pd.read_csv('employee.csv')
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary,left
0,0.38,0.53,2,157,3,0,0,sales,low,1
1,0.80,0.86,5,262,6,0,0,sales,medium,1
2,0.11,0.88,7,272,4,0,0,sales,medium,1
3,0.72,0.87,5,223,5,0,0,sales,low,1
4,0.37,0.52,2,159,3,0,0,sales,low,1
...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,0,support,low,1
14995,0.37,0.48,2,160,3,0,0,support,low,1
14996,0.37,0.53,2,143,3,0,0,support,low,1
14997,0.11,0.96,6,280,4,0,0,support,low,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   promotion_last_5years  14999 non-null  int64  
 7   department             14999 non-null  object 
 8   salary                 14999 non-null  object 
 9   left                   14999 non-null  int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [4]:
df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,left
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.021268,0.238083
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.144281,0.425924
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


In [5]:
df[(df['left'] == 0)].left.count()

11428

In [6]:
df[(df['left'] == 1)].left.count()

3571

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

Simple linear regression

In [8]:
le = preprocessing.LabelEncoder()
df['department'] = le.fit_transform(df['department'])
df['salary'] = le.fit_transform(df['salary'])
X = df.drop('left', axis=1)
y = df['left']
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size=0.25,
random_state=0,stratify=y)

In [9]:
lr_pipe = Pipeline([('scale', StandardScaler()),('lr',
LogisticRegression(class_weight='balanced', random_state=0))])
lr_pipe.fit(X_train, y_train)
predicted = lr_pipe.predict(X_test)

In [10]:
print(lr_pipe.score(X_test, y_test))
print(classification_report(y_test, predicted))

0.7546666666666667
              precision    recall  f1-score   support

           0       0.92      0.74      0.82      2857
           1       0.49      0.79      0.60       893

    accuracy                           0.75      3750
   macro avg       0.70      0.77      0.71      3750
weighted avg       0.82      0.75      0.77      3750



The coefficient of determination `R^2` is defined as `(1-u/v)`, where `u` is the residual sum of squares ((y_true - y_pred)** 2).sum() and `v` is the total sum of squares ((y_true - y_true.mean()) ** 2).sum(). 

Linear regression with grid

In [11]:
pipeline = Pipeline([('scale', MinMaxScaler()),('lr',
LogisticRegression(class_weight='balanced',random_state=0))])
search_space = {'lr__C': np.logspace(-1, 1, num=10),'lr__fit_intercept': [True,
False]}
lr_grid = GridSearchCV(pipeline, search_space, scoring='f1_macro',
cv=5).fit(X_train, y_train)

In [12]:
print(lr_grid.score(X_test, y_test))
print(classification_report(y_test, lr_grid.predict(X_test)))

0.7271165077321997
              precision    recall  f1-score   support

           0       0.93      0.76      0.83      2857
           1       0.51      0.80      0.62       893

    accuracy                           0.77      3750
   macro avg       0.72      0.78      0.73      3750
weighted avg       0.83      0.77      0.78      3750



Decision tree 

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [14]:
pipeline = Pipeline([('scale',
StandardScaler()),('dt',DecisionTreeClassifier(random_state=0))])
pipeline.fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=100, random_state=0)
search_space = {'max_depth': [5, 12],'min_samples_leaf': [5, 12]}
rf_grid = GridSearchCV(rf, search_space, cv=5, scoring='precision').fit(X_train,
y_train)
predicted = rf_grid.predict(X_test)

In [15]:
print(rf_grid.score(X_test, y_test))
print(classification_report(y_test, predicted))

0.9951573849878934
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2857
           1       1.00      0.92      0.96       893

    accuracy                           0.98      3750
   macro avg       0.99      0.96      0.97      3750
weighted avg       0.98      0.98      0.98      3750

