# Обучение модели

In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve, accuracy_score
import matplotlib.pyplot as plt


In [35]:
# Load the labelled training data; the relative path is resolved against the
# notebook's current working directory.
train_df = pd.read_csv(filepath_or_buffer='train.csv')

In [36]:
# Preview the first five raw rows (text columns are still strings here).
train_df.head(n=5)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2013,Pune,3,27,Male,No,5,0
1,Bachelors,2016,Pune,2,24,Female,No,2,1
2,Bachelors,2013,Pune,3,32,Male,No,2,0
3,PHD,2016,Bangalore,3,30,Female,No,5,0
4,Bachelors,2013,Bangalore,3,25,Male,No,3,0


In [37]:
# Replace every object-dtype (text) column of train_df with integer codes.
# A single LabelEncoder instance is refit for each column, so after the loop
# it only remembers the mapping of the last column it encoded.
label_encoder = LabelEncoder()

object_columns = [
    name for name in train_df.columns
    if train_df[name].dtype == 'object'
]

for name in object_columns:
    # Overwrites the column in place: the original strings are gone afterwards.
    encoded_values = label_encoder.fit_transform(train_df[name])
    train_df[name] = encoded_values

In [38]:
# Preview again to check that the text columns now hold integer codes.
train_df.head(n=5)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,0,2013,2,3,27,1,0,5,0
1,0,2016,2,2,24,0,0,2,1
2,0,2013,2,3,32,1,0,2,0
3,2,2016,0,3,30,0,0,5,0
4,0,2013,0,3,25,1,0,3,0


In [39]:
# Features are every column except the target; the target is LeaveOrNot.
# NOTE(review): the 'Unnamed: 0' column shown by train_df.head() stays in X.
# It looks like a saved row index -- confirm whether it should be a feature.
features_df = train_df.drop(columns=['LeaveOrNot'])
X = features_df.to_numpy()
y = train_df['LeaveOrNot'].to_numpy()

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [40]:
# Small, constrained tree: at most 12 leaves, each leaf must hold at least
# 33 training samples; the fixed seed makes the fit reproducible.
tree = DecisionTreeClassifier(
    max_leaf_nodes=12,
    min_samples_leaf=33,
    random_state=42,
)

In [41]:
# Fit on the 80% training split; the fitted estimator is the cell's output.
tree.fit(X=X_train, y=y_train)

# Submission

In [42]:
# Load the unlabelled rows to predict for the submission file.
test_df = pd.read_csv(filepath_or_buffer='test.csv')

In [43]:
# Encode the text columns of test_df with the SAME label -> integer mapping
# the training data received.
#
# Calling fit_transform on the test column would rebuild the mapping from the
# test file's own set of categories; if a category is missing (or extra) there,
# the integer codes shift and no longer mean what the tree learned.
#
# train_df was overwritten with integer codes in the training section, so the
# original category strings are re-read from 'train.csv' to fit the encoders.
# LabelEncoder assigns codes by sorted unique value, so fitting on the raw
# training column reproduces exactly the codes used for training.
train_categories_df = pd.read_csv('train.csv')

for column in test_df.columns:
    if test_df[column].dtype == 'object':
        column_encoder = LabelEncoder().fit(train_categories_df[column])
        # transform() raises ValueError for a category never seen in training,
        # instead of silently assigning it a misleading code.
        test_df[column] = column_encoder.transform(test_df[column])

In [44]:
# The tree was fit on a bare NumPy array, so the test frame is converted the
# same way before predicting a class for every test row.
test_features = test_df.to_numpy()
pred = tree.predict(test_features)

In [45]:
# Build the submission table: ID is the 0-based position of each prediction,
# LeaveOrNot is the predicted class. Written without the DataFrame index.
row_ids = range(len(pred))
d = pd.DataFrame({'ID': row_ids, 'LeaveOrNot': pred})
d.to_csv('submission.csv', index=False)

In [None]:
# Evaluate the model on four metrics, using the held-out X_test / y_test split.

# Hard class predictions feed accuracy, precision and recall.
y_pred_decision_tree = tree.predict(X_test)
# ROC AUC needs a score instead: take the second column of predict_proba,
# i.e. the probability of the second class in tree.classes_.
y_prob_decision_tree = tree.predict_proba(X_test)[:, 1]

accuracy_decision_tree = accuracy_score(y_true=y_test, y_pred=y_pred_decision_tree)
precision_decision_tree = precision_score(y_true=y_test, y_pred=y_pred_decision_tree)
recall_decision_tree = recall_score(y_true=y_test, y_pred=y_pred_decision_tree)
roc_auc_decision_tree = roc_auc_score(y_true=y_test, y_score=y_prob_decision_tree)

print(f'Accuracy of Decision Tree: {accuracy_decision_tree}')
print(f'ROC AUC of Decision Tree: {roc_auc_decision_tree}')
print(f'Precision of Decision Tree: {precision_decision_tree}')
print(f'Recall of Decision Tree: {recall_decision_tree}')




Accuracy of Decision Tree: 0.832378223495702
ROC AUC of Decision Tree: 0.8506868631732168
Precision of Decision Tree: 0.9072847682119205
Recall of Decision Tree: 0.5708333333333333
