**|RUS|** Данные взяты из 
>https://www.kaggle.com/uciml/student-alcohol-consumption

Здесь используется метод деревьев решений для классификации сдал/не сдал.

----

**|ENG|** Data from 
>https://www.kaggle.com/uciml/student-alcohol-consumption

Here we use decision trees to classify each student as passed / not passed.

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier


In [0]:
from matplotlib import rcParams

# Default size for every figure drawn in this notebook: (width, height) = (12, 8).
rcParams.update({'figure.figsize': (12, 8)})

In [0]:
# One file is used for training, the other one for testing:
# 'student-por.csv' -> training frame, 'student-mat.csv' -> test frame.
studentData_train, studentData_test = (
    pd.read_csv(csv_name) for csv_name in ('student-por.csv', 'student-mat.csv')
)

In [4]:
studentData_train

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,MS,F,19,R,GT3,T,2,3,services,other,course,mother,1,3,1,no,no,no,yes,no,yes,yes,no,5,4,2,1,2,5,4,10,11,10
645,MS,F,18,U,LE3,T,3,1,teacher,services,course,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,4,3,4,1,1,1,4,15,15,16
646,MS,F,18,U,GT3,T,1,1,other,other,course,mother,2,2,0,no,no,no,yes,yes,yes,no,no,1,1,1,1,1,5,6,11,12,9
647,MS,M,17,U,LE3,T,3,1,services,services,course,mother,2,1,0,no,no,no,no,no,yes,yes,no,2,4,5,3,4,2,6,10,10,10


In [5]:
# Many features are stored as strings, and the answer depends on G1, G2 and G3,
# so the training frame is transformed in three steps.

# 1. Integer-encode every string (object-dtype) column. The encoder is refit
#    for each column, so the codes are only meaningful within that column.
label_encoder = LabelEncoder()
categorical_columns_train = studentData_train.columns[studentData_train.dtypes == 'object']
for column in categorical_columns_train:
    studentData_train[column] = label_encoder.fit_transform(studentData_train[column])

# 2. Build the binary target. Each student can score 60 points in total
#    (G1 + G2 + G3). The pass boundary is (G1 + G2 + G3) / 60 >= 0.6:
#    1 means passed, 0 means not passed.
#    The comparison runs on the whole column at once, so it does not depend on
#    the frame's index labels; the result keeps the integer dtype of G3.
sumGi_train = (studentData_train.G3 + studentData_train.G2 + studentData_train.G1) / 60
pass_exam_train = (sumGi_train >= 0.6).values.astype(studentData_train.G3.dtype)

# 3. Columns 'school', 'address', 'G1', 'G2', 'G3' are not needed as features
#    (the target above is computed directly from G1, G2 and G3).
studentData_train.drop(columns=['school', 'address', 'G1', 'G2', 'G3'], inplace=True)

# Assign the plain array so the labels are attached row by row (positionally).
studentData_train['Pass'] = pass_exam_train

studentData_train

Unnamed: 0,sex,age,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,Pass
0,0,18,0,0,4,4,0,4,0,1,2,2,0,1,0,0,0,1,1,0,0,4,3,4,1,1,3,4,0
1,0,17,0,1,1,1,0,2,0,0,1,2,0,0,1,0,0,0,1,1,0,5,3,3,1,1,3,2,0
2,0,15,1,1,1,1,0,2,2,1,1,2,0,1,0,0,0,1,1,1,0,4,3,2,2,3,3,6,1
3,0,15,0,1,4,2,1,3,1,1,1,3,0,0,1,0,1,1,1,1,1,3,2,2,1,1,5,0,1
4,0,16,0,1,3,3,2,2,1,0,1,2,0,0,1,0,0,1,1,0,0,4,3,2,1,2,5,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,0,19,0,1,2,3,3,2,0,1,1,3,1,0,0,0,1,0,1,1,0,5,4,2,1,2,5,4,0
645,0,18,1,1,3,1,4,3,0,1,1,2,0,0,1,0,0,1,1,1,0,4,3,4,1,1,1,4,1
646,0,18,0,1,1,1,2,2,0,1,2,2,0,0,0,0,1,1,1,0,0,1,1,1,1,1,5,6,0
647,1,17,1,1,3,1,3,3,0,1,2,1,0,0,0,0,0,0,1,1,0,2,4,5,3,4,2,6,0


In [6]:
# Apply the same transformations to the test sample.

# Integer-encode every string (object-dtype) column with an encoder created in
# this cell, so the cell does not depend on an encoder left over from elsewhere.
# NOTE(review): the encoder is refit on the test data, so the integer codes
# agree with the training frame only if each column contains the same set of
# categories in both files -- confirm this for the data at hand.
label_encoder = LabelEncoder()
categorical_columns_test = studentData_test.columns[studentData_test.dtypes == 'object']
for column in categorical_columns_test:
    studentData_test[column] = label_encoder.fit_transform(studentData_test[column])

# Binary target: 1 (passed) when (G1 + G2 + G3) / 60 >= 0.6, otherwise 0.
# Computed on the whole column at once; the result keeps the dtype of G3.
sumGi_test = (studentData_test.G3 + studentData_test.G2 + studentData_test.G1) / 60
pass_exam_test = (sumGi_test >= 0.6).values.astype(studentData_test.G3.dtype)

# Columns 'school', 'address', 'G1', 'G2', 'G3' are not needed as features
# (the target above is computed directly from G1, G2 and G3).
studentData_test.drop(columns=['school', 'address', 'G1', 'G2', 'G3'], inplace=True)

# Assign the plain array so the labels are attached row by row (positionally).
studentData_test['Pass'] = pass_exam_test

studentData_test

Unnamed: 0,sex,age,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,Pass
0,0,18,0,0,4,4,0,4,0,1,2,2,0,1,0,0,0,1,1,0,0,4,3,4,1,1,3,6,0
1,0,17,0,1,1,1,0,2,0,0,1,2,0,0,1,0,0,0,1,1,0,5,3,3,1,1,3,4,0
2,0,15,1,1,1,1,0,2,2,1,1,2,3,1,0,1,0,1,1,1,0,4,3,2,2,3,3,10,0
3,0,15,0,1,4,2,1,3,1,1,1,3,0,0,1,1,1,1,1,1,1,3,2,2,1,1,5,2,1
4,0,16,0,1,3,3,2,2,1,0,1,2,0,0,1,1,0,1,1,0,0,4,3,2,1,2,5,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,1,20,1,0,2,2,3,3,0,2,1,2,2,0,1,1,0,1,1,0,0,5,5,4,4,5,4,11,0
391,1,17,1,1,3,1,3,3,0,1,2,1,0,0,0,0,0,0,1,1,0,2,4,5,3,4,2,3,1
392,1,21,0,1,1,1,2,2,0,2,1,1,3,0,0,0,0,0,1,0,0,5,5,3,3,3,3,3,0
393,1,18,1,1,3,2,3,2,0,1,3,1,0,0,0,0,0,0,1,1,0,4,4,1,3,4,5,0,0


In [0]:
# Prepare the inputs for training the model: in each frame the last column
# holds the label and all preceding columns form the feature matrix.
train_feature_matrix = studentData_train.iloc[:, :-1].values
train_labels = studentData_train.iloc[:, -1].values
test_feature_matrix = studentData_test.iloc[:, :-1].values
test_labels = studentData_test.iloc[:, -1].values

In [8]:
# Fit one decision tree on the training set, then report its accuracy on the
# test set (the last expression is the cell's displayed value).
clf_tree_best = DecisionTreeClassifier(
    max_depth=12,
    max_features=24,
    criterion='entropy',
    random_state=17,
).fit(train_feature_matrix, train_labels)
y_pred = clf_tree_best.predict(test_feature_matrix)
accuracy_score(y_true=test_labels, y_pred=y_pred)

0.6911392405063291

In [9]:
# Same tree configuration, scored two ways: the mean of a 5-fold
# cross-validation on the training set, and the accuracy on the test set.
clf_tree_best = DecisionTreeClassifier(
    max_depth=12,
    max_features=24,
    criterion='entropy',
    random_state=17,
)
tree_cv_score = cross_val_score(
    clf_tree_best, train_feature_matrix, train_labels, cv=5
).mean()

# Fit on the full training set before predicting the test set.
y_pred_tree = clf_tree_best.fit(train_feature_matrix, train_labels).predict(test_feature_matrix)
tree_holdout_score = accuracy_score(y_true=test_labels, y_pred=y_pred_tree)
print('Decision tree. CV: {}, holdout: {}'.format(tree_cv_score, tree_holdout_score))

Decision tree. CV: 0.6147883124627311, holdout: 0.6911392405063291


In [10]:
# Random forest of 20 entropy-based trees, evaluated on the test set.
# random_state is pinned (same seed as the decision-tree cells) so that the
# bootstrap samples and per-split feature subsets -- and therefore the
# reported accuracy -- are reproducible from run to run.
rnd_forest = RandomForestClassifier(n_estimators=20,
                                    criterion='entropy',
                                    bootstrap=True,
                                    max_features=20,
                                    max_depth=12,
                                    random_state=17)

rnd_forest.fit(train_feature_matrix, train_labels)
y_pred_forest = rnd_forest.predict(test_feature_matrix)
accuracy_score(test_labels, y_pred_forest)

0.6911392405063291

In [0]:
# Exhaustive grid search over random-forest hyper-parameters, scored by
# 5-fold cross-validated accuracy on the training set.
# random_state is pinned (same seed as the decision-tree cells) so that every
# candidate is evaluated with the same randomness and best_params_ does not
# change from run to run.
rnd_forest_search = RandomForestClassifier(criterion='entropy', bootstrap=True,
                                           random_state=17)

# 20 * 23 * 19 candidate combinations, each fitted once per fold.
param_grid = {
    'n_estimators': np.arange(1, 101, 5),
    'max_features': np.arange(1, 24),
    'max_depth': np.arange(1, 20),
}

# n_jobs=-1 runs the fits in parallel; refit=True retrains the best
# configuration on the whole training set after the search.
search = GridSearchCV(rnd_forest_search, param_grid, n_jobs=-1, cv=5, refit=True, scoring='accuracy')

search.fit(train_feature_matrix, train_labels)

print(search.best_params_)