In [194]:
import os
import shutil

import pandas as pd
import pydotplus
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.preprocessing import OneHotEncoder

In [195]:
IMAGES_DIR = 'images'
shutil.rmtree(IMAGES_DIR, ignore_errors=True)
os.mkdir(IMAGES_DIR)

In [196]:
MARKS_DIR = 'marks'
MARKS_INFO = os.path.join(MARKS_DIR, 'marks-info.txt')
MARKS_DATA = os.path.join(MARKS_DIR, 'marks.csv')

In [197]:
with open(MARKS_INFO, 'r', encoding='cp1251') as f:
    info = f.read()
print(info)

Файл служит источником данных для решения задачи классификации путем построения дерева решений. Файл содержит анонимные данные об оценках школьников за письменную контрольную работу в реальной школе. Каждая строка содержит перечисленные через запятую следующие данные: 
пол ученика (PUPIL_SEX); 
класс ученика (PUPIL_CLASS); 
процент заданий контрольной работы, оцененных учителем как правильно выполненные (TEACHER_RIGHT); 
количество символов "птичка", проставленных учителем (TEACHER_CHK);
количество символов вопроса, проставленных учителем (TEACHER_QUEST);
количество исправлений, сделанных учителем (TEACHER_CORR);
количество исправлений, сделанных учеником (PUPIL_CORR);
количество фактов использования учеником штриха-замазки (PUPIL_STRIP);
итоговая оценка, выставленная учителем (FINALMARK).
Файл содержит 72 строки.


In [69]:
marks_data = pd.read_csv(MARKS_DATA)

In [107]:
marks_data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
PUPIL_SEX,F,F,F,M,M,M,F,F,F,F,...,F,F,F,F,F,F,F,F,F,F
PUPIL_CLASS,8A,8A,8A,8A,8A,8A,8A,8A,8A,8A,...,8A,8A,8A,8A,8A,8A,8A,8A,8A,8A
TEACHER_RIGHT,65,70,85,55,40,65,67,60,100,100,...,90,95,50,60,70,80,85,90,95,55
TEACHER_CHK,0,4,0,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
TEACHER_QUEST,4,0,0,0,2,3,0,2,0,0,...,0,1,2,0,0,0,0,0,0,0
TEACHER_CORR,2,4,4,1,0,4,1,0,2,2,...,0,2,6,4,2,2,1,0,3,0
PUPIL_CORR,1,0,3,8,3,4,3,5,0,3,...,1,1,0,5,1,1,2,1,1,3
PUPIL_STRIP,6,4,5,3,4,8,3,9,2,1,...,0,0,5,0,4,7,2,1,2,3
FINALMARK,4-,3,4,3,2,4-,4-,3,5-,4,...,4,5-,3,3,4-,4-,4,4,5-,3


In [117]:
marks_train, marks_test = marks_data.drop(['FINALMARK'], axis=1), marks_data['FINALMARK']

In [118]:
CAT_FEATURES = ['PUPIL_SEX', 'PUPIL_CLASS']

In [119]:
num_features = marks_train.drop(CAT_FEATURES, axis=1)

In [120]:
FEATURES = list(num_features.columns.values)
FEATURES

['TEACHER_RIGHT',
 'TEACHER_CHK',
 'TEACHER_QUEST',
 'TEACHER_CORR',
 'PUPIL_CORR',
 'PUPIL_STRIP']

In [121]:
marks_encoder = OneHotEncoder(handle_unknown='ignore')

In [122]:
cat_features = marks_encoder.fit_transform(marks_train[CAT_FEATURES])

In [125]:
num_features.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
TEACHER_RIGHT,65,70,85,55,40,65,67,60,100,100,...,90,95,50,60,70,80,85,90,95,55
TEACHER_CHK,0,4,0,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
TEACHER_QUEST,4,0,0,0,2,3,0,2,0,0,...,0,1,2,0,0,0,0,0,0,0
TEACHER_CORR,2,4,4,1,0,4,1,0,2,2,...,0,2,6,4,2,2,1,0,3,0
PUPIL_CORR,1,0,3,8,3,4,3,5,0,3,...,1,1,0,5,1,1,2,1,1,3
PUPIL_STRIP,6,4,5,3,4,8,3,9,2,1,...,0,0,5,0,4,7,2,1,2,3


In [128]:
cat_features = cat_features.toarray()

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

In [131]:
one_hot_features = {name: features for name, features in zip(marks_encoder.get_feature_names(), cat_features.T)}

In [133]:
one_hot_data = pd.DataFrame(one_hot_features)
one_hot_data.head()

Unnamed: 0,x0_F,x0_M,x1_8A,x1_8B
0,1.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0
4,0.0,1.0,1.0,0.0


In [138]:
mark_features = pd.concat([num_features, one_hot_data], axis=1, sort=False)
mark_features.head()

Unnamed: 0,TEACHER_RIGHT,TEACHER_CHK,TEACHER_QUEST,TEACHER_CORR,PUPIL_CORR,PUPIL_STRIP,x0_F,x0_M,x1_8A,x1_8B
0,65,0,4,2,1,6,1.0,0.0,1.0,0.0
1,70,4,0,4,0,4,1.0,0.0,1.0,0.0
2,85,0,0,4,3,5,1.0,0.0,1.0,0.0
3,55,0,0,1,8,3,0.0,1.0,1.0,0.0
4,40,1,2,0,3,4,0.0,1.0,1.0,0.0


In [139]:
feature_names = list(mark_features.columns.values)

**Разные варианты глубины**

In [188]:
clf_marks_tree = DecisionTreeClassifier(
    criterion='entropy', 
#     max_depth=3, 
    random_state=17,
)

In [189]:
clf_marks_tree.fit(mark_features, marks_test)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best')

In [190]:
def tree_graph_to_png(tree, feature_names, png_file_to_save, class_names=None):
    tree_str = export_graphviz(tree, feature_names=feature_names, 
                                       class_names=class_names,
                                     filled=True, out_file=None)
    graph = pydotplus.graph_from_dot_data(tree_str)  
    graph.write_png(png_file_to_save)

In [191]:
marks_image = os.path.join(IMAGES_DIR, 'marks.png')

In [192]:
tree_graph_to_png(tree=clf_marks_tree, 
                  feature_names=feature_names, 
                  class_names=clf_marks_tree.classes_, 
                  png_file_to_save=marks_image)

In [193]:
marks_image

'images/marks.png'

![](images/marks.png)