In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = sns.load_dataset('penguins')

In [3]:
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,FEMALE
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,MALE
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,


In [4]:
df.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
species              344 non-null object
island               344 non-null object
bill_length_mm       342 non-null float64
bill_depth_mm        342 non-null float64
flipper_length_mm    342 non-null float64
body_mass_g          342 non-null float64
sex                  333 non-null object
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [6]:
# Есть смысл выкинуть пустые строки, тк их количество ~ 4 % от всего набора данных

In [7]:
df.dropna(inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333 entries, 0 to 343
Data columns (total 7 columns):
species              333 non-null object
island               333 non-null object
bill_length_mm       333 non-null float64
bill_depth_mm        333 non-null float64
flipper_length_mm    333 non-null float64
body_mass_g          333 non-null float64
sex                  333 non-null object
dtypes: float64(4), object(3)
memory usage: 20.8+ KB


In [9]:
# получаем данные без пустых строк, поля со строками разобъем с помощью get_dummies

In [10]:
df = pd.concat([df, pd.get_dummies(df['island'])], axis=1)

In [11]:
df.drop('island', axis=1, inplace=True)

In [12]:
df = pd.concat([df, pd.get_dummies(df['sex'])], axis=1)

In [13]:
df.drop('sex', axis=1, inplace=True)

In [14]:
# Вытащим столбец с классами species

In [15]:
y = df.pop('species')
X = df

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333 entries, 0 to 343
Data columns (total 9 columns):
bill_length_mm       333 non-null float64
bill_depth_mm        333 non-null float64
flipper_length_mm    333 non-null float64
body_mass_g          333 non-null float64
Biscoe               333 non-null uint8
Dream                333 non-null uint8
Torgersen            333 non-null uint8
FEMALE               333 non-null uint8
MALE                 333 non-null uint8
dtypes: float64(4), uint8(5)
memory usage: 14.6 KB


In [17]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [18]:
dtc = DecisionTreeClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [19]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [20]:
# посмотрим на значимость параметров в нашем дереве

In [21]:
dtc.feature_importances_

array([0.35909883, 0.        , 0.51984089, 0.00231304, 0.        ,
       0.10254656, 0.        , 0.01620068, 0.        ])

In [22]:
y_pred = dtc.predict(X_test)

In [23]:
print("Количество неверно классифицированных объектов : %d из %d"
    % ((y_test != y_pred).sum(), X_test.shape[0]))

Количество неверно классифицированных объектов : 5 из 110


In [24]:
dtc.score(X_test, y_test)

0.9545454545454546

In [25]:
dtc.score(X_train, y_train)

1.0

In [26]:
np.mean(cross_val_score(dtc, X, y, cv=5))

0.9757081724245904

In [27]:
# будем перебирать кол-во элементов для ветвления и листа, и макс глубину дерева

In [28]:

dtc_params = {'min_samples_split' : [2,3,4,5,10,20],
              'min_samples_leaf' : [1,2,3,4,5,10,20],
              'max_depth' : [None,3,5,10,20]}

In [29]:
dtc_grid = GridSearchCV(dtc, dtc_params, cv=5)

In [30]:
dtc_grid.fit(X, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [2, 3, 4, 5, 10, 20], 'min_samples_leaf': [1, 2, 3, 4, 5, 10, 20], 'max_depth': [None, 3, 5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [31]:
print ('Значение гиперпараметра наилучшего классификатора: ', dtc_grid.best_params_,
       "\nКоличество неверно классифицированных объектов : %d из %d" % ((y != dtc_grid.best_estimator_.predict(X)).sum(), X.shape[0]),
       '\nАккуратность классификатора: ', dtc_grid.best_score_)

Значение гиперпараметра наилучшего классификатора:  {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2} 
Количество неверно классифицированных объектов : 0 из 333 
Аккуратность классификатора:  0.984984984984985


In [32]:
# Как мы видим наша модель ни разу не ошибласть и R^2 почти равно единице 0.985