# Wybór pojedynczch najlepszych cech dla Decision Tree Clasifier

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import randint as sp_randint
from sklearn import model_selection
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier


## Załadowanie danych

In [7]:
full_data_vector = pd.read_csv('../../learning_vectors/v05/version5-complete.csv', sep=',')

print(full_data_vector.shape)
#full_data_vector.head()


(6080, 62)


In [8]:
all_features = [
    # 'Match_id', 'League_id', 'Season', 'Stage', 'Date', 'H_team', 'A_team', 'Result', 
    'H_Speed', 'H_Pass', 'H_Shoot', 'H_Pressure', 'H_chPass', 'H_chCross', 'H_dAggr', 'H_dWidth',
    'A_Speed', 'A_Pass', 'A_Shoot', 'A_Pressure', 'A_chPass', 'A_chCross', 'A_dAggr', 'A_dWidth',
    'H_age', 'A_age', 'H_TMV', 'A_TMV',
    # 'FTHG', 'FTAG', 'HS', 'AS', 'HST', 'AST', 
    'H_Form03', 'A_Form03', 'H_Form05', 'A_Form05',
    'H_MeanShots03', 'A_MeanShots03', 'H_MeanShots05', 'A_MeanShots05',
    'H_MeanShotsOnTarget03', 'A_MeanShotsOnTarget03', 'H_MeanShotsOnTarget05', 'A_MeanShotsOnTarget05',
    'H_MeanFullTimeGoals03', 'A_MeanFullTimeGoals03', 'H_MeanFullTimeGoals05', 'A_MeanFullTimeGoals05',
    'H_WeightedMeanShots03', 'A_WeightedMeanShots03', 'H_WeightedMeanShots05', 'A_WeightedMeanShots05',
    'H_WeightedMeanShotsOnTarget03', 'A_WeightedMeanShotsOnTarget03',
    'H_WeightedMeanShotsOnTarget05', 'A_WeightedMeanShotsOnTarget05',
    'H_WeightedMeanFullTimeGoals03', 'A_WeightedMeanFullTimeGoals03',
    'H_WeightedMeanFullTimeGoals05', 'A_WeightedMeanFullTimeGoals05'
]

print(full_data_vector[all_features].shape)
full_data_vector[all_features].head()


(6080, 48)


Unnamed: 0,H_Speed,H_Pass,H_Shoot,H_Pressure,H_chPass,H_chCross,H_dAggr,H_dWidth,A_Speed,A_Pass,...,H_WeightedMeanShots05,A_WeightedMeanShots05,H_WeightedMeanShotsOnTarget03,A_WeightedMeanShotsOnTarget03,H_WeightedMeanShotsOnTarget05,A_WeightedMeanShotsOnTarget05,H_WeightedMeanFullTimeGoals03,A_WeightedMeanFullTimeGoals03,H_WeightedMeanFullTimeGoals05,A_WeightedMeanFullTimeGoals05
0,66,30,35,30,30,45,40,50,65,40,...,19.617647,11.823529,15.578947,7.368421,11.558824,5.058824,2.842105,1.315789,2.0,0.735294
1,60,70,55,35,70,70,70,35,50,35,...,8.0,17.882353,3.473684,11.473684,4.235294,9.529412,0.0,1.421053,0.558824,1.764706
2,58,30,50,30,31,70,70,30,70,70,...,13.558824,17.264706,8.052632,9.052632,8.058824,9.176471,1.473684,1.368421,1.029412,1.117647
3,60,65,45,40,60,70,70,40,55,70,...,13.088235,13.911765,7.631579,10.263158,6.941176,7.764706,1.947368,3.368421,1.882353,2.058824
4,70,70,50,30,70,70,70,30,70,50,...,13.058824,10.735294,9.631579,7.157895,7.558824,6.382353,2.421053,3.263158,1.941176,2.411765


In [9]:
# Utility function to report best scores

def report(results, n_top=5):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Rank {0}: Mean validation score: {1:.5f} (std: {2:.5f})"
                  .format(i,
                          results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Params: {0}".format(results['params'][candidate]))


## Szukanie wyników predykcji dla pojedynczych cech celem wyboru najlepszego zestawu.

In [10]:
paired_features = [
    ['H_Speed', 'A_Speed'],
    ['H_Pass', 'A_Pass'],
    ['A_Shoot', 'H_Shoot'],
    ['H_Pressure', 'A_Pressure'],
    ['H_chPass', 'A_chPass'],
    ['H_chCross', 'A_chCross'],
    ['H_dAggr', 'A_dAggr'],
    ['H_dWidth', 'A_dWidth'],
    ['H_age', 'A_age'],
    ['H_TMV', 'A_TMV'],
    ['H_Form03', 'A_Form03'],
    ['H_Form05', 'A_Form05'],
    ['H_MeanShots03', 'A_MeanShots03'],
    ['H_MeanShots05', 'A_MeanShots05'],
    ['H_MeanShotsOnTarget03', 'A_MeanShotsOnTarget03'],
    ['H_MeanShotsOnTarget05', 'A_MeanShotsOnTarget05'],
    ['H_MeanFullTimeGoals03', 'A_MeanFullTimeGoals03'],
    ['H_MeanFullTimeGoals05', 'A_MeanFullTimeGoals05'],
    ['H_WeightedMeanShots03', 'A_WeightedMeanShots03'],
    ['H_WeightedMeanShots05', 'A_WeightedMeanShots05'],
    ['H_WeightedMeanShotsOnTarget03', 'A_WeightedMeanShotsOnTarget03'],
    ['H_WeightedMeanShotsOnTarget05', 'A_WeightedMeanShotsOnTarget05'],
    ['H_WeightedMeanFullTimeGoals03', 'A_WeightedMeanFullTimeGoals03'],
    ['H_WeightedMeanFullTimeGoals05', 'A_WeightedMeanFullTimeGoals05']
]


In [12]:
classifier = DecisionTreeClassifier()

parameters_distribution = {"max_depth": sp_randint(3, 6),
              "min_samples_split": sp_randint(140, 300),
              #"min_samples_leaf": sp_randint(2, 20),
              "criterion": ["entropy", "gini"]}

iteration_number = 10
cross_validator = model_selection.KFold(n_splits=5, shuffle=True)

random_search = RandomizedSearchCV(classifier,
                                   param_distributions=parameters_distribution,
                                   n_iter=iteration_number,
                                   cv=cross_validator)

y = full_data_vector['Result']

for feature in paired_features:
    X = full_data_vector[feature]
    print("\nFeatures: {}, {}".format(feature[0], feature[1]))
    random_search.fit(X, y)
    print("{}".format(random_search.best_score_))




Features: H_Speed, A_Speed
0.466118421053

Features: H_Pass, A_Pass
0.486019736842

Features: A_Shoot, H_Shoot
0.477467105263

Features: H_Pressure, A_Pressure
0.483059210526

Features: H_chPass, A_chPass
0.475328947368

Features: H_chCross, A_chCross
0.477467105263

Features: H_dAggr, A_dAggr
0.48125

Features: H_dWidth, A_dWidth
0.480427631579

Features: H_age, A_age
0.464802631579

Features: H_TMV, A_TMV
0.534210526316

Features: H_Form03, A_Form03
0.476809210526

Features: H_Form05, A_Form05
0.498684210526

Features: H_MeanShots03, A_MeanShots03
0.521546052632

Features: H_MeanShots05, A_MeanShots05
0.515953947368

Features: H_MeanShotsOnTarget03, A_MeanShotsOnTarget03
0.510855263158

Features: H_MeanShotsOnTarget05, A_MeanShotsOnTarget05
0.516118421053

Features: H_MeanFullTimeGoals03, A_MeanFullTimeGoals03
0.493914473684

Features: H_MeanFullTimeGoals05, A_MeanFullTimeGoals05
0.500493421053

Features: H_WeightedMeanShots03, A_WeightedMeanShots03
0.519572368421

Features: H_Weigh