In [1]:
import math

## Calculating Entropy
#### Entropy: The negative of the sum of each probability times its logarithm (base e, base 2, and base 10 are the most common choices)

In [2]:
def entropy(probabilities, base=2):
    """Return the Shannon entropy of a discrete probability distribution.

    Args:
        probabilities: iterable of non-negative class probabilities.
        base: logarithm base (2 gives the result in bits).

    Returns:
        -sum(p * log(p, base)) over the probabilities, as a non-negative float.

    Raises:
        ValueError: if any probability is negative.
    """
    probs = list(probabilities)
    if any(p < 0 for p in probs):
        raise ValueError('probabilities must be non-negative')
    # Skip p == 0 terms: p*log(p) tends to 0 as p -> 0, but math.log(0) raises ValueError.
    total = sum(p * math.log(p, base) for p in probs if p > 0)
    # `0.0 - total` (instead of `-total`) reports a pure node as 0.0 rather than -0.0.
    return 0.0 - total

#Calculating Entropy for a 50/50 probability
entropy([0.5, 0.5])

1.0

In [3]:
# Entropy of an 80/20 split: negate the sum of p * log2(p) over both outcomes.
-sum(p * math.log(p, 2) for p in (0.8, 0.2))

0.7219280948873623

### Example

6 dogs, 8 cats in one root node (14 total)

In [4]:
# Class proportions at the root node (6 dogs and 8 cats out of 14), rounded to five decimals.
total_animals = 14
print(*(round(count / total_animals, 5) for count in (6, 8)))

0.42857 0.57143


In [5]:
# Root-node entropy, computed from the five-decimal class probabilities printed above
# (0.42857 for dogs, 0.57143 for cats).
p_dog, p_cat = 0.42857, 0.57143
-(p_dog * math.log(p_dog, 2) + p_cat * math.log(p_cat, 2))

0.9852275431175269

#### Split on plays fetch:
* Yes: 4 dogs, 1 cat
* No: 2 dogs, 7 cats

'Yes' entropy:

In [6]:
# 'Yes' branch of the 'plays fetch' split: 4 dogs and 1 cat, i.e. probabilities 0.8 and 0.2.
yes_probabilities = (0.8, 0.2)
-sum(p * math.log(p, 2) for p in yes_probabilities)

0.7219280948873623

'No' entropy:

In [7]:
# 'No' branch of the 'plays fetch' split: 2 dogs and 7 cats, with 2/9 and 7/9
# written to five decimals.
dog_term = 0.22222 * math.log(0.22222, 2)
cat_term = 0.77778 * math.log(0.77778, 2)
-(dog_term + cat_term)

0.7642004901437391

#### Split on likes cat food:
* Yes: 0 dogs, 6 cats
* No: 6 dogs, 2 cats

'Yes' entropy:

In [8]:
# 'Yes' branch of the 'likes cat food' split is pure (0 dogs, 6 cats): p(cat) = 1, p(dog) = 0.
# The p = 0 term is omitted because p*log(p) tends to 0 as p -> 0 (math.log(0) would raise ValueError).
# Subtracting from 0.0 gives 0.0; negating log(1) = 0.0 with unary minus would display -0.0.
0.0 - 1*math.log(1,2)

-0.0

In [9]:
# 'No' branch of the 'likes cat food' split: 6 dogs and 2 cats, i.e. probabilities 0.75 and 0.25.
-sum(p * math.log(p, 2) for p in (0.75, 0.25))

0.8112781244591328

## Information Gain

Information gain is the difference between the entropy of the parent
node, and the weighted average of the children nodes' entropies.

Weighted Average:
The sum of each child node's entropy times its share of the instances taken from the parent node

### Information Gain on 'plays fetch'

In [10]:
# Entropy of the parent (root) node, from the five-decimal class probabilities 0.42857 and 0.57143.
root_probabilities = (0.42857, 0.57143)
-sum(p * math.log(p, 2) for p in root_probabilities)

0.9852275431175269

In [11]:
# Weighted average of the child entropies for 'plays fetch':
# the 'Yes' child holds 5 of the 14 instances and the 'No' child holds the other 9.
yes_entropy, no_entropy = 0.7219280948873623, 0.7642004901437391
yes_entropy * (5/14) + no_entropy * (9/14)

0.7491032061236047

In [12]:
# Information gain for 'plays fetch': parent entropy minus the weighted average of the child entropies.
parent_entropy = 0.9852275431175269
children_entropy = 0.7491032061236047
parent_entropy - children_entropy

0.2361243369939222

### Information Gain on 'likes cat food'

In [13]:
# Entropy of the parent (root) node, built term by term from the five-decimal
# class probabilities (0.42857 dogs, 0.57143 cats).
dog_term = 0.42857 * math.log(0.42857, 2)
cat_term = 0.57143 * math.log(0.57143, 2)
-(dog_term + cat_term)

0.9852275431175269

In [14]:
# Weighted average of the child entropies for 'likes cat food'.
# Each pair is (child entropy, share of the 14 parent instances): the 'Yes' child
# (6 instances) has entropy 0 and the 'No' child (8 instances) has 0.8112781244591328.
child_entropies_and_weights = ((0, 6/14), (0.8112781244591328, 8/14))
sum(h * w for h, w in child_entropies_and_weights)

0.46358749969093305

In [15]:
# Information gain for 'likes cat food': root entropy less the weighted child entropy.
root_entropy, weighted_child_entropy = 0.9852275431175269, 0.46358749969093305
root_entropy - weighted_child_entropy

0.5216400434265939

#### 'Likes Cat Food' yields a larger information gain than 'Plays Fetch', so it is the better attribute to split on

## Decision Trees with sklearn

In [16]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [17]:
# Load the raw data file. header=None tells pandas the file has no header row, so the
# columns get integer labels; low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk.
df = pd.read_csv(
    'data/ad.data',
    header=None,
    low_memory=False,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1549,1550,1551,1552,1553,1554,1555,1556,1557,1558
0,125,125,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
1,57,468,8.2105,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
2,33,230,6.9696,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
3,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
4,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.


In [18]:
# The last column holds the class label; every column before it is an explanatory variable.
# Selecting the label by position (df.columns[-1]) avoids assuming the labels are 0..n-1.
response_label = df.columns[-1]
response_variable_column = df[response_label]
explanatory_variable_columns = set(df.columns) - {response_label}

# Binary target: 1 where the label is 'ad.', 0 for anything else.
y = [1 if e == 'ad.' else 0 for e in response_variable_column]

# Drop the label column rather than indexing with list(set(...)): set iteration order is
# an implementation detail, whereas drop() keeps the features in their original file order.
# drop() returns a new DataFrame, so later edits to X do not touch df.
X = df.drop(columns=response_label)

In [19]:
# Replace every cell matching optional spaces followed by a literal '?' with -1
# (presumably '?' is the dataset's missing-value marker -- TODO confirm against the data description).
# The pattern is a raw string: '\?' in a normal string literal is an invalid escape sequence,
# which newer Python versions warn about. Assigning the result instead of using inplace=True
# keeps the cell a plain "inputs -> new value" step.
X = X.replace(to_replace=r' *\?', value=-1, regex=True)

In [20]:
# Hold out a test set. random_state pins the shuffle so the split -- and every score
# reported below -- is the same after each Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [21]:
# Single-step pipeline wrapping an entropy-based decision tree. The step name 'clf' is the
# prefix used by the 'clf__*' keys of the parameter grid.
# random_state is fixed so that repeated runs build the same tree.
pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(criterion='entropy', random_state=42))
])

In [22]:
# Hyper-parameter grid for the pipeline's 'clf' step: 3 x 3 x 3 = 27 candidate combinations.
parameters = {}
parameters['clf__max_depth'] = (150, 155, 160)
parameters['clf__min_samples_split'] = (2, 3, 4)
parameters['clf__min_samples_leaf'] = (1, 2, 3)

In [23]:
# Exhaustive search over `parameters`, scored by F1, using all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    pipeline,
    parameters,
    n_jobs=-1,
    verbose=1,
    scoring='f1',
)
grid_search.fit(X_train, y_train)

# Report the best cross-validated score and the chosen value of each searched parameter.
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

# Score the selected model on the held-out test split.
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   27.0s finished


Best score: 0.872
Best parameters set:
	clf__max_depth: 155
	clf__min_samples_leaf: 3
	clf__min_samples_split: 2
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       699
          1       0.94      0.84      0.89       121

avg / total       0.97      0.97      0.97       820



## Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Rebinds `pipeline` and `parameters` from the decision-tree section to a random-forest setup.
# random_state is fixed so the forest's bootstrap samples and feature subsets -- and hence
# the reported scores -- are the same on every run.
pipeline = Pipeline([
    ('clf', RandomForestClassifier(criterion='entropy', random_state=42))
])

# Hyper-parameter grid for the 'clf' step: 4 x 3 x 3 x 3 = 108 candidate combinations.
parameters = {
    'clf__n_estimators': (5, 10, 20, 50),
    'clf__max_depth': (50, 150, 250),
    'clf__min_samples_split': (2, 3, 4),
    'clf__min_samples_leaf': (1, 2, 3)
}

In [26]:
# Grid search over the random-forest `parameters`, scored by F1 on all available cores.
search_options = {'n_jobs': -1, 'verbose': 1, 'scoring': 'f1'}
grid_search = GridSearchCV(pipeline, parameters, **search_options)
grid_search.fit(X_train, y_train)

# Best cross-validated score, then the chosen value of every searched parameter in name order.
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    chosen_value = best_parameters[param_name]
    print('\t%s: %r' % (param_name, chosen_value))

# Per-class precision / recall / F1 on the held-out test split.
predictions = grid_search.predict(X_test)
report = classification_report(y_test, predictions)
print(report)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  1.9min finished


Best score: 0.920
Best parameters set:
	clf__max_depth: 150
	clf__min_samples_leaf: 1
	clf__min_samples_split: 3
	clf__n_estimators: 50
             precision    recall  f1-score   support

          0       0.98      1.00      0.99       699
          1       0.99      0.90      0.94       121

avg / total       0.98      0.98      0.98       820

