# Motivation
A TPOT demo based on [Siraj Raval's Genetic Algorithms challenge](https://youtu.be/dSofAXnnFrY)

In [None]:
## clean data

In [None]:
%%bash
# Download the MAGIC gamma telescope data set and build cleandata.csv
# (a header row followed by the raw comma-separated data).
set -euo pipefail

base_url=https://archive.ics.uci.edu/ml/machine-learning-databases/magic

# Fetch each file only when it is missing, so re-running the cell does not
# create magic04.data.1, magic04.names.1, ... next to the originals.
# Download to a temporary name first so an interrupted transfer is never
# mistaken for a complete file.
for f in magic04.data magic04.names; do
    if [ ! -f "$f" ]; then
        wget -O "$f.part" "$base_url/$f"
        mv "$f.part" "$f"
    fi
done

# Extract the attribute names from the "Attribute information" section of the
# .names file (second field of every line containing ':', minus the section
# title itself) and join them with commas.
# `paste -s` always terminates its output with a newline, so the header can
# never be glued onto the first data row when the files are concatenated.
grep -A 12 "Attribute information" magic04.names \
    | grep ":" \
    | awk '{print $2}' \
    | sed 's/://' \
    | tail -n +2 \
    | paste -sd, - > header.txt

cat header.txt magic04.data > cleandata.csv

In [1]:
import pandas as pd

In [3]:
# Load the cleaned file; a comma is read_csv's default field separator
raw_df = pd.read_csv('cleandata.csv')


In [4]:
# Preview the first rows to check that the header row was picked up correctly
raw_df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


We have 19k examples, with about 2/3 g and 1/3 h

In [14]:
# Number of examples per value of the 'class' column
raw_df['class'].value_counts()

g    12332
h     6688
Name: class, dtype: int64

# `demo.py`


In [15]:
# train_test_split is imported from sklearn.model_selection: the older
# sklearn.cross_validation module has been removed from scikit-learn, and the
# pipeline code exported by TPOT further down already uses model_selection.
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np



In [16]:
# Read the cleaned, comma-separated file (header row + data) into a DataFrame
telescope = pd.read_csv('cleandata.csv', sep=',')

In [17]:
# Shuffle the rows. A dedicated, seeded generator is used so that a fresh
# kernel run reproduces exactly the same row order.
shuffle_rng = np.random.RandomState(42)
telescope_shuffle = telescope.iloc[shuffle_rng.permutation(len(telescope))]
# Drop the old (now shuffled) index and renumber the rows 0..n-1
tele = telescope_shuffle.reset_index(drop=True)

In [19]:
# Encode the two classes as integers: g -> 0, h -> 1.
# The labels are always taken from the untouched `telescope_shuffle` frame
# (same row order as `tele`), so re-running this cell gives the same result.
# Mapping tele['class'] onto itself would turn the already-encoded 0/1 values
# into NaN on a second run. `.values` bypasses index alignment, because
# `telescope_shuffle` still carries the pre-reset index.
tele['class'] = telescope_shuffle['class'].map({'g': 0, 'h': 1}).values
tele_class = tele['class'].values

In [29]:
# Save the shuffled, label-encoded data as a tab-separated file.
# index=False keeps the row index out of the file; with the default it is
# written as an unnamed first column, which pd.read_csv later returns as an
# extra 'Unnamed: 0' column alongside the real features.
tele.to_csv('cleandata.mapped.tsv', sep='\t', index=False)

In [20]:
# Split the row labels into a training set (75%) and a held-out validation
# set (25%), stratified on the class label. random_state makes the split
# reproducible across runs.
training_indices, validation_indices = train_test_split(
    tele.index, stratify=tele_class, train_size=0.75, test_size=0.25,
    random_state=42)
# Only two subsets exist: `testing_indices` is an alias of the validation set,
# kept for any code that refers to the held-out rows by that name.
testing_indices = validation_indices

This step takes hours to run on an 8-core MacBook Pro (TPOT is run here with `n_jobs=1`, i.e. on a single core). If we want more `generations`, it will take longer.

In [22]:
%%time
#Let Genetic Programming find best ML model and hyperparameters
tpot = TPOTClassifier(generations=5, verbosity=2)
tpot.fit(tele.drop('class', axis=1).loc[training_indices].values,
         tele.loc[training_indices, 'class'].values)



Optimization Progress:  34%|███▎      | 202/600 [38:28<50:02,  7.54s/pipeline]   

Generation 1 - Current best internal CV score: 0.878723701862


Optimization Progress:  50%|█████     | 302/600 [55:34<1:55:38, 23.28s/pipeline]

Generation 2 - Current best internal CV score: 0.878723701862


Optimization Progress:  67%|██████▋   | 402/600 [1:23:36<52:31, 15.92s/pipeline]  

Generation 3 - Current best internal CV score: 0.878723701862


Optimization Progress:  84%|████████▎ | 502/600 [2:06:14<43:59, 26.93s/pipeline]  

Generation 4 - Current best internal CV score: 0.878723701862


                                                                                    

Generation 5 - Current best internal CV score: 0.882509387374

Best pipeline: RandomForestClassifier(PolynomialFeatures(input_matrix, degree=2, include_bias=False, interaction_only=False), bootstrap=True, criterion=entropy, max_features=0.25, min_samples_leaf=3, min_samples_split=5, n_estimators=100)


TPOTClassifier(config_dict={'sklearn.ensemble.GradientBoostingClassifier': {'max_features': array([ 0.05,  0.1 ,  0.15,  0.2 ,  0.25,  0.3 ,  0.35,  0.4 ,  0.45,
        0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,
        0.95,  1.  ]), 'learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0], 'min_samples_... 0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,
        0.95,  1.  ])}, 'sklearn.preprocessing.RobustScaler': {}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=5, max_eval_time_mins=5,
        max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=100, periodic_checkpoint_folder=None,
        population_size=100, random_state=None, scoring=None,
        subsample=1.0, verbosity=2, warm_start=False)

This time TPOT came up with a `RandomForestClassifier` fed by degree-2 `PolynomialFeatures`.

In [23]:
# Score the best pipeline on the held-out validation rows
validation_rows = tele.loc[validation_indices]
tpot.score(validation_rows.drop('class', axis=1).values,
           validation_rows['class'].values)

0.87613038906414298

In [24]:
# Write the Python source code of the best pipeline found by TPOT to 'pipeline.py'
tpot.export('pipeline.py')

True

## Load the pipeline
`tpot.export` gives us the source code of pipelines

In [None]:
# %load pipeline.py
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# NOTE: the label column in the data file is named 'class'
tpot_data = pd.read_csv('cleandata.mapped.tsv', sep='\t', dtype=np.float64)
features = tpot_data.drop('class', axis=1).values
training_features, testing_features, training_target, testing_target = train_test_split(
    features, tpot_data['class'].values, random_state=42)

# TPOT's best internal CV score for this pipeline was 0.882509387374
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    RandomForestClassifier(
        bootstrap=True,
        criterion="entropy",
        max_features=0.25,
        min_samples_leaf=3,
        min_samples_split=5,
        n_estimators=100,
    ),
)

# Fit on the training split, then predict on both splits for the
# confusion matrices below.
exported_pipeline.fit(training_features, training_target)
training_results = exported_pipeline.predict(training_features)
testing_results = exported_pipeline.predict(testing_features)


In [None]:
## Results (as confusion matrix)

In [43]:
from sklearn.metrics import confusion_matrix

### Train

In [50]:
# Rows are the true classes, columns the predicted classes (training split)
confusion_matrix(training_target, training_results)

array([[9239,   23],
       [ 117, 4886]])

### Test

In [51]:
# Rows are the true classes, columns the predicted classes (test split)
confusion_matrix(testing_target, testing_results)

array([[2894,  176],
       [ 387, 1298]])

# Other pipelines
* We can also look at the pipelines in `tpot.pareto_front_fitted_pipelines_`. As the name indicates, these come from TPOT's [Pareto optimization](https://en.wikipedia.org/wiki/Multi-objective_optimization).

In [60]:
# Take one pipeline from TPOT's Pareto front. On Python 3 `dict.values()` is a
# view that cannot be indexed, so it is turned into a list first (this form
# also works on Python 2).
pipeline = list(tpot.pareto_front_fitted_pipelines_.values())[0]


In [63]:
# Refit the selected Pareto-front pipeline on the same training split as the exported pipeline
pipeline.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('gradientboostingclassifier', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=9,
              max_features=0.95, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              m...     presort='auto', random_state=None, subsample=0.45, verbose=0,
              warm_start=False))])

In [65]:
# Predict with the Pareto-front pipeline. Note that this overwrites the
# predictions previously stored under the same names.
training_results = pipeline.predict(training_features)
testing_results = pipeline.predict(testing_features)

### Train

In [66]:
# Rows are the true classes, columns the predicted classes (training split)
confusion_matrix(training_target, training_results)

array([[9166,   96],
       [ 374, 4629]])

### Test

In [68]:
# Rows are the true classes, columns the predicted classes (test split)
confusion_matrix(testing_target, testing_results)

array([[2871,  199],
       [ 385, 1300]])

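Comparing the off-diagonal entries of the two test confusion matrices, this `gradientboostingclassifier` pipeline misclassifies slightly more held-out examples (199 + 385 = 584) than the exported random-forest pipeline (176 + 387 = 563).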