In [326]:
from __future__ import division
import os
import re
import time

import tensorflow as tf
import tensorflow.python.platform
from tensorflow.python.platform import gfile

import numpy as np
import pandas as pd

import cPickle as pickle

import sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.svm import SVC, LinearSVC

from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
%matplotlib inline

In [205]:
import src.data_pipeline as dpl
import src.fsm2 as fsm
import src.predict_one as p_1

In [217]:
def create_SVC():
    """Return an unfitted support-vector classifier with a linear kernel.

    All constructor arguments are listed explicitly so the configuration is
    visible in one place. `probability=True` is what enables
    `predict_proba` on the fitted estimator.
    """
    # An alternative that was tried and left disabled:
    # LinearSVC(C=1.0, loss='squared_hinge', penalty='l2', multi_class='ovr')
    svc_params = dict(
        C=1.0,
        kernel='linear',
        degree=3,
        gamma='auto',
        coef0=0.0,
        shrinking=True,
        probability=True,
        tol=0.001,
        cache_size=200,
        class_weight=None,
        verbose=False,
        max_iter=-1,
        decision_function_shape='ovr',
        random_state=None,
    )
    return SVC(**svc_params)

# Create Model
def create_RF():
    """Return an unfitted random-forest classifier (60 trees, depth cap 200).

    `RF` is the notebook's alias for sklearn's RandomForestClassifier.
    Every constructor argument is listed explicitly; `random_state=None`
    means the forest is not seeded.
    """
    forest_params = dict(
        n_estimators=60,
        criterion='gini',
        max_depth=200,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features="auto",
        max_leaf_nodes=None,
        min_impurity_split=1e-07,
        bootstrap=True,
        oob_score=False,
        n_jobs=2,
        random_state=None,
        verbose=0,
        warm_start=False,
        class_weight=None,
    )
    return RF(**forest_params)

def run_fit(model, X, y, test_size=0.2, random_state=42):
    """Split the data, fit `model` on the training part and predict the rest.

    Parameters
    ----------
    model : estimator exposing `fit`, `predict` and `predict_proba`.
    X : feature matrix accepted by `train_test_split` / `model.fit`.
    y : labels aligned with the rows of `X`.
    test_size : fraction of the samples held out for testing (default 0.2).
    random_state : seed for the split, so repeated calls hold out the same
        rows (default 42).

    Returns
    -------
    (model, X_test, y_test, y_pred, y_prob) where `model` is the fitted
    estimator, `y_pred` its predictions for `X_test`, and `y_prob` the
    output of `predict_proba` for `X_test`.
    """
    # Split the arguments that were passed in (not notebook globals), using
    # the `train_test_split` imported from sklearn.model_selection.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    model = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    return model, X_test, y_test, y_pred, y_prob

def save_model(model, path_name):
    """Pickle `model` into the file at `path_name`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns None.
    """
    with open(path_name, 'wb') as out_file:
        pickle.dump(model, out_file)

# eval tools

In [245]:
def eval_model(model, X_test, y_test):
    # Modify for web app (or create new function)
    # to predict one photo
    def crossval(score_type):
        try: score = cross_val_score(model, X_test, y_test,
                            cv = 5, scoring= score_type)
        except: score = 'invalid metric'

        return score
    
    print "accuracy : ", crossval('accuracy')
    print "precision: ", crossval('precision')
    print "recall   : ", crossval('recall')
    print "- logloss: ", crossval('neg_log_loss')

In [194]:
# Load the 'second_sample_data' set. F is used below as the feature input
# and L as the labels (L supports `.value_counts()`).
# NOTE(review): `paths` is presumably the source file paths; it is not used
# in the cells shown here -- confirm against src.fsm2.prep_data.
F,L,paths = fsm.prep_data('second_sample_data')

dropping "hare" for balance


# SVC model

In [195]:
svm = create_SVC()
model, X_test, y_test, y_pred, y_prob = run_fit(svm,F,L)

# Manually put labels in order
alph_val_count = np.array([[20, 12, 19, 20, 37]])
percent_matrix = confusion_matrix(y_test,y_pred)*100/alph_val_count.T

print "Total value counts: \n",L.value_counts()
print "\nTest set value counts:\n",y_test.value_counts()
print '\nTest Confusion Matrix:\n',confusion_matrix(y_test,y_pred)
print '\nPercentage (sort of)\n', percent_matrix
print '\n'

Total value counts: 
ungulate    193
small       104
other       101
canine       95
feline       43
Name: labels, dtype: int64

Test set value counts:
ungulate    37
canine      20
small       20
other       19
feline      12
Name: labels, dtype: int64

Test Confusion Matrix:
[[20  0  0  0  0]
 [ 0 11  0  1  0]
 [ 0  0 18  0  1]
 [ 1  0  0 19  0]
 [ 0  0  0  0 37]]

Percentage (sort of)
[[ 100.            0.            0.            0.            0.        ]
 [   0.           91.66666667    0.            8.33333333    0.        ]
 [   0.            0.           94.73684211    0.            5.26315789]
 [   5.            0.            0.           95.            0.        ]
 [   0.            0.            0.            0.          100.        ]]




In [244]:
# Pickle the SVC object to a path relative to the working directory.
# save_model opens the path with open(..., 'wb'), so the 'data/' directory
# must already exist.
save_model(svm, 'data/svm.pkl')

In [247]:
# Print 5-fold cross-validated scores for the SVC; eval_model runs the
# cross-validation over the X_test / y_test passed here.
eval_model(svm, X_test, y_test)

accuracy :  [ 0.91304348  0.91304348  0.95238095  0.95238095  0.9       ]
precision:  invalid metric
recall   :  invalid metric
- logloss:  [-0.3787518  -0.41039874 -0.39754279 -0.35578358 -0.45175097]


# RF model

In [212]:
rf = create_RF()
rf, X_test, y_test, y_pred, y_prob = run_fit(rf,F,L)

# Manually put labels in order
alph_val_count = np.array([[20, 12, 19, 20, 37]])
percent_matrix = confusion_matrix(y_test,y_pred)*100/alph_val_count.T

print "Total value counts: \n",L.value_counts()
print "\nTest set value counts:\n",y_test.value_counts()
print '\nTest Confusion Matrix:\n',confusion_matrix(y_test,y_pred)
print '\nPercentage (sort of)\n', percent_matrix
print '\n'

Total value counts: 
ungulate    193
small       104
other       101
canine       95
feline       43
Name: labels, dtype: int64

Test set value counts:
ungulate    37
canine      20
small       20
other       19
feline      12
Name: labels, dtype: int64

Test Confusion Matrix:
[[19  0  0  0  1]
 [ 1 11  0  0  0]
 [ 0  0 13  0  6]
 [ 5  0  1 13  1]
 [ 0  0  0  0 37]]

Percentage (sort of)
[[  95.            0.            0.            0.            5.        ]
 [   8.33333333   91.66666667    0.            0.            0.        ]
 [   0.            0.           68.42105263    0.           31.57894737]
 [  25.            0.            5.           65.            5.        ]
 [   0.            0.            0.            0.          100.        ]]




In [248]:
# Print 5-fold cross-validated scores for the random forest; eval_model runs
# the cross-validation over the X_test / y_test passed here.
eval_model(rf, X_test, y_test)

accuracy :  [ 0.82608696  0.7826087   0.9047619   0.85714286  0.85      ]
precision:  invalid metric
recall   :  invalid metric
- logloss:  [-0.72018617 -0.80430246 -0.74306494 -0.66075032 -0.75257739]
