In [38]:
''
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, recall_score, precision_score, fbeta_score, make_scorer, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, ShuffleSplit, learning_curve, validation_curve
from sklearn.feature_selection import RFE
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

import seaborn as sns
import pandas as pd
import numpy as np

# Dict of animal types
types_dict = {
    1: 'aardvark, antelope, bear, boar, buffalo, calf, cavy, cheetah, deer, dolphin, elephant, fruitbat, giraffe, girl, goat, gorilla, hamster, hare, leopard, lion, lynx, mink, mole, mongoose, opossum, oryx, platypus, polecat, pony, porpoise, puma, pussycat, raccoon, reindeer, seal, sealion, squirrel, vampire, vole, wallaby,wolf',
    2:'chicken, crow, dove, duck, flamingo, gull, hawk, kiwi, lark, ostrich, parakeet, penguin, pheasant, rhea, skimmer, skua, sparrow, swan, vulture, wren',
    3:'pitviper, seasnake, slowworm, tortoise, tuatara', 
    4:'bass, carp, catfish, chub, dogfish, haddock, herring, pike, piranha, seahorse, sole, stingray, tuna',
    5:'frog, frog, newt, toad',
    6:'flea, gnat, honeybee, housefly, ladybird, moth, termite, wasp', 
    7:'clam, crab, crayfish, lobster, octopus, scorpion, seawasp, slug, starfish, worm'
}


# Generate dataframe
data = pd.read_csv('zoo.data', sep=',', names=[
    'animal', 'hair',
    'feather', 'eggs',
    'milk', 'airborne',
    'aquatic', 'predator',
    'toothed', 'backbone',
    'breathes', 'venomous',
    'fins', 'legs',
    'tail', 'domestic',
    'catsize', 'type'
])

data.head()

Unnamed: 0,animal,hair,feather,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [42]:

# print(pd.get_dummies(data['animal'], prefix='type'))
# Preprocessing data
# df = data.drop(['animal'], axis=1)
df = data.drop(['type'], axis=1)

# stats by type of
#count_by_type = df['type'].value_counts()
#count_by_type.plot(x='type', y='freq', kind='bar', legend=False, grid=True, figsize=(8, 5))
#plt.title("Types of animals")
#plt.ylabel('# of Occurrences', fontsize=12)
#plt.xlabel('category', fontsize=12)


# X = df.drop(['type'], axis=1)
X = df.drop(['animal'], axis=1)
# y = df['type']
encoder = LabelEncoder()
y = encoder.fit_transform(df['animal'])
categories = df['animal'].

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
 99]


In [17]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 0, test_size=0.2)


In [18]:
# Cross validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

C = np.arange(1, 10)

# # LEARNING CURVE SCORE
# # Create three different models based on max_depth
for k, C in enumerate(C):
    # Create a Decision tree regressor at max_depth = depth
    regressor = LogisticRegression(C=C)

    # Calculate the training and testing scores
    sizes, train_scores, test_scores = learning_curve(
        regressor, X, y, cv=cv, n_jobs=4,
        scoring=make_scorer(accuracy_score)
    )

    print('C:', C)
    print('score train:', np.mean(train_scores))
    print('score test:', np.mean(test_scores))
    

C: 1
score train: 0.994791666667
score test: 0.816255078608
C: 2
score train: 0.998958333333
score test: 0.827224871931
C: 3
score train: 1.0
score test: 0.827224871931
C: 4
score train: 1.0
score test: 0.827224871931
C: 5
score train: 1.0
score test: 0.827224871931
C: 6
score train: 1.0
score test: 0.827224871931
C: 7
score train: 1.0
score test: 0.827224871931
C: 8
score train: 1.0
score test: 0.827224871931
C: 9
score train: 1.0
score test: 0.827224871931


In [19]:
C = np.arange(1, 10)
regressor = LogisticRegression()
train_scores, test_scores = validation_curve(
    regressor, X, y, cv=cv, param_name='C', param_range=C,
    scoring=make_scorer(accuracy_score)
)

print('\n')
# For each depth
for x, k in enumerate(train_scores):
    print('C', x + 1)
    print('score train:', np.mean(train_scores[x]))
    print('score test:', np.mean(test_scores[x]))



C 1
score train: 0.974705039941
score test: 0.921338986045
C 2
score train: 0.995305164319
score test: 0.960272036743
C 3
score train: 1.0
score test: 0.960272036743
C 4
score train: 1.0
score test: 0.960272036743
C 5
score train: 1.0
score test: 0.960272036743
C 6
score train: 1.0
score test: 0.960272036743
C 7
score train: 1.0
score test: 0.960272036743
C 8
score train: 1.0
score test: 0.960272036743
C 9
score train: 1.0
score test: 0.960272036743


In [36]:
estimator = LogisticRegression()
grid = GridSearchCV(
    estimator, param_grid={'C': [1, 2, 4, 3, 5], 'penalty': ['l1', 'l2']},
    scoring='accuracy', cv=cv
)

grid.fit(X_train, y_train)
best_estimator = grid.best_estimator_
y_pred = best_estimator.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

print(types_dict[best_estimator.predict([[0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 2, 0, 0, 0]])[0]])
df.head()

Accuracy: 1.0
aardvark, antelope, bear, boar, buffalo, calf, cavy, cheetah, deer, dolphin, elephant, fruitbat, giraffe, girl, goat, gorilla, hamster, hare, leopard, lion, lynx, mink, mole, mongoose, opossum, oryx, platypus, polecat, pony, porpoise, puma, pussycat, raccoon, reindeer, seal, sealion, squirrel, vampire, vole, wallaby,wolf


Unnamed: 0,hair,feather,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
