In [11]:
from __future__ import division
import os
import pandas as pd
import numpy as np

import graphlab as gl

from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import cross_val_score

import matplotlib
import matplotlib.pyplot as plt

matplotlib.style.use('ggplot')

In [2]:
import src.metadata_handler as mdh
import src.clean_db as cdb
import src.data_pipeline as dpl

%autoreload 2

### play with graphlab

graphlab example of feature extraction:
```python
>>> data = graphlab.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')
>>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
>>> m = graphlab.neuralnet_classifier.create(data,
...                                          target='label',
...                                          network=net,
...                                          max_iterations=3)
>>> # Now, let's extract features from the last layer
>>> data['features'] = m.extract_features(data)
>>> # Now, let's build a new classifier on top of extracted features
>>> m = graphlab.classifier.create(data,
...                                          features = ['features'],
...                                          target='label')
```
Now, let’s see how to load the ImageNet model, and use it for extracting features after resizing the data:
```python

>>> imagenet_model = graphlab.load_model('https://static.turi.com/products/graphlab-create/resources/models/python2.7/imagenet_model_iter45')
>>> data['image'] = graphlab.image_analysis.resize(data['image'], 256, 256, 3, decode=True)
>>> data['imagenet_features'] = imagenet_model.extract_features(data)

```


In [114]:
data2 = gl.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')

image_col = data2['image']

data2.dtype

<bound method SFrame.dtype of Columns:
	label	int
	image	Image

Rows: 6000

Data:
+-------+----------------------+
| label |        image         |
+-------+----------------------+
|   5   | Height: 28 Width: 28 |
|   8   | Height: 28 Width: 28 |
|   1   | Height: 28 Width: 28 |
|   4   | Height: 28 Width: 28 |
|   2   | Height: 28 Width: 28 |
|   7   | Height: 28 Width: 28 |
|   0   | Height: 28 Width: 28 |
|   2   | Height: 28 Width: 28 |
|   5   | Height: 28 Width: 28 |
|   9   | Height: 28 Width: 28 |
+-------+----------------------+
[6000 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.>

In [None]:
imagenet_model = graphlab.load_model('https://static.turi.com/products/graphlab-create\
                                        /resources/models/python2.7/imagenet_model_iter45')

data['image'] = graphlab.image_analysis.resize(data['image'], 256, 256, 3, decode=True)

data['imagenet_features'] = imagenet_model.extract_features(data)

## WITH LOCAL PHOTOS:

### create my dataframe

In [3]:
raw_df = dpl.create_dataframe('data/second_sample', model_name = 'testing')
# raw_df.head(3).T

0
0
0
0
0
0
0
0
0
10
10
10
10
10
10
10
10
10
10
20
20
20
20
20
20
20
20
20
20
30
30
30
30
30
30
30
30
30
30
40
40
40
40
40
40
40
40
40
40
50
50
50
50
50
50
50
50
50
50
60
60
60
60
60
60
60
60
60
60
70
70
70
70
70
70
70
70
70
70
80
80
80
80
80
80
80
80
80
80
90
90
90
90
90
90
90
90
90
90
100
100
100
100
100
100
100
100
100
100
110
110
110
110
110
110
110
110
110
110
120
120
120
120
120
120
120
120
120
120
130
130
130
130
130
130
130
130
130
130
140
140
140
140
140
140
140
140
140
140
150
150
150
150
150
150
150
150
150
150
160
160
160
160
160
160
160
160
160
160
170
170
170
170
170
170
170
170
170
170
180
180
180
180
180
180
180
180
180
180
190
190
190
190
190
190
190
190
190
190
200
200
200
200
200
200
200
200
200
200
210
210
210
210
210
210
210
210
210
210
220
220
220
220
220
220
220
220
220
220
230
230
230
230
230
230
230
230
230
230
240
240
240
240
240
240
240
240
240
240
250
250
250
250
250
250
250
250
250
250
260
260
260
260
260
260
260
260
260
260
270
270
270
270
270
270
270
270


In [4]:
df = raw_df.rename(columns = {'Unnamed: 0':'photo_id'})

# # add dummy cols to dataframe:
# df = pd.concat([df, df.keywords.str.join(sep = ' ').str.get_dummies(sep=',')], axis=1)

df.loc[pd.isnull(df.copyright_notice), 'copyright_notice'] = "not_provided"

# Drop unneeded columns
df = df.drop(['by_line', 'caption_abstract',
              'contact','object_name','sub_location',
              'supplemental_category','keywords'],
             axis = 1)


# bad_classifications = ["camera check", "marmot", "deer"]
# df = df.drop(bad_classifications, axis = 1)

# # take only rows where copyright is properly entered
# df = df.loc[df['copyright_notice'] == 'Conservation Northwest']

df.head(8).T

Unnamed: 0,0,1,2,3,4,5,6,7
copyright_notice,Conservation Northwest,Conservation Northwest,Conservation Northwest,Conservation Northwest,Conservation Northwest,Conservation Northwest,Conservation Northwest,Conservation Northwest
date_created,20120127,20120127,20120127,20120127,20120127,20120127,20120127,20120127
file_path,data/second_sample/BC Kettles/Site 1/April 201...,data/second_sample/BC Kettles/Site 1/April 201...,data/second_sample/BC Kettles/Site 1/April 201...,data/second_sample/BC Kettles/Site 1/April 201...,data/second_sample/BC Kettles/Site 1/April 201...,data/second_sample/BC Kettles/Site 1/April 201...,data/second_sample/BC Kettles/Site 1/April 201...,data/second_sample/BC Kettles/Site 1/April 201...
time_created,033159,033211,033212,033213,03:32:53800,03:32:54800,03:32:54800,03:33:05800
Camera Check,0,0,0,0,0,0,0,0
Canada lynx,1,1,1,1,1,1,1,1
Robin,0,0,0,0,0,0,0,0
Squirrel,0,0,0,0,0,0,0,0
Squirrel (unidentified),0,0,0,0,0,0,0,0
White-tailed deer,0,0,0,0,0,0,0,0


# turn dataframe into SFrame

In [6]:
data = gl.SFrame(df)

# GET IMAGES:
img_sframe = gl.image_analysis.load_images('data/second_sample', "auto", with_path=True,
                                                    recursive=True)


# ADD IMAGES TO data
data.add_column(img_sframe['image'], name='image')

# not needed for sample 2
# Filter for desired rows ('copyright_notice' == 'Conservation Northwest')
# data = data[data['copyright_notice'] == 'Conservation Northwest']


copyright_notice,date_created,file_path,time_created,Camera Check,Canada lynx,Robin
Conservation Northwest,20120127,data/second_sample/BC Kettles/Site 1/April ...,033159,0,1,0
Conservation Northwest,20120127,data/second_sample/BC Kettles/Site 1/April ...,033211,0,1,0
Conservation Northwest,20120127,data/second_sample/BC Kettles/Site 1/April ...,033212,0,1,0
Conservation Northwest,20120127,data/second_sample/BC Kettles/Site 1/April ...,033213,0,1,0
Conservation Northwest,20120127,data/second_sample/BC Kettles/Site 1/April ...,03:32:53800,0,1,0
Conservation Northwest,20120127,data/second_sample/BC Kettles/Site 1/April ...,03:32:54800,0,1,0
Conservation Northwest,20120127,data/second_sample/BC Kettles/Site 1/April ...,03:32:54800,0,1,0
Conservation Northwest,20120127,data/second_sample/BC Kettles/Site 1/April ...,03:33:05800,0,1,0
Conservation Northwest,20120127,data/second_sample/BC Kettles/Site 1/April ...,03:33:06800,0,1,0
Conservation Northwest,20120127,data/second_sample/BC Kettles/Site 1/April ...,03:33:07800,0,1,0

Squirrel,Squirrel (unidentified),White-tailed deer,Wolverine,bird,bobcat,chipmunk,cougar,coyote,domestic dog
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0

elk,mouse,mule deer,northern flying squirrel,red squirrel,snowshoe hare,striped skunk,unidentified
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0

image
Height: 1944 Width: 2592
Height: 1944 Width: 2592
Height: 1944 Width: 2592
Height: 1944 Width: 2592
Height: 1944 Width: 2592
Height: 1944 Width: 2592
Height: 1944 Width: 2592
Height: 1944 Width: 2592
Height: 1944 Width: 2592
Height: 1944 Width: 2592


In [18]:
# Get Imagenet pretrained CNN
imagenet_model = gl.load_model('https://static.turi.com/products/graphlab-create\
                                            /resources/models/python2.7/imagenet_model_iter45')

In [19]:
def imgnet_features(data):
    # Convert photos to proper size for imagenet
    data['image'] = gl.image_analysis.resize(data['image'], 256, 256, 3, decode=True)

    # Get features from CNN
    data['imagenet_features'] = imagenet_model.extract_features(data)
    
    return data

In [54]:
# take smaller shard for faster itteration
data_subset, _ = data.random_split(0.6, seed=42)
data_subset = imgnet_features(data_subset)

# Create Model!

In [58]:
# Listing categories
print 'dataframe shape: ', df.shape, '\n'
for col in df.drop(['time_created','date_created','file_path','copyright_notice'], axis = 1).columns:
    print col, df[col].sum()

dataframe shape:  (1464, 25) 

Camera Check 10
Canada lynx 34
Robin 10
Squirrel 1
Squirrel (unidentified) 3
White-tailed deer 123
Wolverine 3
bird 6
bobcat 9
chipmunk 1
cougar 6
coyote 83
domestic dog 12
elk 34
mouse 61
mule deer 36
northern flying squirrel 5
red squirrel 17
snowshoe hare 928
striped skunk 9
unidentified 72


In [56]:
train_data, test_data = data_subset.random_split(0.8, seed=42)

predict_class = "snowshoe hare"

log_regr = LR(penalty='l2', dual=False, tol=0.0001, C=1.0,
           fit_intercept=True, intercept_scaling=1,
           class_weight='balanced', random_state=None, solver='liblinear',
           max_iter=100, multi_class='ovr', verbose=0,
           warm_start=False, n_jobs=2)

X = list(train_data['imagenet_features'])
y = list(train_data[predict_class])

X_test = list(test_data['imagenet_features'])
y_test = list(test_data[predict_class])

# fit model to data
model = log_regr.fit(X,y)
pred = model.predict(X_test)
prob = model.predict_proba(X_test)

print model.score(X_test, y_test)
print "accuracy : ", cross_val_score(model, X_test, y_test, scoring= 'accuracy')
print "precision: ", cross_val_score(model, X_test, y_test, scoring='precision')
print "recall   : ", cross_val_score(model, X_test, y_test, scoring='recall')
print "- logloss: ", cross_val_score(model, X_test, y_test, scoring='neg_log_loss')


# print'true | pred | prob_true'
# for a,b,c in zip(y_test, pred, prob):
#     print '  ',a,'   ',b,'   ',c[1]

0.51912568306
accuracy :  [ 0.54098361  0.55737705  0.54098361]
precision:  [ 0.61111111  0.63636364  0.63333333]
recall   :  [ 0.61111111  0.58333333  0.52777778]
- logloss:  [-1.43548735 -1.0946413  -1.57069994]


In [59]:
if 'sframe' in os.listdir('data/FirstStupidModel/'):
    print 'Yay!'
    
else:
    print "boo : ("

Yay!
