In [1]:
from __future__ import division
import pandas as pd
import numpy as np

import graphlab as gl

from sklearn.linear_model import LogisticRegression as LR

import matplotlib
import matplotlib.pyplot as plt

matplotlib.style.use('ggplot')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1479233152.log


This non-commercial license of GraphLab Create for academic use is assigned to evan.adkins42@gmail.com and will expire on October 13, 2017.


In [6]:
import src.metadata_handler as mdh
import src.clean_db as cdb
import src.data_pipeline as dpl

%autoreload 2

## WITH LOCAL PHOTOS:

### create my dataframe

In [28]:
raw_df = dpl.create_dataframe('first_sample', model_name = 'testing')
# raw_df.head(3).T

0
0
0
0
0
0
0
0
0
10
10
10
10
10
10
10
10
10
10
20
20
20
20
20
20
20
20
20
20
30
30
30
30
30
0 photos failed to process

data/testing/raw_metadata.json 
data/testing/metadata.csv 



In [34]:
df = raw_df.rename(columns = {'Unnamed: 0':'photo_id'})

# add dummy cols to dataframe:
df = pd.concat([df, df.keywords.str.join(sep = ' ').str.get_dummies(sep=',')], axis=1)

df.loc[pd.isnull(df.copyright_notice), 'copyright_notice'] = "not_provided"

# Drop unneeded columns
df = df.drop(['by_line', 'caption_abstract',
              'contact','object_name','sub_location',
              'supplemental_category','keywords'],
             axis = 1)

bad_classifications = ["camera check", "marmot", "deer"]
df = df.drop(bad_classifications, axis = 1)

# # take only rows where copyright is properly entered
# df = df.loc[df['copyright_notice'] == 'Conservation Northwest']

df.head(3).T

Unnamed: 0,0,1,2
copyright_notice,Conservation Northwest,not_provided,Conservation Northwest
date_created,2015-05-30,,2015-06-04
file_path,first_sample/EK000004-2.JPG,first_sample/EK000004.JPG,first_sample/EK000010-2.JPG
time_created,11:49:52,,14:54:40
Camera Check,1,0,0
hoary marmot,0,0,1
mule deer,0,0,0
unidentified,0,0,0


In [27]:
# add dummy cols to dataframe:
df = pd.concat([df, df.keywords.str.join(sep = ' ').str.get_dummies(sep=',')], axis=1)

df.keywords.str.join(sep = ' ').str.get_dummies(sep=',')

# df['label'].str.join(sep='*').str.get_dummies(sep='*')

AttributeError: 'DataFrame' object has no attribute 'keywords'

### play with graphlab

graphlab example of feature extraction:
```python
>>> data = graphlab.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')
>>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
>>> m = graphlab.neuralnet_classifier.create(data,
...                                          target='label',
...                                          network=net,
...                                          max_iterations=3)
>>> # Now, let's extract features from the last layer
>>> data['features'] = m.extract_features(data)
>>> # Now, let's build a new classifier on top of extracted features
>>> m = graphlab.classifier.create(data,
...                                          features = ['features'],
...                                          target='label')
```
Now, let’s see how to load the ImageNet model, and use it for extracting features after resizing the data:
```python

>>> imagenet_model = graphlab.load_model('https://static.turi.com/products/graphlab-create/resources/models/python2.7/imagenet_model_iter45')
>>> data['image'] = graphlab.image_analysis.resize(data['image'], 256, 256, 3, decode=True)
>>> data['imagenet_features'] = imagenet_model.extract_features(data)

```


In [114]:
data2 = gl.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')

image_col = data2['image']

data2.dtype

<bound method SFrame.dtype of Columns:
	label	int
	image	Image

Rows: 6000

Data:
+-------+----------------------+
| label |        image         |
+-------+----------------------+
|   5   | Height: 28 Width: 28 |
|   8   | Height: 28 Width: 28 |
|   1   | Height: 28 Width: 28 |
|   4   | Height: 28 Width: 28 |
|   2   | Height: 28 Width: 28 |
|   7   | Height: 28 Width: 28 |
|   0   | Height: 28 Width: 28 |
|   2   | Height: 28 Width: 28 |
|   5   | Height: 28 Width: 28 |
|   9   | Height: 28 Width: 28 |
+-------+----------------------+
[6000 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.>

### turn dataframe into SFrame

In [83]:
data = gl.SFrame(df)

# GET IMAGES:
img_sframe = gl.image_analysis.load_images('first_sample', "auto", with_path=False,
                                                    recursive=True)

# ADD IMAGES TO data
data.add_column(img_sframe['image'], name='image')

# Filter for desired rows ('copyright_notice' == 'Conservation Northwest')
data = data[data['copyright_notice'] == 'Conservation Northwest']

In [85]:
# Get Imagenet pretrained CNN
imagenet_model = gl.load_model('https://static.turi.com/products/graphlab-create\
                                        /resources/models/python2.7/imagenet_model_iter45')
# Convert photos to proper size for imagenet
data['image'] = gl.image_analysis.resize(data['image'], 256, 256, 3, decode=True)

# Get features from CNN
data['imagenet_features'] = imagenet_model.extract_features(data)

# Create Model!

In [114]:
log_regr = LR(penalty='l2', dual=False, tol=0.0001, C=1.0,
           fit_intercept=True, intercept_scaling=1,
           class_weight=None, random_state=None, solver='liblinear',
           max_iter=100, multi_class='ovr', verbose=0,
           warm_start=False, n_jobs=2)

X = data['imagenet_features']
y = data["[u'mule deer']"]

# fit model to data
model = log_regr.fit(X,y)
pred = model.predict(X)
prob = model.predict_proba(X)

print'true | pred | prob [mule deer = true]'
for a,b,c in zip(y, pred, prob):
    print a,'  ',b,'  ',c[1]

print model.score(X,y)

true | pred | prob [mule deer = true]
0.0    0.0    0.00226344941121
0.0    0.0    0.0433000493441
0.0    0.0    0.00980105000864
0.0    0.0    0.477570801839
0.0    0.0    0.498634624133
0.0    0.0    0.0618667062038
1.0    1.0    0.946675543696
1.0    1.0    0.987679561889
1.0    0.0    0.498634624133
1.0    1.0    0.990925428388
0.0    0.0    0.00319358644227
0.0    0.0    0.00319358644227
1.0    1.0    0.501632769344
1.0    0.0    0.477570801839
1.0    1.0    0.990925428388
0.0    1.0    0.501632769344
0.0    0.0    0.00319358644227


ValueError: Expected array-like (array or non-string sequence), got dtype: float
Rows: 17
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0]