# Loading Data
In this notebook, we will load various kinds of data from local files and from external sources.

In [1]:
# We'll also import a few standard python libraries
from matplotlib import pyplot
import numpy as np
import time

# These are the droids you are looking for.
from caffe2.python import core, workspace
from caffe2.proto import caffe2_pb2

# Let's show all plots inline.
%matplotlib inline



## Load Fixed Width file
In this chapter, we will load a fixed-width file from a local path. The file contains geographic information about weather stations; we will extract (longitude, latitude, elevate) and plot them on a panel.

In [2]:
import os
from StringIO import StringIO

# Folder that holds the weather tutorial files, under the current user's home.
data_folder = os.path.join(os.path.expanduser('~'), 'python', 'tutorial_files','weather')
#print("{}".format(data_folder))

# Read the whole file inside a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(os.path.join(data_folder,'Station.txt'), 'r') as station_file:
    raw_data = station_file.read()
#print('Raw data looks like this:')
#print(raw_data[966:1300] + '...')

# Parse the fixed-width records. This call is deliberately the last expression
# of the cell so that the notebook displays the resulting masked array.
np.genfromtxt(
    StringIO(raw_data),
    delimiter=(7,6,30,3,3,3,6,7,8,10,9,9),  # width of each field
    skip_header=22,  # skip 22 lines of header
    usecols=(8, 7, 9),  # reorder columns from (latitude, longitude, elevate) to (longitude, latitude, elevate)
    missing_values="-99999,-999999",  # sentinel values treated as missing (masked in the result)
    dtype=[('longitude','f8'),('latitude','f8'),('elevate','f8')],
    usemask=True)  # return a masked array so missing values show up as masked


masked_array(data = [(10350.0, 46817.0, 14200.0) (--, --, --) (--, --, --) ...,
 (-96565.0, 40848.0, 3624.0) (-96854.0, 40695.0, 4182.0) (--, --, --)],
             mask = [(False, False, False) ( True,  True,  True) ( True,  True,  True) ...,
 (False, False, False) (False, False, False) ( True,  True,  True)],
       fill_value = (  1.00000000e+20,   1.00000000e+20,   1.00000000e+20),
            dtype = [('longitude', '<f8'), ('latitude', '<f8'), ('elevate', '<f8')])

## Load Delimited CSV file
In this chapter, we will load a comma-delimited CSV file from a local path. The file contains POS transactions.

In [3]:
import os
from StringIO import StringIO

# Folder that holds the POS transaction tutorial files, under the current user's home.
data_folder = os.path.join(os.path.expanduser('~'), 'python', 'tutorial_files','transaction')
#print("{}".format(data_folder))

# Read the whole file inside a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(os.path.join(data_folder,'Excel-20170511104221.csv'), 'r') as transaction_file:
    raw_data = transaction_file.read()
#print('Raw data looks like this:')
#print(raw_data[:1138] + '...')

# Parse the CSV. This call is deliberately the last expression of the cell so
# that the notebook displays the resulting structured array.
np.genfromtxt(
    StringIO(raw_data),
    delimiter=",",  # fields are delimited by commas
    skip_header=1,  # line 1 contains the header
    usecols=(5, 6),  # extract (trans_time, trans_amount)
    autostrip=True,  # strip surrounding whitespace (e.g. TAB characters) from each field
    dtype=[('trans_time','S19'),('trans_amount','f8')])

array([('2017/05/10 18:00:01',   5.50000000e+02),
       ('2017/05/10 18:00:01',   3.10000000e+04),
       ('2017/05/10 18:00:01',   1.00000000e-02), ...,
       ('2017/05/10 17:30:00',   1.00000000e+04),
       ('2017/05/10 17:30:00',   0.00000000e+00),
       ('2017/05/10 17:30:00',   1.24800000e+03)],
      dtype=[('trans_time', 'S19'), ('trans_amount', '<f8')])

## Load Delimited CSV file (eyesight)


In [4]:
import os
from StringIO import StringIO

# Folder that holds the eyesight tutorial files, under the current user's home.
data_folder = os.path.join(os.path.expanduser('~'), 'python', 'tutorial_files','eyesight')
#print("{}".format(data_folder))

# Read the whole file inside a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(os.path.join(data_folder,'eyesight.csv'), 'r') as eyesight_file:
    raw_data = eyesight_file.read()
#print('Raw data looks like this:')
#print(raw_data[:1138] + '...')

# Parse the CSV into fixed-length byte-string columns. This call is deliberately
# the last expression of the cell so that the notebook displays the result.
np.genfromtxt(
    StringIO(raw_data),
    delimiter=",",  # fields are delimited by commas
    skip_header=1,  # line 1 contains the header
    dtype=[('Age','S15'),('Prescription','S13'),('Astigmatic','S4'),('Tear production rate','S8'),('Class','S12')])

array([('Young', 'Myope', 'No', 'Reduced', 'No lenses'),
       ('Young', 'Myope', 'No', 'Normal', 'Soft lenses'),
       ('Young', 'Myope', 'Yes', 'Reduced', 'No lenses'),
       ('Young', 'Myope', 'Yes', 'Normal', 'Hard lenses'),
       ('Young', 'Hypermetrope', 'No', 'Reduced', 'No lenses'),
       ('Young', 'Hypermetrope', 'No', 'Normal', 'Soft lenses'),
       ('Young', 'Hypermetrope', 'Yes', 'Reduced', 'No lenses'),
       ('Young', 'Hypermetrope', 'Yes', 'Normal', 'Hard lenses'),
       ('Pre-presbyopic', 'Myope', 'No', 'Reduced', 'No lenses'),
       ('Pre-presbyopic', 'Myope', 'No', 'Normal', 'Soft lenses'),
       ('Pre-presbyopic', 'Myope', 'Yes', 'Reduced', 'No lenses'),
       ('Pre-presbyopic', 'Myope', 'Yes', 'Normal', 'Hard lenses'),
       ('Pre-presbyopic', 'Hypermetrope', 'No', 'Reduced', 'No lenses'),
       ('Pre-presbyopic', 'Hypermetrope', 'No', 'Normal', 'Soft lenses'),
       ('Pre-presbyopic', 'Hypermetrope', 'Yes', 'Reduced', 'No lenses'),
       ('Pre-presby

## Load Data from Internet


In [5]:
import requests, requests.auth
remoteurl = 'https://use.davin.wang:8443/tree' #URL for testing
headers = {
    'User-Agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
}
data = {
    'param1':'value1'
}
# NOTE(review): placeholder credentials; real ones should come from the
# environment or a prompt, never from the notebook source.
auth = requests.auth.HTTPBasicAuth('user','pass')
# NOTE(review): verify=False disables TLS certificate verification; it is kept
# here only because the original did so for the test server. `data=` sends the
# dict as a request body; if query-string parameters were intended, `params=`
# would be the usual choice. TODO confirm.
# A timeout is supplied because requests waits indefinitely by default, which
# would hang the notebook if the server never answers.
r = requests.get(remoteurl, data=data, headers=headers, auth=auth, verify=False, timeout=30)

if r.status_code == 200 :
    # Print the result (only the first 200 bytes of the body)
    print(r.content[:200]+'...')
    # Print the various details of the crawled page

<!DOCTYPE HTML>
<html>

<head>
    <meta charset="utf-8">

    <title>Jupyter Notebook</title>
    <link rel="shortcut icon" type="image/x-icon" href="/static/base/images/favicon.ico?v=97c6417ed01bdc0...




## Invoke RESTful API

In [6]:
import requests, requests.auth, json
remoteurl = 'https://use.davin.wang:8443/tree' #URL for testing
headers = {
    'Content-Type':'application/json'
}
payload = {
    'param1':'value1'
}
# NOTE(review): placeholder credentials; real ones should come from the
# environment or a prompt, never from the notebook source.
auth = requests.auth.HTTPBasicAuth('user','pass')
# NOTE(review): verify=False disables TLS certificate verification; it is kept
# here only because the original did so for the test server. The JSON payload
# is sent as the body of a GET request; a real REST endpoint may expect POST.
# TODO confirm.
# A timeout is supplied because requests waits indefinitely by default, which
# would hang the notebook if the server never answers.
r = requests.get(remoteurl, data=json.dumps(payload), headers=headers, auth=auth, verify=False, timeout=30)

if r.status_code == 200 :
    # Print the result (only the first 200 bytes of the body)
    print(r.content[:200]+'...')
    # Print the various details of the crawled page

<!DOCTYPE HTML>
<html>

<head>
    <meta charset="utf-8">

    <title>Jupyter Notebook</title>
    <link rel="shortcut icon" type="image/x-icon" href="/static/base/images/favicon.ico?v=97c6417ed01bdc0...




# Create DataSet

In [7]:
# First let's import a few things needed.
%matplotlib inline
import urllib2 # for downloading the dataset from the web.
import numpy as np
from matplotlib import pyplot
from StringIO import StringIO
from caffe2.python import core, utils, workspace
from caffe2.proto import caffe2_pb2

## Load Data
Load the data and separate it into `features` and `labels`. `features` is an array of `[longitude, latitude]` pairs and `labels` is an array of the corresponding `elevate` values.

In [8]:
import os, datetime
from StringIO import StringIO

# Folder that holds the weather tutorial files, under the current user's home.
data_folder = os.path.join(os.path.expanduser('~'), 'python', 'tutorial_files','weather')
#print("{}".format(data_folder))

# Read the whole file inside a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(os.path.join(data_folder,'Station.txt'), 'r') as station_file:
    raw_data = station_file.read()
#print('Raw data looks like this:')
#print(raw_data[966:1300] + '...')

# Parse the fixed-width records into a masked structured array.
stations = np.genfromtxt(
    StringIO(raw_data),
    delimiter=(7,6,30,3,3,3,6,7,8,10,9,9),  # width of each field
    skip_header=22,  # skip 22 lines of header
    usecols=(8, 7, 9),  # reorder columns from (latitude, longitude, elevate) to (longitude, latitude, elevate)
    missing_values="-99999,-999999",  # sentinel values treated as missing (masked in the result)
    dtype=[('longitude','f8'),('latitude','f8'),('elevate','f8')],
    usemask=True)  # return a masked array so missing values show up as masked

# Keep only the stations whose three fields are all present. The mask is tested
# explicitly: a plain truthiness test would also discard valid stations whose
# longitude, latitude or elevation happens to be exactly 0.
data_set = [
    stations[i] for i in range(len(stations))
    if not (np.ma.is_masked(stations[i][0])
            or np.ma.is_masked(stations[i][1])
            or np.ma.is_masked(stations[i][2]))
]

# Scale the raw values down before training.
# NOTE(review): presumably coordinates are stored in thousandths of a degree
# (hence /1000 then /180 and /90) and elevation in tenths of a unit; confirm
# against the header of Station.txt.
features = np.array([ [t[0]/1000/180, t[1]/1000/90] for t in data_set ], dtype=np.float32)
labels = np.array([ t[2]/10/10000 for t in data_set ], dtype=np.float32)

## Prepare Train Set and Test Set
First, we shuffle the data set. Then we split it into a train set and a test set.

In [9]:
# Draw one random ordering of the records and apply it to both arrays, so each
# feature row stays aligned with its label.
random_index = np.random.permutation(len(data_set))
features, labels = features[random_index], labels[random_index]

# The first 20000 shuffled records form the train set, the next 1000 the test set.
train_features, test_features = features[:20000], features[20000:21000]
train_labels, test_labels = labels[:20000], labels[20000:21000]

## Convert to DB form

In [10]:
# Demonstrate how a TensorProtos protocol buffer is assembled from numpy arrays:
# convert the first feature row and the first label into tensor protos and
# append them, in that order, to a fresh TensorProtos message.
feature_and_label = caffe2_pb2.TensorProtos()
first_feature_tensor = utils.NumpyArrayToCaffe2Tensor(features[0])
first_label_tensor = utils.NumpyArrayToCaffe2Tensor(labels[0])
feature_and_label.protos.extend([first_feature_tensor, first_label_tensor])

# Show the human-readable form first, then the serialized bytes.
print('This is what the tensor proto looks like for a feature and its label:')
print(str(feature_and_label))
print('This is the compact string that gets written into the db:')
print(feature_and_label.SerializeToString())

This is what the tensor proto looks like for a feature and its label:
protos {
  dims: 2
  data_type: FLOAT
  float_data: 0.650650024414
  float_data: 0.322033345699
}
protos {
  data_type: FLOAT
  float_data: 0.00490000005811
}

This is the compact string that gets written into the db:

 �&?��>
.��;


## Write Back

In [11]:
# Now, actually write the db.

def write_db(db_type, db_name, features, labels):
    """Write (feature, label) pairs into a Caffe2 database.

    Each record is a serialized TensorProtos message holding two tensors: the
    feature at position i followed by the label at position i. Records are
    stored under the keys 'train_00000', 'train_00001', ... regardless of the
    database name.

    Args:
        db_type: database type string passed to core.C.create_db (e.g. "minidb").
        db_name: database path/name passed to core.C.create_db.
        features: indexable array; features.shape[0] sets the number of records.
        labels: indexable collection with at least as many entries as features.
    """
    db = core.C.create_db(db_type, db_name, core.C.Mode.write)
    transaction = db.new_transaction()
    record_count = features.shape[0]
    for index in range(record_count):
        feature_tensor = utils.NumpyArrayToCaffe2Tensor(features[index])
        label_tensor = utils.NumpyArrayToCaffe2Tensor(labels[index])
        record = caffe2_pb2.TensorProtos()
        record.protos.extend([feature_tensor, label_tensor])
        key = 'train_{:0=5}'.format(index)
        transaction.put(key, record.SerializeToString())
    # Dropping the references closes the transaction first, and then the db.
    del transaction
    del db

# Output location for the generated databases, under the current user's home.
# Note that this rebinds `data_folder`, which earlier cells pointed at the
# input files.
current_folder = os.path.join(os.path.expanduser('~'), 'python')
data_folder = os.path.join(current_folder, 'tutorial_data', 'weather_station')

# Write the train split first, then the test split, each to its own minidb.
train_db_path = os.path.join(data_folder, 'weather-station-train-minidb')
write_db("minidb", train_db_path, train_features, train_labels)
test_db_path = os.path.join(data_folder, 'weather-station-test-minidb')
write_db("minidb", test_db_path, test_features, test_labels)

# Read from DB

In [12]:
# Build a net that opens the train minidb and reads from it in batches of 100,
# producing the blobs "X" and "Y".
net_proto = core.Net("example_reader")
train_db_path = os.path.join(data_folder, 'weather-station-train-minidb')
dbreader = net_proto.CreateDB([], "dbreader", db=train_db_path, db_type="minidb")
# Kept as a bare expression so the notebook displays the returned blob references.
net_proto.TensorProtosDBInput([dbreader], ["X", "Y"], batch_size=100)

#print("The net looks like this:")
#print(str(net_proto.Proto()))

(BlobReference("X"), BlobReference("Y"))

In [13]:
# Create the reader net in the workspace from net_proto; left as a bare
# expression so the notebook displays the call's return value.
workspace.CreateNet(net_proto)

True

In [14]:
# Run the reader net once by name, then show the first entry of the "X" and
# "Y" blobs it produced.
reader_net_name = net_proto.Proto().name
workspace.RunNet(reader_net_name)
print("The first batch of feature is:")
feature_batch = workspace.FetchBlob("X")
print(feature_batch[0])
print("The first batch of label is:")
label_batch = workspace.FetchBlob("Y")
print(label_batch[0])

The first batch of feature is:
[ 0.31248334  0.28462222]
The first batch of label is:
0.001
