<a href="https://colab.research.google.com/github/elisim/DeepTIME-Datahack2019/blob/master/Eli_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Copyright 2018 The TensorFlow Authors.

In [0]:
from google.colab import drive
drive.mount('/content/drive')

# Orcam code

In [0]:
import os
import sys
import tarfile
import numpy as np

FSENCODING = sys.getfilesystemencoding()


def enumerate_paths(paths):
    """Assign an integer person id and video id to every frame path.

    The directory three levels above a frame identifies the person and the
    directory two levels above it identifies the video
    (person/video/sequence/frame).

    Args:
        paths: iterable of frame path strings.

    Returns:
        A ``(person_ids, video_ids)`` tuple of numpy integer arrays, one
        entry per path. Ids are positions in the sorted list of unique
        person / video directories.
    """
    parent = os.path.dirname
    # frame -> sequence -> video
    video_dirs = [parent(parent(frame)) for frame in paths]
    # video -> person
    person_dirs = [parent(video) for video in video_dirs]

    # Only the inverse mapping is needed; the unique values are discarded
    _, video_ids = np.unique(video_dirs, return_inverse=True)
    _, person_ids = np.unique(person_dirs, return_inverse=True)
    return person_ids, video_ids


def split_by(data, indices):
    """Split ``data`` into runs that share the same value in ``indices``.

    Args:
        data: array to split along its first axis.
        indices: numpy array of sorted ids, aligned with ``data``.

    Returns:
        List of sub-arrays of ``data``; a new sub-array starts wherever
        two consecutive entries of ``indices`` differ.
    """
    # Positions where the id changes mark the start of the next section
    boundaries = np.nonzero(np.diff(indices))[0] + 1
    return np.split(data, boundaries)


def parse_tarinfo(buff):
    """Parse a raw tar header into a ``tarfile.TarInfo``.

    ``TarInfo.frombuf`` takes only the buffer on Python 2 but also requires
    an encoding and an error handler on Python 3. The required call form is
    probed once on the first call and cached on the function object as
    ``parse_tarinfo.defaultargs``.

    Args:
        buff: bytes of a tar header block.

    Returns:
        The ``tarfile.TarInfo`` parsed from ``buff``.
    """
    if not hasattr(parse_tarinfo, 'defaultargs'):
        # First call: try the one-argument form on a dummy header
        probe = tarfile.TarInfo().tobuf()
        try:
            tarfile.TarInfo.frombuf(probe)
        except TypeError:
            needs_encoding = True
        else:
            needs_encoding = False
        parse_tarinfo.defaultargs = needs_encoding

    if not parse_tarinfo.defaultargs:
        # Python 2
        return tarfile.TarInfo.frombuf(buff)
    # Python 3
    return tarfile.TarInfo.frombuf(buff, FSENCODING, 'surrogateescape')
      
      
from __future__ import print_function

import os
import cv2
import tarfile
import numpy as np
import pickle as pkl


class Images(object):
    """Easy and fast random-access reader for images packed in a tar file.

    The tar file is scanned once and the location of every regular file in
    it is stored in a pickle index file. Later reads seek directly to the
    requested member in the raw tar file and decode it with OpenCV.

    Instances can be used as context managers; leaving the ``with`` block
    (or calling ``close()``) closes the underlying file handle.
    """

    def __init__(self, path, index_path=None):
        """Open the tar file and load, or build and save, its index.

        Args:
            path: path of the tar file.
            index_path: path of the pickle index file. When ``None`` it is
                ``path`` with its last three characters replaced by
                ``'pkl'``.
        """
        self.path = path
        if index_path is None:
            # index file is the same as tar path but .pkl
            index_path = path[:-3] + 'pkl'
        if not os.path.exists(index_path):
            print('Indexing tar file, this could take a few minutes...')
            self._tar_index = self._index_tar(path)
            print('done')
            # Save index file
            with open(index_path, 'wb') as fid:
                pkl.dump(self._tar_index, fid)
        else:
            # NOTE: unpickling can execute arbitrary code; only use index
            # files that you created yourself or otherwise trust.
            with open(index_path, 'rb') as fid:
                self._tar_index = pkl.load(fid)
        self.index_path = index_path
        # Open the tar file
        self.fid = open(path, 'rb')
        # Record its size (seek to the end and read the position)
        self.fid.seek(0, 2)
        self.tar_size = self.fid.tell()
        # save a sorted list of the tar file paths (keys)
        self.keys = sorted(self._tar_index.keys())

    @staticmethod
    def _index_tar(path):
        """Build a dict mapping member name to (offset, data offset, size).

        Only regular files are indexed.
        """
        tar_index = {}
        with tarfile.TarFile(path, "r") as tar:
            for tarinfo in tar:
                if tarinfo.isfile():
                    offsets_and_size = (
                        tarinfo.offset, tarinfo.offset_data, tarinfo.size)
                    tar_index[tarinfo.name] = offsets_and_size
        return tar_index

    @staticmethod
    def _decode_image(buff):
        """Decode an encoded image held in memory into a numpy array."""
        buff_array = np.asarray(bytearray(buff), dtype='uint8')
        image = cv2.imdecode(buff_array, cv2.IMREAD_UNCHANGED)
        return image

    def __len__(self):
        """Return the number of indexed files."""
        return len(self._tar_index)

    @property
    def paths(self):
        """Sorted list of the indexed member paths."""
        return self.keys

    def _resolve_key(self, item):
        """Return the member path for ``item``.

        Integer items, including NumPy integer scalars such as the entries
        of an index array, are positions in the sorted path list. Anything
        else is returned unchanged and treated as a path.
        """
        if isinstance(item, (int, np.integer)):
            return self.keys[item]
        return item

    def _getitem(self, item):
        """Read and decode one image; raises tarfile errors on a bad index."""
        # If item is an index, replace with the path at that index
        item = self._resolve_key(item)
        # Grab an image buffer based on its path and decode it
        offset, data_offset, size = self._tar_index[item]
        # Go to start of record
        self.fid.seek(offset)
        # Check indexing validity: the header found at the stored offset
        # must describe the requested member
        header_size = data_offset - offset  # should always be 512
        tarinfo = parse_tarinfo(self.fid.read(header_size))
        if tarinfo.path != item:
            raise tarfile.InvalidHeaderError
        buff = self.fid.read(size)
        # Reverse the order of the last (channel) axis of the decoded image
        image = self._decode_image(buff)[:, :, ::-1]
        return image

    def __getitem__(self, item):
        """Return the decoded image for an integer index or a member path.

        Raises:
            IOError: if the header stored in the tar file does not match
                the index, i.e. the index file belongs to another tar file.
        """
        try:
            image = self._getitem(item)
        except (tarfile.InvalidHeaderError, tarfile.TruncatedHeaderError, tarfile.EmptyHeaderError):
            error_str = 'Index file "{}" does not match tarfile "{}". Remove the index file and try again.'
            raise IOError(error_str.format(self.index_path, self.path))

        return image

    def close(self):
        """Close the underlying tar file handle."""
        self.fid.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()


def compatible_load(path):
    """Load a pickle file, coping with pickles written by Python 2.

    Args:
        path: path of the pickle file.

    Returns:
        The unpickled object.
    """
    with open(path, 'rb') as handle:
        try:
            return pkl.load(handle)
        except UnicodeDecodeError:
            # Python 3 compatibility: rewind and retry, decoding the
            # byte strings as latin1
            handle.seek(0)
            return pkl.load(handle, encoding='latin1')


def read_pose(pose_path):
    """Read the pose points stored in a pickle file.

    Args:
        pose_path: path of a pickle file holding a mapping with the keys
            'keypoints', 'scores' and 'paths'.

    Returns:
        A ``(paths, keypoints, scores)`` tuple.
    """
    pose = compatible_load(pose_path)
    keypoints, scores, paths = (
        pose[field] for field in ('keypoints', 'scores', 'paths'))
    return paths, keypoints, scores


def read_signatures(sigs_path):
    """Read the imagenet signatures stored in a pickle file.

    Args:
        sigs_path: path of a pickle file holding a mapping with the keys
            'signatures' and 'paths'.

    Returns:
        A ``(paths, signatures)`` tuple.
    """
    stored = compatible_load(sigs_path)
    signatures, paths = (stored[field] for field in ('signatures', 'paths'))
    return paths, signatures

In [0]:
paths , sigs = read_signatures('./drive/My Drive/DataHack-Storage/signatures.pkl')

In [0]:
person_ids, video_ids = enumerate_paths(paths)

In [0]:
unique_person_ids = np.unique(person_ids)
unique_video_ids = np.unique(video_ids)

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
zipped = zip(person_ids , video_ids , sigs)

In [0]:
zipped= [x for x in zipped]

In [0]:
# Hold out 10% of the (person_id, video_id, signature) triples for testing,
# with a fixed seed for reproducibility.
# NOTE(review): the split is per frame, so frames of the same video can end up
# in both sets and inflate the test accuracy - confirm this is intended, or
# split by video_id instead.
zipped_train, zipped_test = train_test_split(zipped, test_size=0.1, random_state=42)

In [0]:
# Third element of every (person_id, video_id, signature) triple
sigs_train = np.array([signature for _, _, signature in zipped_train])
sigs_test = np.array([signature for _, _, signature in zipped_test])

In [0]:
# First element of every (person_id, video_id, signature) triple
id_train = np.array([person_id for person_id, _, _ in zipped_train])
id_test = np.array([person_id for person_id, _, _ in zipped_test])

In [0]:
print(sigs_test.shape)
print(sigs_train.shape)
print(id_train.shape)
print(id_test.shape)

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [0]:
# Take the regularizers from tf.keras: every layer below comes from
# `tensorflow.keras`, and objects from the standalone `keras` package should
# not be mixed into a tf.keras model (it can fail depending on the installed
# versions of the two packages).
from tensorflow.keras import regularizers

# Classifier over 2048-dimensional signatures: two 256-unit ReLU layers
# (dropout after the first, L2 weight penalty on the second) followed by a
# 101-way softmax output.
model = keras.Sequential([
    keras.layers.Dense(256, activation=tf.nn.relu, input_shape=(2048,)),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(256, activation=tf.nn.relu,
                       kernel_regularizer=regularizers.l2(0.05)),
    keras.layers.Dense(101, activation=tf.nn.softmax),
])

In [0]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [0]:
from keras.models import load_model

In [0]:
model.fit(sigs_train, id_train, epochs=10)
# model.save('/content/drive/My Drive/ish_hayam.model')
# model = load_model('/content/drive/My Drive/ish_hayam.model')

In [0]:
test_loss, test_acc = model.evaluate(sigs_test, id_test)

print('Test accuracy:', test_acc)

It turns out that the accuracy on the test dataset is a little lower than the accuracy on the training dataset. This gap between training accuracy and test accuracy is an example of *overfitting*: a machine learning model overfits when it performs worse on new data than on its training data.

## Make predictions

With the model trained, we can use it to make predictions about some images.

In [0]:
predictions = model.predict(sigs_test)

Here, the model has predicted the label for each image in the testing set. Let's take a look at the first prediction:

Each prediction is a vector of 101 class probabilities, one per person id. The model's guess is the index with the highest probability, and it can be checked against the corresponding entry of `id_test`.

In [0]:
paths_eva , sigs_eva = read_signatures('./drive/My Drive/DataHack-Storage/sig-test-new.pkl')

In [0]:
def enumerate_paths_eva(paths):
    """Return the sequence (parent directory) of every frame path.

    Args:
        paths: iterable of frame path strings.

    Returns:
        List with one directory name per path, in the same order.
    """
    return list(map(os.path.dirname, paths))

In [0]:
seqs_eva = enumerate_paths_eva(paths_eva)

In [0]:
len(list(set(seqs_eva)))

In [0]:
zipped_eva = [x for x in zip(seqs_eva , sigs_eva)]

In [0]:
zipped_eva

In [0]:
evaluations = model.predict(sigs_eva)

In [0]:
evaluations.shape

In [0]:
from collections import defaultdict

# Sum the per-frame class scores of all frames that belong to the same
# sequence; the first frame of a sequence initialises its entry.
res = {}
for seq_name, ev in zip(seqs_eva, evaluations):
    res[seq_name] = np.add(ev, res[seq_name]) if seq_name in res else ev

In [0]:
len(res)

In [0]:
# For every sequence keep the five class ids with the largest summed score,
# best first, as plain Python ints.
final_res = {
    seq_name: [int(class_id) for class_id in np.flip(np.argsort(scores))[:5]]
    for seq_name, scores in res.items()
}

In [0]:
# Top-5 lists in the iteration order of final_res
submissions = list(final_res.values())

In [0]:
try:
    from urllib.request import urlopen
    from urllib.request import Request
except ImportError:
    from urllib2 import urlopen
    from urllib2 import Request

import json
import numpy as np


def submit(name, submission):
    """Submit predictions to the leaderboard and print the server's reply.

    Args:
        name: submitter name sent in the 'submitter' field.
        submission: JSON-serializable predictions sent in the
            'predictions' field.
    """
    payload = json.dumps({'submitter': name, 'predictions': submission})
    req = Request('https://leaderboard.datahack.org.il/orcam/api',
                  headers={'Content-Type': 'application/json'},
                  data=payload.encode('utf-8'))
    resp = urlopen(req)
    try:
        # Decode explicitly so parsing also works where json cannot read
        # bytes directly
        print(json.loads(resp.read().decode('utf-8')))
    finally:
        # Always release the connection (urllib2 responses on Python 2 are
        # not context managers, hence try/finally)
        resp.close()


In [0]:
submit('DeepTIME' , submissions)