# Initialize Test, Train and Cross Validation

We need our list of images and their corresponding metadata in a format that Caffe understands. To do this, we split our set of images into test, train and cross validation sets.

## References
* [Test, Train and Cross Validation](https://class.coursera.org/ml-005/lecture/61)

In [65]:
import ansible.runner
import ansible.inventory

import random
import csv

In [66]:
def generate_from_webapp_training(webapp_training_output_file="./categorization/output.tsv"):
    """
    Generate training information from an output file which was created using the `categorization` webapp.

    Every distinct category is assigned an integer index in order of first appearance, starting at 0.
    Every row becomes a `(filename, category_index)` pair, where the filename is the last
    "/"-separated segment of the row's URL.

    Parameters
    ----------
    webapp_training_output_file : str
        Filename of the TSV generated using the webapp. The first line is read as the header and
        must name at least the `url` and `category` columns (the webapp writes
        `zillow_id`, `url`, `category`).

    Returns
    -------
    tuple : (dict, list)
        A tuple which includes a dictionary of categories with their corresponding index and a list of all the
        training entries as `(filename, category_index)` tuples, in random order.
    """
    unique_tags = {}
    results = []

    with open(webapp_training_output_file, "r") as training_file:
        for row in csv.DictReader(training_file, delimiter="\t"):
            # Only the final path segment of the URL is kept as the (relative) filename.
            filename = row["url"].split("/")[-1]
            category = row["category"]

            # Caffe indexes syn_words based on 0 (the webapp starts at 1), so the next
            # free index is simply the number of categories seen so far.
            if category not in unique_tags:
                unique_tags[category] = len(unique_tags)

            results.append((filename, unique_tags[category]))

    # Randomize our list of train results (the original author notes that Caffe will
    # randomize it again). Shuffling in place keeps the result a real list, so callers
    # can use len() and slicing on both Python 2 and Python 3.
    random.shuffle(results)

    return (unique_tags, results)

In [67]:
def write_training_files(unique_tags, train, computed_dir="./computed"):
    """
    Write the files Caffe expects {syn_words.txt, train.txt, val.txt} which are all space delimited files
    with no headers.

    The first 20% of `train` (rounded) is written to val.txt and the remainder to train.txt.
    syn_words.txt gets one "<index> <tag>" line per tag, ordered by index.

    Parameters
    ----------
    unique_tags : dict
        Each unique tag and their integer index.
    train : list(tuple(str,int,))
        List of all training files which are stored as a filename and the index of the category
        related to that file. Any iterable of such tuples is accepted.
    computed_dir : str
        We will store all this information in a directory called "computed" so that we keep it
        separate from the actual models being generated and the original information.
        The directory is created if it does not exist yet.

    Returns
    -------
    tuple : (str,str,str,)
        The filenames which were created: (syn_words_filename, train_filename, val_filename,)
    """
    # Imported locally because the notebook's import cell does not bring in `os`.
    import os

    syn_words_filename = "{computed_dir}/syn_words.txt".format(computed_dir=computed_dir)
    train_filename = "{computed_dir}/train.txt".format(computed_dir=computed_dir)
    val_filename = "{computed_dir}/val.txt".format(computed_dir=computed_dir)

    # Avoid failing on a fresh checkout where the output directory is missing.
    if computed_dir and not os.path.isdir(computed_dir):
        os.makedirs(computed_dir)

    # Materialize once so len() and slicing work even when handed an iterator
    # (e.g. the result of map() on Python 3).
    train = list(train)
    split = int(round(len(train) * 0.2))

    # Each file is opened once in "w" mode (truncating old content) instead of being
    # reopened in append mode for every line.
    with open(syn_words_filename, "w") as syn_words_file:
        # .items() works on Python 2 and 3; sorting by index makes the output deterministic.
        for tag, index in sorted(unique_tags.items(), key=lambda item: item[1]):
            syn_words_file.write("%s %s\n" % (index, tag))

    with open(train_filename, "w") as train_file:
        for line in train[split:]:
            train_file.write("%s %s\n" % line)

    with open(val_filename, "w") as val_file:
        for line in train[:split]:
            val_file.write("%s %s\n" % line)

    return (syn_words_filename, train_filename, val_filename,)

In [68]:
# Sync the computed directory to the caffe instance which will be doing this run.
def sync_computed(instance_ip_address, computed_dir):
    """
    Push a local directory to the instance that is about to run Caffe.

    The transfer is done with Ansible's `synchronize` module, connecting as the
    `ubuntu` user and copying `computed_dir` into `./` on the remote side.

    Parameters
    ----------
    instance_ip_address : str
        The IP Address of the instance where Caffe will be run.
    computed_dir : str
        Local directory to upload, e.g. the directory holding the generated unique tags,
        training and cross validation files.

    Returns
    -------
    dict
        The entry for `instance_ip_address` from the "contacted" section of the
        ansible runner output.

    Raises
    ------
    Exception
        If the runner output has no (or an empty) "contacted" entry for the instance.
    """
    sync_args = "src={computed_dir} dest=./".format(computed_dir=computed_dir)

    # The inventory consists of just the one target host.
    target_inventory = ansible.inventory.Inventory([instance_ip_address])

    runner = ansible.runner.Runner(
        module_name="synchronize",
        module_args=sync_args,
        timeout=5,
        inventory=target_inventory,
        remote_user="ubuntu"
    )
    response = runner.run()

    contacted = response["contacted"]
    if contacted.get(instance_ip_address, None):
        return contacted[instance_ip_address]

    raise Exception("No response information from instance :/")

In [69]:
# Get the tags and training data then write them to local files.
# `unique_tags` maps each category name to its integer index and `train` holds the
# randomized (filename, category_index) pairs.
unique_tags, train = generate_from_webapp_training(webapp_training_output_file="./categorization/output.tsv")

# Writes syn_words.txt, train.txt and val.txt into the computed directory.
computed_data_dir = "./computed"
write_training_files(unique_tags, train, computed_dir=computed_data_dir)

# Upload the generated files, the images and the prototxt directory to the Caffe instance.
# NOTE(review): the instance IP address is hard-coded -- confirm it matches the instance
# being used for this run.
instance_ip = "174.129.71.20"
sync_computed(instance_ip, computed_data_dir)
sync_computed(instance_ip, "./images")
# Being the last expression of the cell, this call's result is what the notebook displays.
sync_computed(instance_ip, "./prototxt")

{u'changed': False,
 u'cmd': u'rsync --delay-updates -F --compress --archive --rsh \'ssh  -S none -o StrictHostKeyChecking=no\' --out-format=\'<<CHANGED>>%i %n%L\' "./prototxt" "ubuntu@174.129.71.20:./"',
 'invocation': {'module_args': u'src=./prototxt dest=./',
  'module_complex_args': {},
  'module_name': 'synchronize'},
 u'msg': u'',
 u'rc': 0,
 u'stdout_lines': []}

In [70]:
# Display the category name -> integer index mapping built above.
unique_tags

{'Exterior': 0, 'Garden': 2, 'Interior': 1}

In [71]:
# Peek at the first five (filename, category_index) training entries.
train[0:5]

[('ISxzdiqn5yox4i1000000000.jpg', 1),
 ('ISz0m8d8ev9jmr.jpg', 0),
 ('ISxz9nzak77pyh1000000000.jpg', 0),
 ('ISdsuaf1rml8zh1000000000.jpg', 1),
 ('IS5msywumedbzh1000000000.jpg', 1)]