Add relevant import statements.

In [31]:
from snorkel.learning import GenerativeModel
from scipy import sparse
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import shutil
from shutil import copyfile
import operator
import random

Package data-handling code into a class.

In [32]:
class DataLoader(object):
    """Index a class-per-directory dataset of images and their companion text files.

    The layout read by this class is::

        <data_path>/<class_name>/images/<name>.jpg
        <data_path>/<class_name>/text/<name>.txt

    The name of each first-level sub-directory is used as the ground-truth
    class of the images stored below it.

    Attributes:
        files: Sorted list of full image paths.
        truths: Mapping from an image's base filename to its class directory name.
        data_path: Root directory that was handed to the constructor.
        train_num: Number of indexed images.
    """

    def __init__(self, data_path):
        """Scan ``data_path`` and index every ``jpg`` file found under ``*/images/``.

        Args:
            data_path: Root directory of the dataset, with or without a
                trailing path separator.

        Raises:
            ValueError: If the same image base filename appears in two
                different class directories.
        """
        sub_dirs = [name for name in os.listdir(data_path)
                    if os.path.isdir(os.path.join(data_path, name))]
        self.files = []   # Array of filenames
        self.truths = {}  # Dictionary holding a mapping from a filename to its class
        self.data_path = data_path

        for sub_dir in sub_dirs:
            # os.path.join keeps the paths valid whether or not data_path ends
            # with a separator
            sub_dir_path = os.path.join(data_path, sub_dir, "images")

            # Only image files are indexed; other directory entries are ignored
            # so that they cannot trigger the duplicate check below
            image_names = [f for f in os.listdir(sub_dir_path) if f.endswith('jpg')]

            for image_name in image_names:
                if image_name in self.truths:
                    raise ValueError("File appearing twice in different directories")
                self.truths[image_name] = sub_dir
                self.files.append(os.path.join(sub_dir_path, image_name))

        self.files.sort()
        self.train_num = len(self.files)

    # Retrieve contents of a text file, given its index into the 'files' array
    def get_file_contents(self, i):
        """Return the text that accompanies image ``i``.

        Args:
            i: Index into ``self.files``.

        Returns:
            The full contents of the text file, or an empty string (after
            printing a notice) when the text file does not exist.

        Raises:
            ValueError: If ``i`` is not a valid index.
        """
        f_name = self.get_text_filename(i)

        if not os.path.isfile(f_name):
            print("File not found: ", f_name)
            return ""

        # The context manager closes the handle even if reading fails
        with open(f_name) as text_file:
            return text_file.read()

    # Retrieve the full filename of an image file, given its index into the 'files' array
    def get_img_filename(self, i):
        """Return the full path of image ``i``.

        Raises:
            ValueError: If ``i`` is negative or past the end of ``self.files``.
        """
        # Negative values are rejected too: find_fileindex reports "not found"
        # as -1, which would otherwise silently select the last file
        if i < 0 or i >= len(self.files):
            raise ValueError('Illegal file index')

        return self.files[i]

    # Retrieve the full filename of a text file, given its index into the 'files' array
    def get_text_filename(self, i):
        """Return the path of the text file paired with image ``i``.

        The path is ``<data_path>/<class_dir>/text/<image stem>.txt``; the file
        is not checked for existence.

        Raises:
            ValueError: If ``i`` is not a valid index.
        """
        img_filename = self.get_img_filename(i)
        # <class_dir> is two levels above the image: <class_dir>/images/<file>
        parent_dir = os.path.basename(os.path.dirname(os.path.dirname(img_filename)))
        # Swap the three-character extension for 'txt'
        text_name = os.path.basename(img_filename)[:-3] + 'txt'
        return os.path.join(self.data_path, parent_dir, "text", text_name)

    # Retrieve the index of a file, given its full name
    def find_fileindex(self, f_name):
        """Return the index of the image whose base filename equals ``f_name``, or -1."""
        for i, file in enumerate(self.files):
            if os.path.basename(file) == f_name:
                return i
        return -1

    # Retrieve the hand-label of a file, given its index into the 'files' array
    def get_truth(self, i):
        """Return the class directory name recorded for image ``i``.

        Raises:
            ValueError: If ``i`` is not a valid index.
        """
        f_name = self.get_img_filename(i)
        return self.truths[os.path.basename(f_name)]


Load training data into the data loader.

In [33]:
# Placeholder path: point this at the dataset root, which DataLoader scans
# for one sub-directory per class
loader = DataLoader('path-to-parsed-data')

print("Num files loaded: ", loader.train_num)

Num files loaded:  1795


Create keyword-based functions for text labelling.

In [34]:
def mentions_fire(text_content):
    """Keyword primitive for the fire class.

    Matching is a case-sensitive substring test, so a keyword also matches
    when it occurs inside a longer word.

    Args:
        text_content: Text to search.

    Returns:
        1 if any fire-related keyword occurs in ``text_content``, otherwise 0.
    """
    fire_terms = ('fire', 'flame', 'smoke', 'burn', 'burning', 'ash', 'forest', 'gas', 'explosion')
    hit = any(term in text_content for term in fire_terms)
    return 1 if hit else 0

def mentions_damaged_flood(text_content):
    """Keyword primitive for the flood class.

    Matching is a case-sensitive substring test, so a keyword also matches
    when it occurs inside a longer word.

    Args:
        text_content: Text to search.

    Returns:
        2 if any flood-related keyword occurs in ``text_content``, otherwise 0.
    """
    flood_terms = ('flood', 'water', 'rain', 'disaster', 'damage')
    hit = any(term in text_content for term in flood_terms)
    return 2 if hit else 0

def mentions_human_damage(text_content):
    """Keyword primitive for the human-damage class.

    Matching is a case-sensitive substring test, so a keyword also matches
    when it occurs inside a longer word.

    Args:
        text_content: Text to search.

    Returns:
        3 if any human-damage keyword occurs in ``text_content``, otherwise 0.
    """
    human_terms = (
        'death', 'dead', 'victim', 'loss', 'kill', 'injured',
        'injury', 'war', 'refugee', 'crime', 'attack', 'crisis', 'explosion',
    )
    hit = any(term in text_content for term in human_terms)
    return 3 if hit else 0

def mentions_damaged_infrastructure(text_content):
    """Keyword primitive for the damaged-infrastructure class.

    Matching is a case-sensitive substring test, so a keyword also matches
    when it occurs inside a longer word.

    Args:
        text_content: Text to search.

    Returns:
        4 if any infrastructure-damage keyword occurs in ``text_content``,
        otherwise 0.
    """
    infrastructure_terms = (
        'wreck', 'collapse', 'broken', 'bridge', 'building',
        'storm', 'disaster', 'damage', 'destroy', 'destruction',
    )
    hit = any(term in text_content for term in infrastructure_terms)
    return 4 if hit else 0

def mentions_damaged_nature(text_content):
    """Keyword primitive for the damaged-nature class.

    Matching is a case-sensitive substring test, so a keyword also matches
    when it occurs inside a longer word.

    Args:
        text_content: Text to search.

    Returns:
        5 if any nature-related keyword occurs in ``text_content``, otherwise 0.
    """
    nature_terms = (
        'nature', 'animals', 'wildlife', 'habitat', 'storm', 'destruction',
        'environment', 'tree', 'drought', 'landslide', 'forest', 'tornado',
    )
    hit = any(term in text_content for term in nature_terms)
    return 5 if hit else 0

def non_damage(text_content):
    """Keyword primitive for the non-damage class.

    This is the inverse of the other primitives: it votes only when none of
    the damage-related words is present. Matching is a case-sensitive
    substring test, so a word also matches inside a longer word.

    Args:
        text_content: Text to search.

    Returns:
        0 if any damage-related word occurs in ``text_content``, otherwise 6.
    """
    damage_terms = (
        'burn', 'wreck', 'collapse', 'disaster', 'damage', 'destroy', 'destruction',
        'death', 'dead', 'victim', 'kill', 'injured',
        'injury', 'war', 'refugee', 'attack', 'crisis',
        'storm', 'blizzard', 'drought', 'tornado', 'fire', 'landslide', 'earthquake', 'volcano',
        'eruption', 'flood', 'tsunami',
    )
    mentions_damage = any(term in text_content for term in damage_terms)
    return 0 if mentions_damage else 6
    

Create a primitive matrix using these functions and files (as described in the Snorkel tutorials).

In [35]:
def create_primitives(loader):
    """Evaluate the six keyword primitives on every file's text.

    Args:
        loader: Object exposing ``train_num`` and ``get_file_contents(i)``.

    Returns:
        A ``(loader.train_num, 6)`` float array. Column order: fire, flood,
        human damage, damaged infrastructure, damaged nature, non-damage.
        Each cell holds the value returned by the corresponding primitive.
    """
    primitive_fns = [mentions_fire, mentions_damaged_flood, mentions_human_damage,
                     mentions_damaged_infrastructure, mentions_damaged_nature, non_damage]
    primitive_mtx = np.zeros((loader.train_num, len(primitive_fns)))

    for i in range(loader.train_num):
        # Read the text once per file rather than once per primitive
        text_content = loader.get_file_contents(i)
        for col, primitive_fn in enumerate(primitive_fns):
            primitive_mtx[i, col] = primitive_fn(text_content)

    return primitive_mtx

In [36]:
primitive_mtx = create_primitives(loader)

# Expose each column of the primitive matrix under a descriptive key;
# the position of a name in the tuple is its column index
p_keys = {
    name: primitive_mtx[:, col]
    for col, name in enumerate((
        'has_fire',
        'has_flood',
        'has_human_damage',
        'has_damaged_infrastructure',
        'has_damaged_nature',
        'has_non_damage',
    ))
}

Create the text-labelling functions.

In [37]:
def LF_has_fire(has_fire):
    """Labelling function that returns the ``has_fire`` primitive value unchanged."""
    return has_fire

def LF_has_flood(has_flood):
    """Labelling function that returns the ``has_flood`` primitive value unchanged."""
    return has_flood

def LF_has_human_damage(has_human_damage):
    """Labelling function that returns the ``has_human_damage`` primitive value unchanged."""
    return has_human_damage

def LF_has_damaged_infrastructure(has_damaged_infrastructure):
    """Labelling function that returns the ``has_damaged_infrastructure`` primitive value unchanged."""
    return has_damaged_infrastructure

def LF_has_damaged_nature(has_damaged_nature):
    """Labelling function that returns the ``has_damaged_nature`` primitive value unchanged."""
    return has_damaged_nature

def LF_has_non_damage(has_non_damage):
    """Labelling function that returns the ``has_non_damage`` primitive value unchanged."""
    return has_non_damage

In [38]:
L_fns = [LF_has_fire, LF_has_flood, LF_has_human_damage, LF_has_damaged_infrastructure, 
         LF_has_damaged_nature, LF_has_non_damage]

# Primitive consumed by each labelling function, in the same order as L_fns
L_inputs = ['has_fire', 'has_flood', 'has_human_damage', 'has_damaged_infrastructure',
            'has_damaged_nature', 'has_non_damage']
assert len(L_inputs) == len(L_fns), "every labelling function needs exactly one primitive"

# Label matrix: one row per labelling function, one column per file
L = np.zeros((len(L_fns),loader.train_num)).astype(int)

# Pairing each function with its primitive key keeps the row index and the
# function in step, instead of repeating one hand-numbered line per function
for j, (L_fn, p_key) in enumerate(zip(L_fns, L_inputs)):
    for i in range(loader.train_num):
        L[j,i] = L_fn(p_keys[p_key][i])
    
        

In [39]:
# Sparse (CSR) copy of the transposed label matrix: one row per file, one column
# per labelling function
L_train = sparse.csr_matrix(L.T)

Run the generative model using these labelling functions.

In [40]:
# Fit Snorkel's generative model on the label matrix (files x labelling functions)
gen_model = GenerativeModel()

# The step size is scaled down by the number of files (L.shape[1]).
# NOTE(review): train() is given the dense L.T while marginals() is given the
# sparse L_train built from the same data — confirm the dense form is intended.
gen_model.train(L.T, epochs=100, decay=0.95, step_size= 0.01/ L.shape[1], reg_param=1e-6)
# Marginals are indexed per file by the cells below
train_marginals = gen_model.marginals(L_train)

Inferred cardinality: 6


In [41]:
# Code for selecting the label with highest probability
def get_best_prob(probabilities):
    """Return the index of the largest entry, breaking ties at random.

    Args:
        probabilities: Non-empty, indexable sequence of comparable values.

    Returns:
        The index of the maximum value. When several entries share the
        maximum, one of their indices is picked with ``random.choice``.
    """
    top = max(probabilities)
    candidates = [pos for pos in range(len(probabilities)) if probabilities[pos] == top]

    # The random generator is consulted only when there is no unique winner
    if len(candidates) != 1:
        return random.choice(candidates)
    return candidates[0]

Compare the generated labels to the ground truths.

In [42]:
def compare_training_data(train_marginals):
    """Print how the most probable generated label of each file compares to its ground truth.

    Reads the module-level ``loader`` for the number of files and the ground
    truths, and uses ``get_best_prob`` to pick a label index per file.

    Two lines are printed: the list of ``"truth:predicted"`` confusion pairs
    with their counts, sorted by ascending count, and the percentage of files
    whose predicted label equals the ground truth.

    Args:
        train_marginals: Indexable per file; ``train_marginals[i]`` is passed
            to ``get_best_prob`` and the result indexes the six class names.
    """
    labels = ['fires', 'flood', 'human_damage', 'damaged_infrastructure', 'damaged_nature', 'non_damage']

    confusions = {}
    mismatches = 0

    for i in range(loader.train_num):
        predicted = labels[get_best_prob(train_marginals[i])]
        truth = loader.get_truth(i)

        if truth == predicted:
            continue

        # Tally this (truth, predicted) pair
        pair = truth + ":" + predicted
        confusions[pair] = confusions.get(pair, 0) + 1
        mismatches += 1

    ranked = sorted(confusions.items(), key=operator.itemgetter(1))
    print(ranked)
    print("% of correct predictions: ", str((1 - mismatches/loader.train_num) * 100))
    
# Report the confusion pairs and accuracy of the text-only labelling functions
compare_training_data(train_marginals)

[('damaged_nature__damaged_infrastructure', 1), ('flood__damaged_infrastructure', 1), ('human_damage__flood', 1), ('fires__damaged_infrastructure', 2), ('non_damage__flood', 4), ('non_damage__damaged_nature', 4), ('flood__damaged_nature', 5), ('non_damage__fires', 5), ('damaged_nature__fires', 11), ('human_damage__non_damage', 11), ('damaged_infrastructure__fires', 15), ('damaged_infrastructure__damaged_nature', 17), ('fires__non_damage', 20), ('damaged_nature__human_damage', 21), ('flood__human_damage', 24), ('fires__flood', 27), ('damaged_infrastructure__non_damage', 28), ('flood__non_damage', 29), ('fires__human_damage', 31), ('damaged_infrastructure__flood', 42), ('damaged_infrastructure__human_damage', 43), ('non_damage__human_damage', 44), ('damaged_nature__non_damage', 53), ('damaged_nature__flood', 54)]
% of correct predictions:  72.53481894150417


Create clustering-based labelling functions.

In [62]:
# One root directory per clustering run; the `similarity_level` argument of
# clustering() is an index into this list
clusters_paths = ['./clustering_1/',
                  './clustering_2/',
                  './clustering_3/',
                  './clustering_4/']

# In this case, the foldername of the cluster is the labelling of its elements
def get_foldername(filename, clusters_path):
    """Find the cluster folder, two levels below ``clusters_path``, that contains ``filename``.

    Only the base name of ``filename`` is compared against the folder contents.

    Args:
        filename: Image filename or path.
        clusters_path: Root directory of one clustering run.

    Returns:
        The path ``<clusters_path>/<sub_dir>/<sub_sub_dir>`` of the first
        folder holding the file, or the string ``"None"`` when no folder does.
    """
    target = os.path.basename(filename)
    sub_dirs = [name for name in os.listdir(clusters_path) if os.path.isdir(os.path.join(clusters_path, name))]

    for sub_dir in sub_dirs:
        sub_dir_path = os.path.join(clusters_path, sub_dir)
        sub_sub_dirs = [name for name in os.listdir(sub_dir_path) if os.path.isdir(os.path.join(sub_dir_path, name))]

        for sub_sub_dir in sub_sub_dirs:
            sub_sub_dir_path = os.path.join(sub_dir_path, sub_sub_dir)

            if target in os.listdir(sub_sub_dir_path):
                return sub_sub_dir_path
    return "None"

def clustering(filename, similarity_level):
    """Return the integer class label implied by the cluster folder holding ``filename``.

    The result is stored in a numeric primitive matrix by the caller, so it
    must be a number: returning the folder path itself would make that
    assignment fail. The folder's own name is therefore translated to the
    label values used by the keyword primitives (1 = fires ... 6 = non_damage).

    NOTE(review): the naming convention of the cluster folders is not visible
    here. Both class names ('fires', 'flood', ...) and plain integers
    ('1'..'6') are accepted — confirm against the clustering output.

    Args:
        filename: Image filename or path.
        similarity_level: Index into ``clusters_paths``.

    Returns:
        An int in 1..6, or 0 (abstain) when the file is in no cluster folder
        or the folder name is not a recognised label.

    Raises:
        IndexError: If ``similarity_level`` is not a valid index.
    """
    # Position in this list + 1 is the label value
    class_names = ['fires', 'flood', 'human_damage', 'damaged_infrastructure',
                   'damaged_nature', 'non_damage']

    # Here we assume the image filename is located in the cluster folder
    # that is named with one of the class labels
    clusters_path = clusters_paths[similarity_level]
    cluster_folder = get_foldername(filename, clusters_path)

    if cluster_folder == "None":
        return 0

    # normpath drops any trailing separator so basename yields the folder name
    folder_name = os.path.basename(os.path.normpath(cluster_folder))

    if folder_name in class_names:
        return class_names.index(folder_name) + 1

    try:
        numeric_label = int(folder_name)
    except ValueError:
        return 0

    if 1 <= numeric_label <= len(class_names):
        return numeric_label
    return 0

In [63]:
def create_primitives(loader):
    """Evaluate the six keyword primitives and the four clustering primitives on every file.

    This definition replaces the earlier six-column ``create_primitives`` of
    this notebook once the cell is executed.

    Args:
        loader: Object exposing ``train_num``, ``get_file_contents(i)`` and
            ``get_img_filename(i)``.

    Returns:
        A ``(loader.train_num, 10)`` float array. Columns 0-5 hold the keyword
        primitives (fire, flood, human damage, damaged infrastructure, damaged
        nature, non-damage); columns 6-9 hold ``clustering`` evaluated at
        similarity levels 0-3.
    """
    text_primitive_fns = [mentions_fire, mentions_damaged_flood, mentions_human_damage,
                          mentions_damaged_infrastructure, mentions_damaged_nature, non_damage]
    n_cluster_levels = 4
    m = len(text_primitive_fns) + n_cluster_levels # number of primitives
    primitive_mtx = np.zeros((loader.train_num,m))

    for i in range(loader.train_num):
        # Fetch the text and the image name once per file rather than once
        # per primitive
        text_content = loader.get_file_contents(i)
        img_filename = loader.get_img_filename(i)

        for col, primitive_fn in enumerate(text_primitive_fns):
            primitive_mtx[i,col] = primitive_fn(text_content)

        # Clustering columns follow directly after the text columns
        for level in range(n_cluster_levels):
            primitive_mtx[i,len(text_primitive_fns) + level] = clustering(img_filename, level)

    return primitive_mtx

primitive_mtx = create_primitives(loader)

# Expose each column of the primitive matrix under a descriptive key;
# the position of a name in the tuple is its column index
p_keys = {
    name: primitive_mtx[:, col]
    for col, name in enumerate((
        'has_fire',
        'has_flood',
        'has_human_damage',
        'has_damaged_infrastructure',
        'has_damaged_nature',
        'has_non_damage',
        'has_cluster',
        'has_cluster_2',
        'has_cluster_3',
        'has_cluster_4',
    ))
}

def LF_has_cluster(has_cluster):
    """Labelling function that returns the given clustering primitive value unchanged."""
    return has_cluster

def LF_has_cluster_2(has_cluster):
    """Labelling function that returns the given clustering primitive value unchanged."""
    return has_cluster

def LF_has_cluster_3(has_cluster):
    """Labelling function that returns the given clustering primitive value unchanged."""
    return has_cluster

def LF_has_cluster_4(has_cluster):
    """Labelling function that returns the given clustering primitive value unchanged."""
    return has_cluster


{0.0: 1259, 1.0: 23, 2.0: 26, 3.0: 99, 4.0: 84, 5.0: 43, 6.0: 261}


In [64]:
L_fns = [LF_has_fire, LF_has_flood, LF_has_human_damage, LF_has_damaged_infrastructure, 
         LF_has_damaged_nature, LF_has_non_damage, LF_has_cluster, LF_has_cluster_2, LF_has_cluster_3,
        LF_has_cluster_4]

# Primitive consumed by each labelling function, in the same order as L_fns
L_inputs = ['has_fire', 'has_flood', 'has_human_damage', 'has_damaged_infrastructure',
            'has_damaged_nature', 'has_non_damage', 'has_cluster', 'has_cluster_2',
            'has_cluster_3', 'has_cluster_4']
assert len(L_inputs) == len(L_fns), "every labelling function needs exactly one primitive"

# Label matrix: one row per labelling function, one column per file
L = np.zeros((len(L_fns),loader.train_num)).astype(int)

# Pairing each function with its primitive key keeps the row index and the
# function in step, instead of repeating one hand-numbered line per function
for j, (L_fn, p_key) in enumerate(zip(L_fns, L_inputs)):
    for i in range(loader.train_num):
        L[j,i] = L_fn(p_keys[p_key][i])

# Sparse (CSR) copy, transposed to one row per file
L_train = sparse.csr_matrix(L.T)

Run the generative model and compare to the ground truths (if all is well, the accuracy should improve over the previous result).

In [65]:
# Re-fit the generative model, now on the label matrix that includes the clustering LFs
gen_model = GenerativeModel()

# The step size is scaled down by the number of files (L.shape[1]).
# NOTE(review): train() is given the dense L.T while marginals() is given the
# sparse L_train built from the same data — confirm the dense form is intended.
gen_model.train(L.T, epochs=100, decay=0.95, step_size= 0.01/ L.shape[1], reg_param=1e-6)
train_marginals = gen_model.marginals(L_train)

# Print the confusion pairs and accuracy for the combined labelling functions
compare_training_data(train_marginals)

Inferred cardinality: 6


Create the Snorkel-generated labelled data

In [67]:
# Write training data
def overwrite_dir(dir_path):
    """Make ``dir_path`` an empty directory, deleting any tree already there.

    Args:
        dir_path: Directory to (re)create; missing parent directories are
            created as well.
    """
    stale = os.path.exists(dir_path)
    if stale:
        # Remove the whole existing tree so nothing from an earlier run survives
        shutil.rmtree(dir_path)
    os.makedirs(dir_path)

# Output roots (placeholders: set each one to its own directory before running)
img_folder = ''
text_folder = ''
combined_folder = ''

overwrite_dir(img_folder)
overwrite_dir(text_folder)
overwrite_dir(combined_folder)

labels = ['fires', 'flood', 'human_damage', 'damaged_infrastructure', 'damaged_nature', 'non_damage']

# One sub-directory per class in each output root. os.path.join keeps the
# paths inside the output root whether or not it ends with a separator.
for label in labels:
    os.makedirs(os.path.join(img_folder, label))
    os.makedirs(os.path.join(text_folder, label))
    os.makedirs(os.path.join(combined_folder, label, "images"))
    os.makedirs(os.path.join(combined_folder, label, "text"))


for i in range(loader.train_num):
    # Pick the label once per file so image and text land in the same class
    # (get_best_prob breaks ties at random)
    predicted_label = labels[get_best_prob(train_marginals[i])]

    img_filename = loader.get_img_filename(i)
    img_name = os.path.basename(img_filename)
    copyfile(img_filename, os.path.join(img_folder, predicted_label, img_name))
    copyfile(img_filename, os.path.join(combined_folder, predicted_label, "images", img_name))

    text_filename = loader.get_text_filename(i)
    # An image may have no companion text file; skip the text copy rather
    # than aborting the export half-way through
    if not os.path.isfile(text_filename):
        print("File not found: ", text_filename)
        continue

    text_name = os.path.basename(text_filename)
    copyfile(text_filename, os.path.join(text_folder, predicted_label, text_name))
    copyfile(text_filename, os.path.join(combined_folder, predicted_label, "text", text_name))

   