In [1]:
import sys
sys.path.append('/Users/nissani/Desktop/Hateful_Memes_Project/LoadingData')
sys.path.append('/Users/nissani/Desktop/Hateful_Memes_Project/FeatureGeneration')
import os
import LoadingData
from FER_featurizer import FER_Wrapper
from hate_speech import HateWrapper
from sentence_encoder import SentenceTransformer
from flair.models import TextClassifier
from flair.data import Sentence
import torch
import torchvision
import sentence_encoder
import protected_classifier_naive
import getty_simple_featurizer
import json
import numpy as np

Instructions for updating:
non-resource variables are not supported in the long term


02-09-2020:01:16:00,50 INFO     [file_utils.py:39] PyTorch version 1.4.0 available.
02-09-2020:01:16:00,50 INFO     [file_utils.py:55] TensorFlow version 2.2.0 available.
[nltk_data] Downloading package punkt to /Users/nissani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Folder holding the jsonl splits and the cleaned Getty CSV. The default is the original
# location; set the HATEFUL_MEMES_DATA_DIR environment variable to run from another machine.
DATA_DIR = os.environ.get('HATEFUL_MEMES_DATA_DIR', '/Users/nissani/Desktop/Hateful_Memes_Project/data')

# These are loader objects only; the actual records are read by load_data() in later cells.
train_data = LoadingData.LoadingData(os.path.join(DATA_DIR, 'train.jsonl'))
dev_data = LoadingData.LoadingData(os.path.join(DATA_DIR, 'dev.jsonl'))
test_data = LoadingData.LoadingData(os.path.join(DATA_DIR, 'test.jsonl'))
image_features = LoadingData.LoadingData(os.path.join(DATA_DIR, 'cleaned_getty_data.csv'))

In [3]:
# Rebinds `train_data` from the LoadingData loader to whatever load_data() returns.
# NOTE(review): this cell is not idempotent — re-running it without first re-creating the
# loader would call load_data() on the already-loaded records (presumably an error; confirm).
train_data = train_data.load_data()

In [4]:
# Rebinds `dev_data` from the LoadingData loader to the result of load_data().
# NOTE(review): not idempotent — the loader must be re-created before re-running this cell.
dev_data = dev_data.load_data()

In [5]:
# Rebinds `test_data` from the LoadingData loader to the result of load_data().
# NOTE(review): not idempotent — the loader must be re-created before re-running this cell.
test_data = test_data.load_data()

In [6]:
# Read the cleaned Getty table, then discard its 'Unnamed: 0' column.
image_features = image_features.load_data()
image_features = image_features.drop(columns=['Unnamed: 0'])

In [7]:
# Number of rows in the loaded Getty feature table.
len(image_features)

9983

## Cleaning Getty Data

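The sentiment analyzer needs non-empty text to work correctly. When an image has neither a caption nor tags, we substitute "okay" as a neutral word that yields a neutral sentiment; when only one of the two is missing, we reuse the other in its place.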

In [8]:
# Replace missing values with empty strings so that the `not best_caption` / `not best_tag`
# checks in the following cell treat them as absent.
image_features = image_features.fillna('')

In [9]:
# Build one caption string and one tag string per Getty image, keyed by its `src`.
# A missing caption borrows the tag text and vice versa; when both are missing the
# neutral placeholder 'okay' is stored for both.
best_captions, best_tags = {}, {}
for best_caption, best_tag, src in zip(image_features.best_caption,
                                       image_features.best_tags,
                                       image_features.src):
    best_captions[src] = best_caption or best_tag or 'okay'
    best_tags[src] = best_tag or best_caption or 'okay'

In [10]:
def add_captions_tags(data, tags, captions):
    """Attach a 'tag' and a 'caption' entry to every record in `data`.

    Args:
        data: iterable of dict-like records; each must have an 'id' key.
        tags: mapping from record id to tag text.
        captions: mapping from record id to caption text.

    Returns:
        The same `data` object; its records are modified in place.

    Records whose id is missing from a mapping get the placeholder
    'no tag' / 'no caption'. Only a missing key triggers the placeholder;
    any other error (for example a broken mapping) is raised to the caller
    instead of being silently swallowed.
    """
    for el in data:
        identifier = el['id']
        try:
            el['tag'] = tags[identifier]
        except KeyError:
            el['tag'] = 'no tag'
        try:
            el['caption'] = captions[identifier]
        except KeyError:
            el['caption'] = 'no caption'

    return data

In [11]:
# Attach the Getty tag and caption to every record of each split (train, then test, then dev).
train_data, test_data, dev_data = [
    add_captions_tags(split, best_tags, best_captions)
    for split in (train_data, test_data, dev_data)
]

## Feature Generation

This function does multiple things:

1) It instantiates multiple objects that featurize the data differently. FER creates emotion features when available. Hate creates hate and offensive scores. These are the first lines of code, and new objects should be instantiated in the same place.

2) The next major part is the for loop. Here we create all the features for each component of text. Note that we have the captions, the tags, and the meme text. Each component should be kept separate (as opposed to the baseline), and we should make sure to keep them separate in each feature generation step.

3) The last part of this function makes the entire list of dictionaries into a dictionary of dictionaries, where the key is the picture id and the dictionary contains all the information and features we could want about the images.

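Inputs: data (the loaded JSON-lines records; each record must already contain its 'text', 'caption' and 'tag' fields — the captions and tags are read from the records rather than passed in as separate arrays of strings)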

In [15]:
def _hate_score_triplet(scores, text):
    """Return (hate_speech, offensive_language, neither) as floats for `text`.

    The row is selected by comparing `scores.text` with `text`; the first match is used.
    """
    row = scores[scores.text == text]
    return (
        float(row.hate_speech.values[0]),
        float(row.offensive_language.values[0]),
        float(row.neither.values[0]),
    )


def _sentiment_label_and_score(classifier, text):
    """Classify `text` and return [label, probability].

    Both values are parsed from the string form of the first predicted label,
    which is split on spaces into the label and a parenthesised score.
    """
    sentence = Sentence(text)
    classifier.predict(sentence)
    parts = str(sentence.labels[0]).split(' ')
    return [parts[0], float(parts[1].replace('(', '').replace(')', ''))]


def create_features(data, image_root='/Users/nissani/Desktop/Hateful_Memes_Project/data/'):
    """Compute all features for every record in `data`.

    Args:
        data: re-iterable collection of dict records. Each record is read for its
            'text', 'tag', 'caption', 'img' and 'id' keys.
        image_root: prefix that is concatenated with each record's 'img' value to
            form the image path handed to FER. Defaults to the original location.

    Returns:
        A dict mapping each record's 'id' to the record itself. The records are the
        same objects as in `data` and are modified in place with the added features.

    Added per record: hate / offensive / neither scores for the meme text, caption and
    tag; [label, probability] sentiment for each of the three; the FER emotion feature;
    the tag, caption and meme embedding vectors; the protected-class scores for meme text
    and caption; and one float per column of the simple Getty feature table.
    """
    # Featurizer objects. New featurizers should be instantiated here as well.
    FER = FER_Wrapper()
    hate = HateWrapper()
    sentiment_classifier = TextClassifier.load('en-sentiment')
    glove_encoder = sentence_encoder.GettySentenceTransform("average_word_embeddings_glove.6B.300d")
    bert_encoder = sentence_encoder.GettySentenceTransform("roberta-base-nli-stsb-mean-tokens")
    protected_classifier = protected_classifier_naive.ProtectedClassifierSimple(bert_encoder)
    getty_features = getty_simple_featurizer.GettySimpleWrapper()

    # The three text components are kept in separate, parallel lists.
    aggregate_text = [el['text'] for el in data]
    aggregate_tags = [el['tag'] for el in data]
    aggregate_captions = [el['caption'] for el in data]

    #hate scores
    meme_hate_scores = hate.predict(aggregate_text)
    caption_hate_scores = hate.predict(aggregate_captions)
    tag_hate_scores = hate.predict(aggregate_tags)

    #word embeddings
    tag_feature_vectors = glove_encoder.embed_glove_tags(glove_encoder, aggregate_tags, None, len(aggregate_tags))
    bert_feature_vectors = bert_encoder.embed_column(bert_encoder, aggregate_captions, None, len(aggregate_captions))
    bert_feature_vectors = torch.Tensor(bert_feature_vectors).view((len(aggregate_captions),768))
    meme_feature_vectors = bert_encoder.embed_column(bert_encoder, aggregate_text, None, len(aggregate_text))
    meme_feature_vectors = torch.Tensor(meme_feature_vectors).view((len(aggregate_text),768))

    #protected classifier
    protected_memes = protected_classifier.measure_distance(aggregate_text)
    protected_captions = protected_classifier.measure_distance(aggregate_captions)

    #getty features
    getty = getty_features.get_simple_getty_features()
    getty = getty.fillna(0)

    # zip stops at the shortest input, so all per-record sequences are expected to
    # have one entry per record.
    per_record = zip(data,
                     aggregate_captions,
                     aggregate_tags,
                     tag_feature_vectors,
                     bert_feature_vectors,
                     meme_feature_vectors,
                     protected_memes,
                     protected_captions)
    for meme, caption, tag, tag_vector, bert_vector, meme_vector, meme_scores, caption_scores in per_record:

        # Each score row is looked up once per text instead of once per field.
        (meme['meme_hate_speech'],
         meme['meme_offensive_language'],
         meme['meme_neither']) = _hate_score_triplet(meme_hate_scores, meme['text'])
        (meme['caption_hate_speech'],
         meme['caption_offensive_language'],
         meme['caption_neither']) = _hate_score_triplet(caption_hate_scores, caption)
        (meme['tag_hate_speech'],
         meme['tag_offensive_language'],
         meme['tag_neither']) = _hate_score_triplet(tag_hate_scores, tag)

        meme['meme_sentiment'] = _sentiment_label_and_score(sentiment_classifier, meme['text'])
        meme['caption_sentiment'] = _sentiment_label_and_score(sentiment_classifier, caption)
        meme['tag_sentiment'] = _sentiment_label_and_score(sentiment_classifier, tag)

        path = image_root + meme['img']
        emotion_feature = FER.run_FER(path)
        # The FER result is indexed by the second '/'-separated component of 'img'.
        img_id = meme['img'].split('/')[1]
        meme['emotion_feature'] = emotion_feature[img_id]

        meme['tag_feature_vector'] = tag_vector.tolist()
        meme['caption_feature_vector'] = bert_vector.tolist()
        meme['meme_feature_vector'] = meme_vector.tolist()

        meme['protected_meme_scores'] = meme_scores.tolist()
        meme['protected_caption_scores'] = caption_scores.tolist()

    # Re-key the list of records by id; records sharing an id collapse to the last one.
    new_data = {item['id'] : item for item in data}
    getty_columns = list(getty.columns)

    for el in list(new_data.keys()):
        getty_point = getty[getty.id == str(el)]
        # NOTE(review): every getty column is copied, so if `id` is itself a column (it is
        # compared as getty.id above) the record's 'id' is overwritten with float(id) —
        # confirm this is intended.
        for name in getty_columns:
            new_data[el][name] = float(getty_point[name].values[0])

    return new_data

In [16]:
# Featurize each split. Every call to create_features instantiates its own FER_Wrapper,
# HateWrapper, flair sentiment classifier and sentence encoders, so the models are
# loaded once per split (three times in total for this cell).
featurized_train_data = create_features(train_data)
featurized_dev_data = create_features(dev_data)
featurized_test_data = create_features(test_data)



2020-09-02 06:30:47,442 loading file /Users/nissani/.flair/models/sentiment-en-mix-distillbert.pt


02-09-2020:06:30:47,637 INFO     [configuration_utils.py:265] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json from cache at /Users/nissani/.cache/torch/transformers/a41e817d5c0743e29e86ff85edc8c257e61bc8d88e4271bb1b243b6e7614c633.8949e27aafafa845a18d98a0e3a88bc2d248bbc32a1b75947366664658f23b1c
02-09-2020:06:30:47,638 INFO     [configuration_utils.py:301] Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "vocab_size": 30522
}

02-09-2020:06:30:48,11 INFO     [tokenization_utils.py:1022] loading file https://s3.amazonaws.com/mode

2020-09-02 08:28:06,410 loading file /Users/nissani/.flair/models/sentiment-en-mix-distillbert.pt


02-09-2020:08:28:06,593 INFO     [configuration_utils.py:265] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json from cache at /Users/nissani/.cache/torch/transformers/a41e817d5c0743e29e86ff85edc8c257e61bc8d88e4271bb1b243b6e7614c633.8949e27aafafa845a18d98a0e3a88bc2d248bbc32a1b75947366664658f23b1c
02-09-2020:08:28:06,595 INFO     [configuration_utils.py:301] Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "vocab_size": 30522
}

02-09-2020:08:28:06,678 INFO     [tokenization_utils.py:1022] loading file https://s3.amazonaws.com/mod

2020-09-02 08:33:41,135 loading file /Users/nissani/.flair/models/sentiment-en-mix-distillbert.pt


02-09-2020:08:33:41,319 INFO     [configuration_utils.py:265] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json from cache at /Users/nissani/.cache/torch/transformers/a41e817d5c0743e29e86ff85edc8c257e61bc8d88e4271bb1b243b6e7614c633.8949e27aafafa845a18d98a0e3a88bc2d248bbc32a1b75947366664658f23b1c
02-09-2020:08:33:41,321 INFO     [configuration_utils.py:301] Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "vocab_size": 30522
}

02-09-2020:08:33:41,729 INFO     [tokenization_utils.py:1022] loading file https://s3.amazonaws.com/mod

In [14]:
# Show every feature name of one training record together with the type of its value.
for el, value in featurized_train_data['42953'].items():
    print(el)
    print(type(value))

NameError: name 'featurized_train_data' is not defined

In [119]:
# Number of featurized training records (one entry per distinct 'id').
len(featurized_train_data)

8500

In [120]:
# Number of featurized dev records (one entry per distinct 'id').
len(featurized_dev_data)

500

In [121]:
# Number of featurized test records (one entry per distinct 'id').
len(featurized_test_data)

1000

In [115]:
# Serialize the featurized training set to train.json.
# The data is written to a temporary file first and only moved into place once the dump
# has completed, so a failure part-way through (for example a full disk) neither leaves a
# truncated train.json nor destroys a previously written one.
tmp_path = 'train.json.tmp'
try:
    with open(tmp_path, 'w') as f:
        json.dump(featurized_train_data, f)
    os.replace(tmp_path, 'train.json')
finally:
    # After a successful replace the temporary file no longer exists, so this only
    # cleans up after a failed dump.
    if os.path.exists(tmp_path):
        os.remove(tmp_path)

OSError: [Errno 28] No space left on device