In [6]:
import sys
sys.path.append('/Users/nissani/Desktop/Hateful_Memes_Project/LoadingData')
sys.path.append('/Users/nissani/Desktop/Hateful_Memes_Project/FeatureGeneration')
import os
import LoadingData
from FER_featurizer import FER_Wrapper
from hate_speech import HateWrapper
from sentence_encoder import SentenceTransformer
from flair.models import TextClassifier
from flair.data import Sentence
import torch
import torchvision

In [7]:
# Build one LoadingData object per input file: the train split, the dev split,
# and the cleaned Getty image metadata. The following cells call `load_data()`
# on each of these objects to obtain the actual data.
# NOTE(review): the paths are absolute and machine-specific; the notebook will
# only run as-is on a machine with this exact directory layout.
train_data = LoadingData.LoadingData('/Users/nissani/Desktop/Hateful_Memes_Project/data/train.jsonl')
dev_data = LoadingData.LoadingData('/Users/nissani/Desktop/Hateful_Memes_Project/data/dev.jsonl')
image_features = LoadingData.LoadingData('/Users/nissani/Desktop/Hateful_Memes_Project/data/cleaned_getty_data.csv')

In [8]:
# Replace the loader object with the data it loads. Later in the notebook this
# value is iterated as a sequence of meme dicts (with keys such as 'text',
# 'img' and 'id').
# NOTE(review): rebinding the same name makes this cell non-idempotent —
# re-running it without first re-running the loader cell calls `load_data()`
# on the already-loaded data.
train_data = train_data.load_data()

In [9]:
# Replace the loader object with the dev-split data it loads.
# NOTE(review): rebinding the same name makes this cell non-idempotent —
# re-running it without first re-running the loader cell calls `load_data()`
# on the already-loaded data.
dev_data = dev_data.load_data()

In [10]:
# Load the Getty image metadata and discard its 'Unnamed: 0' column
# (presumably a row index that was written out with the CSV — TODO confirm).
image_features = image_features.load_data().drop(columns='Unnamed: 0')

## Cleaning Getty Data

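The sentiment analyzer needs non-empty text to work correctly. When an image has neither a caption nor tags, we substitute "okay", a neutral word that should produce a neutral sentiment. When only one of the two is missing, the one that is present is used for both.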

In [11]:
# Replace missing values with empty strings so that the truthiness checks in
# the next cell (`not best_caption`, `not best_tag`) treat them as "no text".
image_features = image_features.fillna('')

In [12]:
# Build per-image caption and tag text that is never empty, because the
# sentiment analysis further down needs some text for every image:
#   * neither caption nor tags -> both become the neutral placeholder 'okay'
#   * only one of them present -> that text is used for both
#   * both present             -> each keeps its own text
best_captions = []
best_tags = []
for best_caption, best_tag in zip(image_features.best_caption, image_features.best_tags):
    # `or` skips over empty strings (missing values were replaced by '' above),
    # so each chain picks the first non-empty candidate in priority order.
    best_captions.append(best_caption or best_tag or 'okay')
    best_tags.append(best_tag or best_caption or 'okay')

## Feature Generation

This function does multiple things:

1) It instantiates multiple objects that featurize the data differently. FER creates emotion features when available. Hate creates hate and offensive scores. These are the first lines of code, and new objects should be instantiated in the same place.

2) The next major part is the for loop. Here we create all the features for each component of text. Note that we have the captions, the tags, and the meme text. Each component should be kept separate (as opposed to the baseline), and we should make sure to keep them separate in each feature generation step.

3) The last part of this function makes the entire list of dictionaries into a dictionary of dictionaries, where the key is the picture id and the dictionary contains all the information and features we could want about the images.

In [17]:
def _predict_sentiment(classifier, text):
    """Run the flair sentiment classifier on `text` and return ``[label, score]``.

    The label name and confidence are read from the first predicted label's
    `value` and `score` attributes rather than parsed out of its string
    representation, whose format is not stable across flair versions.
    """
    sentence = Sentence(text)
    classifier.predict(sentence)
    top_label = sentence.labels[0]
    return [top_label.value, float(top_label.score)]


def create_features(data, captions, tags,
                    data_dir='/Users/nissani/Desktop/Hateful_Memes_Project/data/'):
    """Attach hate-speech, sentiment and emotion features to every meme.

    Parameters
    ----------
    data : sequence of dict
        One dict per meme; each must provide the keys 'text', 'img' and 'id'.
        The dicts are modified IN PLACE (feature keys are added to them).
    captions, tags : sequence of str
        Caption text and tag text, matched to `data` by position.
    data_dir : str, optional
        Directory that the relative `meme['img']` paths are resolved against.
        Defaults to the location the notebook was originally written for.

    Returns
    -------
    dict
        Maps each meme's 'id' to its (now feature-enriched) dict.

    Notes
    -----
    * `data`, `captions` and `tags` are zipped, so if they differ in length
      the memes beyond the shortest input receive no features, although they
      still appear in the returned dict.
    * `hate.predict` is expected to return a frame with the columns `text`,
      `hate_speech`, `offensive_language` and `neither`.
    * NOTE(review): every `*_hate_speech`, `*_offensive_language` and
      `*_neither` value is the column slice for all rows whose text matches,
      not a scalar; a text that occurs more than once yields several values.
    """
    # New featurizers should be instantiated here, next to the existing ones.
    FER = FER_Wrapper()
    hate = HateWrapper()
    sentiment_classifier = TextClassifier.load('en-sentiment')

    # Score each text source in a single batch up front.
    aggregate_text = [el['text'] for el in data]
    meme_hate_scores = hate.predict(aggregate_text)
    caption_hate_scores = hate.predict(captions)
    tag_hate_scores = hate.predict(tags)

    for meme, caption, tag in zip(data, captions, tags):
        # The three text components are kept separate in every feature step.
        sources = (
            ('meme', meme['text'], meme_hate_scores),
            ('caption', caption, caption_hate_scores),
            ('tag', tag, tag_hate_scores),
        )

        for prefix, text, scores in sources:
            # Select the matching rows once and reuse them for all three columns.
            rows = scores[scores.text == text]
            meme[prefix + '_hate_speech'] = rows.hate_speech
            meme[prefix + '_offensive_language'] = rows.offensive_language
            meme[prefix + '_neither'] = rows.neither

        for prefix, text, _ in sources:
            meme[prefix + '_sentiment'] = _predict_sentiment(sentiment_classifier, text)

        # FER returns a mapping keyed by image file name; keep this image's entry.
        path = os.path.join(data_dir, meme['img'])
        emotion_feature = FER.run_FER(path)
        img_id = meme['img'].split('/')[1]
        meme['emotion_feature'] = emotion_feature[img_id]

    # Re-key the list of meme dicts by meme id.
    return {item['id']: item for item in data}

In [18]:
# Featurize the training memes using the cleaned Getty captions and tags.
# NOTE(review): captions/tags are paired with the memes purely by position, so
# this assumes the Getty rows are in the same order as `train_data` — confirm.
featurized_train_data = create_features(train_data, best_captions, best_tags)



2020-08-30 15:06:11,154 loading file /Users/nissani/.flair/models/sentiment-en-mix-distillbert.pt


30-08-2020:15:06:11,392 INFO     [configuration_utils.py:265] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json from cache at /Users/nissani/.cache/torch/transformers/a41e817d5c0743e29e86ff85edc8c257e61bc8d88e4271bb1b243b6e7614c633.8949e27aafafa845a18d98a0e3a88bc2d248bbc32a1b75947366664658f23b1c
30-08-2020:15:06:11,393 INFO     [configuration_utils.py:301] Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "vocab_size": 30522
}

30-08-2020:15:06:11,578 INFO     [tokenization_utils.py:1022] loading file https://s3.amazonaws.com/mod

worked
