In [2]:
!python3 --version

Python 3.8.10


In [3]:
import os
from pathlib import Path

import pandas as pd
import numpy as np

### Loading Data

In [4]:
# Dataset Paths
DATASET_PATH = Path('/mnt/DATA/fharookshaik/major_project/dataset')
TRAIN_IMAGES_DIR_PATH = os.path.join(DATASET_PATH,'Train','images')

TRAIN_CSV_PATH = os.path.join(DATASET_PATH,'Train','train.csv')
VALIDATE_CSV_PATH = os.path.join(DATASET_PATH,'Train','val.csv')

TEST_IMAGES_DIR_PATH = os.path.join(DATASET_PATH,'Test','images')
TEST_CSV_PATH = os.path.join(DATASET_PATH,'Test','test.csv')

In [5]:
train_df = pd.read_csv(TEST_CSV_PATH)

In [6]:
train_df.head()

Unnamed: 0,OCR,image,entity_list
0,FRn/TP THOSE WHo WANT TO REAP THE BNEFITS OP T...,memes_4576.png,"['nation', 'thomas paine']"
1,Den Ta r iterts pt (OBAMA) KICKED ME OUT mIMPL...,covid_memes_5512.png,"['donald trump', 'barack obama', 'trump', 'oba..."
2,WHAT PARTY ARE YOU RUNNING ASP DEMOCRAT. IN TE...,memes_1285.png,"['texas', 'democrat', 'meme']"
3,You can have any virus you want as long as it'...,covid_memes_225.png,"['corona', 'coronavirus', 'virus']"
4,Telkonia 1920 tacebook.com Comment Keleabetswe...,covid_memes_592.png,"['bra', 'electricity']"


In [7]:
train_df.tail()

Unnamed: 0,OCR,image,entity_list
713,DEMOCRATS HAVE SOME SERIOUS CANDIDATES FOR 202...,memes_1408.png,"['democrats', 'democrat candidates', '2020']"
714,ETS BUID AWAL GF DETA,memes_8246.png,['donald trump']
715,CRIMINALS BEFORE GUN BAN CRIMINALS AFTER GUN BAN,memes_3602.png,"['gun ban', 'gun', 'criminals']"
716,Whenhance sees you thooughy wang your hands at...,covid_memes_1625.png,['2020']
717,PeterSweden @PeterSweden? This is Amanda Lind ...,memes_1991.png,"['sweden', 'minister of culture', 'green party..."


### Cleaning Data

In [8]:
# # Uncomment this cell for Train Data
# train_df['OCR'] = train_df['OCR'].fillna("")
# train_df['hero'] = train_df['hero'].fillna({i: [] for i in train_df.index})
# train_df['villain'] = train_df['villain'].fillna({i: [] for i in train_df.index})
# train_df['victim'] = train_df['victim'].fillna({i: [] for i in train_df.index})
# train_df['other'] = train_df['other'].fillna({i: [] for i in train_df.index})

# train_df

Unnamed: 0,OCR,image,entity_list
0,FRn/TP THOSE WHo WANT TO REAP THE BNEFITS OP T...,memes_4576.png,"['nation', 'thomas paine']"
1,Den Ta r iterts pt (OBAMA) KICKED ME OUT mIMPL...,covid_memes_5512.png,"['donald trump', 'barack obama', 'trump', 'oba..."
2,WHAT PARTY ARE YOU RUNNING ASP DEMOCRAT. IN TE...,memes_1285.png,"['texas', 'democrat', 'meme']"
3,You can have any virus you want as long as it'...,covid_memes_225.png,"['corona', 'coronavirus', 'virus']"
4,Telkonia 1920 tacebook.com Comment Keleabetswe...,covid_memes_592.png,"['bra', 'electricity']"
...,...,...,...
713,DEMOCRATS HAVE SOME SERIOUS CANDIDATES FOR 202...,memes_1408.png,"['democrats', 'democrat candidates', '2020']"
714,ETS BUID AWAL GF DETA,memes_8246.png,['donald trump']
715,CRIMINALS BEFORE GUN BAN CRIMINALS AFTER GUN BAN,memes_3602.png,"['gun ban', 'gun', 'criminals']"
716,Whenhance sees you thooughy wang your hands at...,covid_memes_1625.png,['2020']


In [None]:
# Uncomment this cell for Test Data
train_df['OCR'] = train_df['OCR'].fillna("")
train_df['entity_list'] = train_df['entity_list'].fillna({i: [] for i in train_df.index})

train_df

### NER

In [10]:
import re
import nltk.corpus
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fharookshaik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/fharookshaik/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [11]:
from nltk.corpus import stopwords

def sentences_to_nouns(sentence):
    '''Returns Nouns from Sentences'''
    # case normalization
    sentence = sentence.lower()

    # Remove unwanted chracters
    sentence = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", sentence)

    # Remove Stopwords
    stop = stopwords.words('english')
    sentence = " ".join([word for word in sentence.split() if word not in (stop)])

    # Tokenize sentence
    list_tokns = sentence.split()

    # Find nouns from sentence
    pos = nltk.pos_tag(list_tokns)
    is_noun = lambda pos: pos[:2] == 'NN'
    # do the nlp stuff
    nouns = [word for (word,pos) in nltk.pos_tag(list_tokns) if is_noun(pos)]

    return nouns

In [13]:
# Loading Entities from Dataframe
entities = []
for idx,val in train_df.iterrows():
    entities.append([word.strip('\'') for word in train_df['entity_list'][idx].strip("][").split(", ")])

entities

[['nation', 'thomas paine'],
 ['donald trump', 'barack obama', 'trump', 'obama'],
 ['texas', 'democrat', 'meme'],
 ['corona', 'coronavirus', 'virus'],
 ['bra', 'electricity'],
 ['mask'],
 ['barack obama',
  'americans',
  'donald trump',
  'h1n1',
  'h1n1 virus',
  'covid19'],
 ['china', 'donald trump', 'united states', '2020', 'covid19'],
 ['genovia', 'covid19'],
 ['donald trump'],
 ['face mask', 'wuhan', 'poker'],
 ['ben carson',
  'chris christie',
  'grand old party (gop) debate',
  'arco rubio',
  'mike huckabee',
  'rand paul',
  'donald trump',
  'carly fiorina',
  'marco rubio',
  'jeb bush',
  'ted cruz'],
 ['donald trump', 'people in america', 'america', 'trump virus', 'covid19'],
 ['joe biden', 'democratic party'],
 ['2020'],
 ['private company',
  'company',
  'bonus',
  'government',
  'diwali',
  'government company'],
 ['barack obama', 'joe biden', 'politics', 'donald trump', 'people'],
 ['barack obama', 'joe biden'],
 ['chinese', 'mask', 'chinese guy'],
 ['donald trump'

In [14]:
len(entities)

718

In [16]:
# Loading the OCR text data from Dataframe to sentences

sentences = []

for idx,val in train_df.iterrows():
    text = val.get('OCR').lower()
    text = text.split(".")

    print(text)

    sentences.append(text)

['frn/tp those who want to reap the bnefits op this great nation must bear the fatigue of supporting it', ' thomas paina fost patier 17 nna ']
['den ta r iterts pt (obama) kicked me out mimplinvited me back ']
['what party are you running asp democrat', ' in texas? merme creator - funny what party are you running as? democrat', ' in ', ' ']
["you can have any virus you want as long as it's a corona memedroid "]
['telkonia 1920 tacebook', "com comment keleabetswe kekana ma-kele's almstaying ", '', '', ' they can takb our electricity but they will never take our braai imegenerator', 'net w00 vu and 38k others 77 comments 12k shares cemment share academy of digital arts ']
['']
['under obama hin1 killed 12,469 americans and nobody bats an eye under trump covid-19 has killed 36 americans and everyone loses their minds ']
['134 ken capelandis wind of god remix - wtfarahh keny bey here is tack apnwitaew so l conwet china unieashed a virui on the world', ' make no mistake- their main objectiv

In [17]:
len(sentences)

718

### Entity Sentence Linking

In [22]:
# Linking entities to sentences

enty_sent_dict = {}

for i,entity in enumerate(entities):
  #print(i)
  per_meme_dict = {}
  for entity_per_meme in entity:
      #print("entity = "+ entity_per_meme)
      temp_sent_list = []
      for sentence in sentences[i]:
        sentence = sentence.lower()
        sentence = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", sentence)
        words = sentence.split(" ")
        #print(words)
        if entity_per_meme in words:
          #print( "Sentence = " + sentence)

          entity_index = words.index(entity_per_meme)
          #print("Index = " + str(entity_index))

          sentence_01 = words[entity_index-2:entity_index]
          sentence_02 = words[entity_index:entity_index+4]
          sentence_ = sentence_01 + sentence_02
          sentence_ = ' '.join(sentence_)
          #print("Sentence_ = "+str(sentence_))

          temp_sent_list.append(sentence_)

      per_meme_dict[entity_per_meme] = temp_sent_list
  #print(per_meme_dict)
  enty_sent_dict[train_df['image'][i]] = per_meme_dict

#print("Whole dictionary = ")
#print(enty_sent_dict)

In [25]:
# Resulting out entity sentence linking to a json file

import json
with open(os.path.join(DATASET_PATH,'windowed_enty_sent_linking.json'),'w') as f:
    json.dump(enty_sent_dict,f,indent=4)

In [26]:
print("length of dictionary = ", len(enty_sent_dict))

length of dictionary =  718


In [27]:
# Loading the entity sentence linking json file
ENTITY_SENT_LINKED = os.path.join(DATASET_PATH,'windowed_enty_sent_linking.json')
ENTITY_SENT_LINKED_FILE = open(ENTITY_SENT_LINKED)


data = json.load(ENTITY_SENT_LINKED_FILE)
# print(data)

{'memes_4576.png': {'nation': ['this great nation must bear the'], 'thomas paine': []}, 'covid_memes_5512.png': {'donald trump': [], 'barack obama': [], 'trump': [], 'obama': ['pt  obama  kicked me']}, 'memes_1285.png': {'texas': [' in texas  merme creator'], 'democrat': ['running asp democrat', 'as  democrat'], 'meme': []}, 'covid_memes_225.png': {'corona': ['s a corona memedroid '], 'coronavirus': [], 'virus': ['have any virus you want as']}, 'covid_memes_592.png': {'bra': [], 'electricity': ['takb our electricity but they will']}, 'covid_memes_2449.png': {'mask': []}, 'covid_memes_5651.png': {'barack obama': [], 'americans': ['12 469 americans and nobody bats'], 'donald trump': [], 'h1n1': [], 'h1n1 virus': [], 'covid19': []}, 'covid_memes_5569.png': {'china': ['l conwet china unieashed a virui'], 'donald trump': [], 'united states': [], '2020': ['like trump 2020  '], 'covid19': []}, 'covid_memes_1839.png': {'genovia': ['ryantempletbh apr genovia has not reported'], 'covid19': []}, 

### Framework-1(Based on Sentiment Analysis)

In [28]:
# Resolving Dependencies
!pip install vaderSentiment



In [29]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [30]:

def sentiment_scores(sentence):
    """A simple function to calculate sentiments of the sentence"""
    # Create a SentimentIntensityAnalyzer object.
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    sentiment_dict = sid_obj.polarity_scores(sentence)
     
    #print("Overall sentiment dictionary is : ", sentiment_dict)

    """
    print("sentence was rated as ", sentiment_dict['neg']*100, "% Negative")
    print("sentence was rated as ", sentiment_dict['neu']*100, "% Neutral")
    print("sentence was rated as ", sentiment_dict['pos']*100, "% Positive")
    """
 
    #print("Sentence Overall Rated As", end = " ")
 
    # decide sentiment as positive, negative and neutral
    """
    if sentiment_dict['compound'] >= 0.025 :
        print("Positive")
 
    elif sentiment_dict['compound'] >= - 0.05 and sentiment_dict['compound'] <= -0.025:
        print("Negative + Victim")

    elif sentiment_dict['compound'] <= - 0.05 :
        print("Negative + Villain")
 
    else :
        print("Neutral")
    """

    return sentiment_dict['compound']

In [31]:
# Performing Role labelling using calculated sentiments from sentence
result_dict = {}

for meme_name,ocr in zip(train_df['image'],train_df['OCR']):
  #print(meme_name)

  per_meme_dict = {"image": meme_name,
                   "OCR": ocr,
                   "hero": [],
                   "villain": [],
                   "victim": [],
                   "other": []}

  meme = data[meme_name]
  #print(data[meme_name])
  for entity in meme:
    #print("Entity = " + entity)
    #print(meme[entity])
    sentence_list = meme[entity]

    sentiment_score_calculated = 0
    no_of_sentences = 0
    for sentence in sentence_list:
      #print("Sentence = " + str(sentence))

      sentiment_score_calculated = sentiment_score_calculated + sentiment_scores(sentence)
      no_of_sentences = no_of_sentences + 1
    try:
      sentiment_score_calculated = sentiment_score_calculated / no_of_sentences
      #print(sentiment_score_calculated)
    except ZeroDivisionError:
      sentiment_score_calculated = 0

    if sentiment_score_calculated >= 0.05 :
      per_meme_dict["hero"].append(entity)
    elif sentiment_score_calculated >= - 0.05 and sentiment_score_calculated <= -0.025:
      per_meme_dict["victim"].append(entity)
    elif sentiment_score_calculated <= - 0.05 :
      per_meme_dict["villain"].append(entity)
    else :
      per_meme_dict["other"].append(entity)

  #print(per_meme_dict)
  result_dict[meme_name] = per_meme_dict
  #print("\n")

print(result_dict)



In [32]:
print("length of dictionary = ", len(result_dict))

length of dictionary =  718


In [33]:
# Resulting out the Windowed Sentiment Analysis to a json file
with open(os.path.join(DATASET_PATH,'result_Windowed_sentiment_analysis.json'),'w') as f:
    json.dump(result_dict,f,indent=4)

### Framework-II (Based on Similarity Scores)

In [34]:
# Maintaining Role Dictionaries

HERO_DICT = {'gentle', 'preserving', 'leadership', 'amazing', 'devoted', 'humble', 'warned', 'surprised', 'humanity', 'brave', 'evacuate', 'redemption', 'smile', 'honor', 'revolutionize', 'leader', 'advocate', 'savior', 'charity', 'sympathies', 'kindness', 'good', 'protect', 'teach', 'reputation', 'respected', 'welfare', 'glory', 'victory', 'winner', 'well', 'contained', 'restoration', 'commitment', 'ability', 'efforts', 'inspire', 'safety', 'allies', 'health', 'strength', 'empowered', 'passion', 'encouraging', 'warm', 'vision', 'scored', 'authorities', 'justice', 'grand', 'admire', 'reshape', 'communities', 'response', 'strengthen', 'bolster', 'intervened', 'motivated', 'reconstruct', 'freedom', 'duty', 'aided', 'conquer', 'smart', 'bravery', 'improve', 'donate', 'wise', 'ingenuity', 'milestone', 'protections', 'expand', 'hero', 'pursuit', 'invent', 'containment', 'achievement', 'supporters'}

VILLAIN_DICT = {'contaminate', 'dirty', 'abduct', 'terror', 'worsen', 'crisis', 'lambast', 'abandonment', 'harass', 'subvert', 'virus', 'crime', 'provoke', 'kidnap', 'manipulate', 'alleged', 'refusal', 'trafficking', 'marginalize', 'conformity', 'clampdown', 'villain', 'disparaged', 'cold', 'exacerbate', 'alienate', 'commit', 'trial', 'violence', 'denounced', 'stripped', 'undermine', 'seize', 'persecuted', 'opposing', 'intimidate', 'jailed', 'fool', 'investigation', 'imprisoned', 'bias', 'deception', 'gunshots', 'threaten', 'hoax', 'engulfed', 'blame', 'eruption', 'offensive', 'contempt', 'suggested', 'coercion', 'erase', 'catastrophe', 'rumors', 'weaken', 'pointed', 'treason', 'evil', 'abused', 'sentenced', 'bullet', 'warn', 'devastate', 'convicted', 'rebuke', 'reveal', 'bully', 'collude'}

VICTIM_DICT = {'setback', 'injured', 'traumatized', 'prevented', 'healing', 'buried', 'stuck', 'anguished', 'flee', 'suffer', 'casualty', 'trampled', 'forsaken', 'harassed', 'harassment', 'hardship', 'deported', 'howling', 'shocked', 'violence', 'depressed', 'danger', 'mute', 'stripped', 'terrified', 'distrust', 'assassinated', 'shivering', 'sick', 'complain', 'abducted', 'huddled', 'victimized', 'persecuted', 'barricaded', 'devastated', 'kidnapped', 'seized', 'justified', 'evacuated', 'surrendered', 'diagnosed', 'imprisoned', 'independence', 'slave', 'deceased', 'rebuffed', 'target', 'trapped', 'screamed', 'loss', 'trafficked', 'humiliated', 'impairment', 'wounded', 'discriminated', 'disadvantaged', 'blood', 'offended', 'accuses', 'saddens', 'threatened', 'disaster', 'devastation', 'overshadowed', 'tortured', 'abused', 'remonstrated', 'jeopardizing', 'stabbed', 'prey', 'sentenced', 'challenged', 'renounced', 'scared', 'humiliation', 'deaths', 'rescued', 'bleeding'}


In [35]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/fharookshaik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
from nltk.corpus import wordnet as wn

In [37]:
# Example function to calculate word similarity using Wu-Palmer Method
def word_similarity(word1, word2):
    '''
    Returns the Wu-Palmer similarity between the given words.
    Values range between 0 (least similar) and 1 (most similar).
    '''
    syns_w1 = wn.synsets(word1)
    syns_w2 = wn.synsets(word2)
    score = 0
    for w1 in syns_w1:
        for w2 in syns_w2:
            cur_score = w1.wup_similarity(w2)
            cur_score = w2.wup_similarity(w1) if not cur_score else cur_score
            if cur_score:
                score = max(score, cur_score)
    return score

In [38]:
# Creating Similarity Dictionary
similarity_dict = os.path.join(DATASET_PATH,'similarity_dictionary.json')
similarity_dict_file = open(similarity_dict)

similarity_data = json.load(similarity_dict_file)
# print(similarity_data)



In [39]:
from nltk.tokenize import word_tokenize

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/fharookshaik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [40]:
# Calculating Similarity Score for Role labelling
result_dict = {}

assignment_dict = {0:"hero",
                   1:"villain",
                   2:"victim",
                   3:"other"}

for meme_name,ocr in zip(train_df['image'],train_df['OCR']):

  per_meme_dict = {"image": meme_name,
                   "OCR": ocr,
                   "hero": [],
                   "villain": [],
                   "victim": [],
                   "other": []}

  meme = data[meme_name]

  for entity in meme:
    sentence_list = meme[entity]

    similarity_list = [0, 0, 0]
    number_of_words = 0

    for sentence in sentence_list:
      words = word_tokenize(sentence)

      for word in words:
        number_of_words = number_of_words + 1

        try:
          if similarity_data[word]:
            similarity_list[0] = similarity_list[0] + similarity_data[word][0]
            similarity_list[1] = similarity_list[1] + similarity_data[word][1]
            similarity_list[2] = similarity_list[2] + similarity_data[word][2]
        
        except:
            pass


    max_value = max(similarity_list)
    max_index = similarity_list.index(max_value)

    difference = similarity_list[0] - similarity_list[1]
    difference = abs(difference) + abs(similarity_list[0] - similarity_list[2])

    if difference < (0.5229945):
      per_meme_dict['other'].append(entity)

    else:
      per_meme_dict[assignment_dict[max_index]].append(entity)


  result_dict[meme_name] = per_meme_dict

# print(result_dict)



In [41]:
# Exporting Results to json file
with open(os.path.join(DATASET_PATH,'result_Windowed_similarity_dictionary.json'),'w') as f:
    json.dump(result_dict,f,indent=4)

In [42]:
print(len(result_dict))

718
