In [None]:
# Print the Python version of the runtime this notebook was executed on.
!python3 --version

Python 3.7.12


In [None]:
import ast
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset paths configured below can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the project folder on the mounted Drive.
# NOTE(review): this spelling uses "My Drive" while DATASET_PATH below uses
# "MyDrive"; confirm both resolve to the same folder, or unify them.
PROJECT_DIR = "/content/drive/My Drive/Major_Project"

# Folder that holds the dataset files.
DATASET_PATH = Path('/content/drive/MyDrive/Major_Project/dataset')

# CSV paths (kept as a plain string, as before).
TRAIN_CSV = str(DATASET_PATH / 'train.csv')

In [None]:
# Load the training CSV into a DataFrame.
train_df = pd.read_csv(TRAIN_CSV)

In [None]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,OCR,image,hero,villain,victim,other
0,Bernie or Elizabeth? Be informed.Compare them ...,covid_memes_18.png,,,,"['bernie sanders', 'elizabeth warren']"
1,Extending the Brexit deadline until October 31...,covid_memes_19.png,,['uk government'],,
2,kwai gkwa 0964 #nnevvy applause to Thais from ...,covid_memes_252.png,['thais'],,,['hong kong']
3,"So, I order this foce mask to protect ogainst ...",covid_memes_255.png,,['china'],,"['face mask', 'made in china', 'coronavirus']"
4,best candidate for JA 2020 joe biden Kamala ha...,covid_memes_20.png,['joe biden'],,,"['bernie sanders', 'kamala harris', 'tiktok']"


In [None]:
# Preview the last rows of the training data.
train_df.tail()

Unnamed: 0,OCR,image,hero,villain,victim,other
5547,Trump could shoot someone on the Senate floor ...,memes_5039.png,,['donald trump'],,"['senate floor', 'republican']"
5548,MANY PEOPLE ASK ME WHY ALL MY SCHOOL RECORDS A...,memes_2635.png,,,,"['school', 'university', 'joe biden']"
5549,my bes friend my mother consclence my therapis...,memes_1384.png,,,,"['msnbc', 'bernie sanders', 'democratic party'..."
5550,THE N-WORD PASS Signed and approved by Beak Ob...,memes_944.png,,,,['barack obama']
5551,Biden-Obama Memes 300. Funny & Hillarious Meme...,memes_982.png,,,,"['biden obama meme', 'john robinson', 'memes',..."


In [None]:
# Cleaning data
train_df['OCR'] = train_df['OCR'].fillna("")
train_df['hero'] = train_df['hero'].fillna({i: [] for i in train_df.index})
train_df['villain'] = train_df['villain'].fillna({i: [] for i in train_df.index})
train_df['victim'] = train_df['victim'].fillna({i: [] for i in train_df.index})
train_df['other'] = train_df['other'].fillna({i: [] for i in train_df.index})
train_df

Unnamed: 0,OCR,image,hero,villain,victim,other
0,Bernie or Elizabeth? Be informed.Compare them ...,covid_memes_18.png,[],[],[],"['bernie sanders', 'elizabeth warren']"
1,Extending the Brexit deadline until October 31...,covid_memes_19.png,[],['uk government'],[],[]
2,kwai gkwa 0964 #nnevvy applause to Thais from ...,covid_memes_252.png,['thais'],[],[],['hong kong']
3,"So, I order this foce mask to protect ogainst ...",covid_memes_255.png,[],['china'],[],"['face mask', 'made in china', 'coronavirus']"
4,best candidate for JA 2020 joe biden Kamala ha...,covid_memes_20.png,['joe biden'],[],[],"['bernie sanders', 'kamala harris', 'tiktok']"
...,...,...,...,...,...,...
5547,Trump could shoot someone on the Senate floor ...,memes_5039.png,[],['donald trump'],[],"['senate floor', 'republican']"
5548,MANY PEOPLE ASK ME WHY ALL MY SCHOOL RECORDS A...,memes_2635.png,[],[],[],"['school', 'university', 'joe biden']"
5549,my bes friend my mother consclence my therapis...,memes_1384.png,[],[],[],"['msnbc', 'bernie sanders', 'democratic party'..."
5550,THE N-WORD PASS Signed and approved by Beak Ob...,memes_944.png,[],[],[],['barack obama']


In [None]:
# OCR text of the row labelled 0.
train_df.loc[0, 'OCR']

'Bernie or Elizabeth? Be informed.Compare them on the issues that matter. Issue: Who makes the dankest memes? '

In [None]:
import re
import nltk.corpus
# Fetch the NLTK resources used below: the stop-word corpus and the model
# behind nltk.pos_tag.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
from nltk.corpus import stopwords

# Clean-up pattern, compiled once instead of on every call. Each match is
# replaced by a single space.
# NOTE(review): in the first alternative `\[` is an escaped, literal "[", so it
# does not match ordinary "@name" mentions; the "@" is instead stripped by the
# second alternative and the name itself is kept. Left unchanged on purpose so
# existing results do not shift; confirm whether `@[A-Za-z0-9]+` was intended.
_OCR_NOISE_RE = re.compile(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")

# Filled on first use so the stop-word corpus is read once, not once per call.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def sentences_to_nouns(sentence):
    """Extract the noun tokens from a piece of text.

    Steps: lower-case the text, replace everything matched by the clean-up
    pattern with spaces, drop English stop words, POS-tag the remaining
    whitespace-separated tokens and keep those whose tag starts with ``NN``.

    Args:
        sentence (str): raw text, e.g. the OCR string of one meme.

    Returns:
        list[str]: noun tokens in their original order (duplicates kept);
        an empty list when no tokens survive cleaning.
    """
    # Case normalization and removal of unwanted characters.
    cleaned = _OCR_NOISE_RE.sub(" ", sentence.lower())

    # Tokenize and remove stop words in one pass.
    stop_words = _english_stopwords()
    tokens = [word for word in cleaned.split() if word not in stop_words]
    if not tokens:
        return []

    # Tag once (the tagger was previously run twice per call, with the first
    # result thrown away) and keep the noun tags (NN, NNS, NNP, ...).
    return [word for word, tag in nltk.pos_tag(tokens) if tag[:2] == 'NN']

In [None]:
# One noun list per meme, in the same order as the rows of train_df.
entities = [sentences_to_nouns(ocr_text) for ocr_text in train_df['OCR']]

In [None]:
# Number of noun lists collected (one was appended per row of train_df).
len(entities)

5552

In [None]:
# Nouns extracted from the first row's OCR text.
entities[0]

['bernie', 'elizabeth', 'issues', 'issue', 'memes']

In [None]:
########################################
## DISABLED CELL: everything between the triple quotes below is a single bare
## string literal, so none of it is executed.
## Original author's note: the code runs for approx 01:30 hrs.
## NOTE(review): the string references `unique_entities`, which is not defined
## anywhere in the visible part of this notebook - confirm before re-enabling.
#########################################

"""


import concurrent.futures
from tqdm import tqdm

# Find sentences according to entities
enty_sent_dict = {}

# for entity in enty_sent_dict:
#     enty_sent_dict[entity] = []

def find_enty_sent(entity):
    temp_sent_list = []
    for idx,val in train_df.iterrows():
        sentence = val.get('OCR').lower()
        sentence = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", sentence)
        if re.search(entity,sentence):
            temp_sent_list.append(sentence)
    enty_sent_dict[entity] = temp_sent_list
    return True

def run():
    with concurrent.futures.ThreadPoolExecutor() as executor:
        result = list(tqdm(executor.map(find_enty_sent,unique_entities),total=len(unique_entities)))
    return result
    # for idx,res in enumerate(result):
    #     print(f'{idx} : {res}')
run()


"""

100%|██████████| 10112/10112 [1:14:03<00:00,  2.28it/s]


[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,

In [None]:
# enty_sent_dict = json.dumps()

In [None]:
import concurrent.futures
from tqdm import tqdm

In [None]:
# Split every meme's lower-cased OCR text into rough "sentences" on full stops.
# The per-row print was dropped: it emitted one line for each of the rows and
# flooded the cell output; inspect individual entries (e.g. sentences[0]) instead.
# NOTE(review): splitting on "." also breaks tokens such as "boredpanda.com"
# into separate chunks.
sentences = [ocr_text.lower().split(".") for ocr_text in train_df['OCR']]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
["you'll be safe from the coronavirus when you socially distance yourself by living in a van down by the river! "]
["covid-19 doesn't affect the youngs you're not young anymore boredpanda", 'com ']
["os gonna catch chlamydia most of y'all before you catch coronavirus", ' sn', ' ']
["you still believe what your government tells you? that's just adorable "]
['how the media creates panic behind camera in front of camera notice anything different? ']
['if it works, it works', ' cough fiher ']
["i didn't mention you said 14 april which year "]
['2019 rain to ipl 2020 corona to ipl ']
["con irea of odarantine ian wwife's idea or duaranmre "]
['last five minutes ']
['literally anything that the government or mainstream media says about anything me: ']
['"vaccine comes out breaki new trump has nothing to do with this vaccine coming so quicklyi "people die from the vaccine breaki new this failed vaccine is all trump\'s fault and h

In [None]:
# Number of per-meme sentence lists (one was appended per row of train_df).
len(sentences)

5552

In [None]:
# Sentence chunks of the first meme.
print(sentences[0])

['bernie or elizabeth? be informed', 'compare them on the issues that matter', ' issue: who makes the dankest memes? ']


In [None]:
# Sentence chunks at position 5551, i.e. the last entry given the length of
# 5552 reported above.
print(sentences[5551])

['biden-obama memes 300', ' funny & hillarious memes of 2016 john robinson ']


In [None]:
# Print each meme's position followed by its noun list, one item per line.
for position, noun_list in enumerate(entities):
  print(position, noun_list, sep="\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
3052
['cmon', 'gotta', 'print', 'birth', 'certificate', 'office', 'desk', 'joe', 'retweets', 'pm', 'nov']
3053
['conventions', 'positivity', 'nothing', 'buthate', 'negativity', 'trump', 'vote', 'country', 'com']
3054
['policy', 'mess', 'states', 'emaflin', 'com']
3055
['bidenthoughts', 'oranges', 'lemons', 'yellows']
3056
['party', 'scumand', 'villainy', 'makeameme', 'org']
3057
['party', 'candidate', 'dr', 'jill', 'interview', 'jill', 'gallery', 'meme']
3058
['guns', 'people', 'sense', 'uno']
3059
['greenparty', 'see', 'imgfip', 'com']
3060
['smell', 'desperation', 'dasnos', 'party', 'fragrance', 'democrats']
3061
['party', 'gay', 'rights', 'andlet', 'yup', 'makeamemuore']
3062
['dog', 'president']
3063
['greta', 'thunberg', 'planet', 'party', 'plane', 'celebrity', 'planet', 'trees', 'planet', 'ifunny', 'co']
3064
['retroc', 'biden', 'avenger', 'joe', 'biden']
3065
['party', 'healthcare', 'people']
3066
['cnn', 'news', '

In [None]:
# Dry run on the first ten memes: for each extracted noun, collect the
# sentence chunks of the same meme in which the noun is found.
enty_sent_dict = {}

for meme_idx, meme_entities in enumerate(entities[0:10]):
  print(meme_idx)
  matches_by_entity = {
      noun: [chunk for chunk in sentences[meme_idx] if re.search(noun, chunk)]
      for noun in meme_entities
  }
  print(matches_by_entity)
  # Results are keyed by the meme's image file name.
  enty_sent_dict[train_df['image'][meme_idx]] = matches_by_entity

print("\n Whole dictionary = ")
print(enty_sent_dict)

0
{'bernie': ['bernie or elizabeth? be informed'], 'elizabeth': ['bernie or elizabeth? be informed'], 'issues': ['compare them on the issues that matter'], 'issue': ['compare them on the issues that matter', ' issue: who makes the dankest memes? '], 'memes': [' issue: who makes the dankest memes? ']}
1
{'brexit': ['extending the brexit deadline until october 31st in order to ensure a deal everyone can agree with'], 'deadline': ['extending the brexit deadline until october 31st in order to ensure a deal everyone can agree with', ' using the extension to elect a new prime minister and then take a recess until one month before the deadline imgflip'], 'order': ['extending the brexit deadline until october 31st in order to ensure a deal everyone can agree with'], 'deal': ['extending the brexit deadline until october 31st in order to ensure a deal everyone can agree with'], 'everyone': ['extending the brexit deadline until october 31st in order to ensure a deal everyone can agree with'], 'ag

In [None]:
""" For whole data """


def link_entities_to_sentences(entity_lists, sentence_lists, image_names):
    """Map every meme's entities to the sentence chunks that contain them.

    Args:
        entity_lists: iterable with one list of entity strings per meme.
        sentence_lists: iterable with one list of sentence strings per meme,
            aligned with ``entity_lists``.
        image_names: iterable of image file names, aligned with the other two;
            used as the keys of the result.

    Returns:
        dict: ``{image_name: {entity: [sentences containing entity]}}``.
        An entity that occurs several times in one meme yields a single key.
    """
    links_by_image = {}
    # zip pairs the three sequences positionally, so this does not depend on
    # train_df carrying a default 0..n-1 index.
    for image_name, meme_entities, meme_sentences in zip(image_names, entity_lists, sentence_lists):
        links_by_image[image_name] = {
            # Plain substring test: the entity is literal text, not a regex,
            # so it must not be handed to re.search unescaped.
            # NOTE(review): substring matching also links e.g. 'issue' to a
            # sentence that only contains 'issues'; confirm this is wanted.
            entity: [sentence for sentence in meme_sentences if entity in sentence]
            for entity in meme_entities
        }
    return links_by_image


enty_sent_dict = link_entities_to_sentences(entities, sentences, train_df['image'])

In [None]:
# Printing the complete dictionary exceeded the notebook's IOPub data rate
# limit and produced no usable output, so only a short preview is shown.
PREVIEW_SIZE = 5
print(f"Whole dictionary has {len(enty_sent_dict)} memes; first {PREVIEW_SIZE} = ")
for image_name in list(enty_sent_dict)[:PREVIEW_SIZE]:
    print(image_name, enty_sent_dict[image_name])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
import json

# Persist the entity-to-sentence links as indented JSON inside the dataset folder.
linking_json_path = os.path.join(DATASET_PATH, 'enty_sent_linking.json')
with open(linking_json_path, 'w') as out_file:
    out_file.write(json.dumps(enty_sent_dict, indent=4))

In [None]:
# Number of memes (image names) present in the linking dictionary.
print("length of dictionary = ", len(enty_sent_dict))

length of dictionary =  5552
