In [1]:
!python3 --version

Python 3.8.10


In [2]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly as plty
import cufflinks as cf

from plotly.offline import iplot

plty.offline.init_notebook_mode(connected=True)
cf.go_offline()

%matplotlib inline

In [3]:
# Dataset Paths
DATASET_PATH = Path('/mnt/DATA/fharookshaik/major_project/dataset')
TRAIN_IMAGES_DIR_PATH = os.path.join(DATASET_PATH,'Train','images')

TRAIN_CSV_PATH = os.path.join(DATASET_PATH,'Train','train.csv')
VALIDATE_CSV_PATH = os.path.join(DATASET_PATH,'Train','val.csv')

TEST_IMAGES_DIR_PATH = os.path.join(DATASET_PATH,'Test','images')
TEST_CSV_PATH = os.path.join(DATASET_PATH,'Test','test.csv')

Analysing Train Data

In [4]:
train_df = pd.read_csv(TRAIN_CSV_PATH)
train_df.head()

Unnamed: 0,OCR,image,hero,villain,victim,other
0,Bernie or Elizabeth? Be informed.Compare them ...,covid_memes_18.png,,,,"['bernie sanders', 'elizabeth warren']"
1,Extending the Brexit deadline until October 31...,covid_memes_19.png,,['uk government'],,
2,kwai gkwa 0964 #nnevvy applause to Thais from ...,covid_memes_252.png,['thais'],,,['hong kong']
3,"So, I order this face mask to protect against ...",covid_memes_255.png,,['china'],,"['face mask', 'made in china', 'coronavirus']"
4,best candidate for JA 2020 joe biden Kamala ha...,covid_memes_20.png,['joe biden'],,,"['bernie sanders', 'kamala harris', 'tiktok']"


In [5]:
train_df.shape

(5552, 6)

In [6]:
train_df.columns

Index(['OCR', 'image', 'hero', 'villain', 'victim', 'other'], dtype='object')

In [7]:
train_df.drop(labels=['image'],axis=1,inplace=True)
train_df.head()

Unnamed: 0,OCR,hero,villain,victim,other
0,Bernie or Elizabeth? Be informed.Compare them ...,,,,"['bernie sanders', 'elizabeth warren']"
1,Extending the Brexit deadline until October 31...,,['uk government'],,
2,kwai gkwa 0964 #nnevvy applause to Thais from ...,['thais'],,,['hong kong']
3,"So, I order this face mask to protect against ...",,['china'],,"['face mask', 'made in china', 'coronavirus']"
4,best candidate for JA 2020 joe biden Kamala ha...,['joe biden'],,,"['bernie sanders', 'kamala harris', 'tiktok']"


In [8]:
# Finding Num of Empty Values
train_df.isnull().sum()

OCR           5
hero       5159
villain    3668
victim     4794
other       497
dtype: int64

In [9]:
# Dropping the data where OCR = NaN
train_df.dropna(subset=['OCR'],inplace=True)

In [10]:
train_df.isnull().sum()

OCR           0
hero       5154
villain    3663
victim     4789
other       497
dtype: int64

In [11]:
ocr_text = ' '.join(train_df['OCR'].tolist())
ocr_text[:1000]

'Bernie or Elizabeth? Be informed.Compare them on the issues that matter. Issue: Who makes the dankest memes?  Extending the Brexit deadline until October 31st in order to ensure a deal everyone can agree with. Using the extension to elect a new prime minister and then take a recess until one month before the deadline imgflip.com  kwai gkwa 0964 #nnevvy applause to Thais from Hong Kong WHY THAIS DID NOT GET HURT? CULTURAL DIFFERENCE JUST STAY IN THE WALL  So, I order this face mask to protect against the Corona virus Hooold up, it says something here. "Made in China"  best candidate for JA 2020 joe biden Kamala harris Bernie sanders TikTok @eliguthrie  BREAKING NEWS! EL CH AND MANNEQUINS THEN HE BOUGHT LINGERIE FOR NEWS AWOMAN WHO WAS SEEN OUTSIDE THE STORE WEARING A MAN WEARING A DONALD TRUMP MASK WENT INTO A VICTORIA SECRET STORE OLGED FEMALE CUSTOMERS HAN A MEIANIA TRUMP MASKWHO WASWAITING FOR HIM TO COME OUT SAM WE HAVE THE SURVEILLANCE imgfip.com FILM AT 11 Ron Burgundy Meme - Img

In [12]:
# Text Cleaning
contractions = { 
  "ain't": "am not",
  "aren't": "are not",
  "cant":"cannot",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "bc": "because",
  "becos":"because",
  "cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "corp": "corporation",
  "cud":"could",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "execs": "executives",
  "fck": "fuck",
  "fcking": "fucking",
  "gon na": "going to",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "im": "i am",
  "iam": "i am",
  "i'd": "I would",
  "i'd've": "I would have",
  "i'll": "I will",
  "i'll've": "I will have",
  "i'm": "I am",
  "i've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "mgr": "manager",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "ofc": "office",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "pics": "pictures",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "svc":"service",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "tho":"though",
  "to've": "to have",
  "wan na": "want to",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you you will",
  "you'll've": "you you will have",
  "you're": "you are",
  "you've": "you have"
}

In [13]:
def cont_to_exp(x):
    if type(x) is str:
        x = x.replace('\\','')
        for key in contractions:
            val = contractions[key]
            x = x.replace(key,val)
        return x
    return x

In [14]:
%%time
train_df['OCR'] = train_df['OCR'].apply(lambda x: cont_to_exp(x))

CPU times: user 383 ms, sys: 0 ns, total: 383 ms
Wall time: 382 ms


In [15]:
train_df.head(10)

Unnamed: 0,OCR,hero,villain,victim,other
0,Bernie or Elizabeth? Be informed.Compare them ...,,,,"['bernie sanders', 'elizabeth warren']"
1,Extending the Brexit deadline until October 31...,,['uk government'],,
2,kwai gkwa 0964 #nnevvy applause to Thais from ...,['thais'],,,['hong kong']
3,"So, I order this face mask to protect against ...",,['china'],,"['face mask', 'made in china', 'coronavirus']"
4,best candidate for JA 2020 joe biden Kamala ha...,['joe biden'],,,"['bernie sanders', 'kamala harris', 'tiktok']"
5,BREAKING NEWS! EL CH AND MANNEQUINS THEN HE BO...,,['donald trump'],,"['', 'victoria secret store', 'melania trump',..."
6,MASK TRUMP TRUMP DEN CE Help your child find t...,['donald trump'],['barack obama'],,['halloween']
7,#Hantavirus China discovered new virus Whole w...,,['china'],"['people', 'world']",['hantavirus']
8,When it is finally 2020 but you realize that b...,,,,"['china', 'plague', 'coronavirus']"
9,nese NewYear Chira's attermpt at controlling a...,,['wuhan coronavirus'],['china'],"['chinese new year', 'coronavirus', 'chinese']"


In [16]:
train_df['ocr_len'] = train_df['OCR'].apply(lambda x:len(x))
train_df

Unnamed: 0,OCR,hero,villain,victim,other,ocr_len
0,Bernie or Elizabeth? Be informed.Compare them ...,,,,"['bernie sanders', 'elizabeth warren']",109
1,Extending the Brexit deadline until October 31...,,['uk government'],,,225
2,kwai gkwa 0964 #nnevvy applause to Thais from ...,['thais'],,,['hong kong'],126
3,"So, I order this face mask to protect against ...",,['china'],,"['face mask', 'made in china', 'coronavirus']",114
4,best candidate for JA 2020 joe biden Kamala ha...,['joe biden'],,,"['bernie sanders', 'kamala harris', 'tiktok']",85
...,...,...,...,...,...,...
5547,Trump could shoot someone on the Senate floor ...,,['donald trump'],,"['senate floor', 'republican']",129
5548,MANY PEOPLE ASK ME WHY ALL MY SCHOOL RECORDS A...,,,,"['school', 'university', 'joe biden']",185
5549,my bes friend my mother consclence my therapis...,,,,"['msnbc', 'bernie sanders', 'democratic party'...",135
5550,THE N-WORD PASS Signed and approved by Beak Ob...,,,,['barack obama'],50


In [17]:
train_df['word_count'] = train_df['OCR'].apply(lambda x:len(x.split()))
train_df

Unnamed: 0,OCR,hero,villain,victim,other,ocr_len,word_count
0,Bernie or Elizabeth? Be informed.Compare them ...,,,,"['bernie sanders', 'elizabeth warren']",109,17
1,Extending the Brexit deadline until October 31...,,['uk government'],,,225,40
2,kwai gkwa 0964 #nnevvy applause to Thais from ...,['thais'],,,['hong kong'],126,23
3,"So, I order this face mask to protect against ...",,['china'],,"['face mask', 'made in china', 'coronavirus']",114,21
4,best candidate for JA 2020 joe biden Kamala ha...,['joe biden'],,,"['bernie sanders', 'kamala harris', 'tiktok']",85,13
...,...,...,...,...,...,...,...
5547,Trump could shoot someone on the Senate floor ...,,['donald trump'],,"['senate floor', 'republican']",129,23
5548,MANY PEOPLE ASK ME WHY ALL MY SCHOOL RECORDS A...,,,,"['school', 'university', 'joe biden']",185,36
5549,my bes friend my mother consclence my therapis...,,,,"['msnbc', 'bernie sanders', 'democratic party'...",135,22
5550,THE N-WORD PASS Signed and approved by Beak Ob...,,,,['barack obama'],50,9


In [18]:
def get_avg_word_len(x):
    words = x.split()
    word_len = 0
    for word in words:
        word_len += len(word)
    return word_len/len(words)

In [19]:
train_df['avg_word_len'] = train_df['OCR'].apply(lambda x: get_avg_word_len(x))
train_df

Unnamed: 0,OCR,hero,villain,victim,other,ocr_len,word_count,avg_word_len
0,Bernie or Elizabeth? Be informed.Compare them ...,,,,"['bernie sanders', 'elizabeth warren']",109,17,5.411765
1,Extending the Brexit deadline until October 31...,,['uk government'],,,225,40,4.625000
2,kwai gkwa 0964 #nnevvy applause to Thais from ...,['thais'],,,['hong kong'],126,23,4.478261
3,"So, I order this face mask to protect against ...",,['china'],,"['face mask', 'made in china', 'coronavirus']",114,21,4.428571
4,best candidate for JA 2020 joe biden Kamala ha...,['joe biden'],,,"['bernie sanders', 'kamala harris', 'tiktok']",85,13,5.538462
...,...,...,...,...,...,...,...,...
5547,Trump could shoot someone on the Senate floor ...,,['donald trump'],,"['senate floor', 'republican']",129,23,4.608696
5548,MANY PEOPLE ASK ME WHY ALL MY SCHOOL RECORDS A...,,,,"['school', 'university', 'joe biden']",185,36,4.138889
5549,my bes friend my mother consclence my therapis...,,,,"['msnbc', 'bernie sanders', 'democratic party'...",135,22,5.136364
5550,THE N-WORD PASS Signed and approved by Beak Ob...,,,,['barack obama'],50,9,4.555556


In [20]:
train_df['ocr_len'].iplot(kind='hist')

In [21]:
train_df['word_count'].iplot(kind='hist')

In [22]:
!pip3 install sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m590.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1315 sha256=05d2b026a959614d76bbf5d397a6d515996602adfd9b6c067e63601916c9f2ee
  Stored in directory: /home/fharookshaik/.cache/pip/wheels/22/0b/40/fd3f795caaa1fb4c6cb738bc1f56100be1e57da95849bfc897
Successfully built sklearn
Installing collected packages: threadpoolctl, scikit-learn, sklearn
Successfully installed scikit-learn-1.0.2 sklearn-0.0 t

In [23]:
# Distribution of Unigram, Bigram, Trigram
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
def get_top_n_words(x,n):
    vec = CountVectorizer(stop_words='english').fit(x)
    bow = vec.transform(x)
    sum_words = bow.sum(axis=0)
    word_freq = [(word, sum_words[0,idx]) for word,idx in vec.vocabulary_.items()]
    word_freq = sorted(word_freq,key=lambda x: x[1], reverse=True)
    return word_freq[:n]

In [27]:
words = get_top_n_words(train_df['OCR'],20)
words

[('party', 956),
 ('trump', 730),
 ('biden', 524),
 ('joe', 464),
 ('obama', 446),
 ('com', 441),
 ('coronavirus', 430),
 ('virus', 420),
 ('home', 398),
 ('people', 390),
 ('like', 369),
 ('covid', 337),
 ('2020', 310),
 ('just', 307),
 ('work', 303),
 ('republican', 299),
 ('corona', 282),
 ('mask', 270),
 ('19', 270),
 ('president', 251)]

In [28]:
df1 = pd.DataFrame(words,columns=['Unigram','Frequency'])
df1 = df1.set_index('Unigram')
df1.iplot(kind='bar')

In [29]:
def get_top_n_words_bigram(x,n):
    vec = CountVectorizer(ngram_range=(2,2),stop_words='english').fit(x)
    bow = vec.transform(x)
    sum_words = bow.sum(axis=0)
    word_freq = [(word, sum_words[0,idx]) for word,idx in vec.vocabulary_.items()]
    word_freq = sorted(word_freq,key=lambda x: x[1], reverse=True)
    return word_freq[:n]

In [30]:
bigram_words = get_top_n_words_bigram(train_df['OCR'],20)
bigram_words

[('covid 19', 236),
 ('republican party', 220),
 ('democratic party', 208),
 ('joe biden', 188),
 ('work home', 168),
 ('corona virus', 163),
 ('donald trump', 155),
 ('libertarian party', 155),
 ('ti ame', 125),
 ('green party', 120),
 ('obama joe', 102),
 ('amgflip com', 90),
 ('working home', 80),
 ('wuhan virus', 75),
 ('barack obama', 73),
 ('government work', 64),
 ('presidential debate', 58),
 ('lp lp', 55),
 ('toilet paper', 49),
 ('wear mask', 48)]

In [31]:
df2 = pd.DataFrame(bigram_words,columns=['Bigram','Frequency'])
df2 = df2.set_index('Bigram')
df2.iplot(kind='bar')

In [32]:
def get_top_n_words_trigram(x,n):
    vec = CountVectorizer(ngram_range=(3,3),stop_words='english').fit(x)
    bow = vec.transform(x)
    sum_words = bow.sum(axis=0)
    word_freq = [(word, sum_words[0,idx]) for word,idx in vec.vocabulary_.items()]
    word_freq = sorted(word_freq,key=lambda x: x[1], reverse=True)
    return word_freq[:n]

In [33]:
trigram_words = get_top_n_words_trigram(train_df['OCR'],20)
trigram_words

[('government work home', 63),
 ('lp lp lp', 49),
 ('post involved locally', 39),
 ('involved locally ip', 39),
 ('locally ip org', 39),
 ('ip org states', 39),
 ('thanks libertarian party', 34),
 ('obama joe biden', 29),
 ('deadly wuhan virus', 24),
 ('experts say deadly', 21),
 ('say deadly wuhan', 21),
 ('virus killed alcohol', 21),
 ('wuhan virus killed', 19),
 ('party republican party', 19),
 ('make america great', 17),
 ('democratic party republican', 17),
 ('20 twitter iphone', 15),
 ('barack obama memes', 15),
 ('ti ame home', 14),
 ('1820 1920 massive', 13)]

In [34]:
df3 = pd.DataFrame(trigram_words,columns=['Trigram','Frequency'])
df3 = df3.set_index('Trigram')
df3.iplot(kind='bar')

In [36]:
# Distribution of Top 20 Parts os Speech POS tags
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from textblob import TextBlob

[nltk_data] Downloading package punkt to
[nltk_data]     /home/fharookshaik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/fharookshaik/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [38]:
blob = TextBlob(str(train_df['OCR']))
blob

TextBlob("0       Bernie or Elizabeth? Be informed.Compare them ...
1       Extending the Brexit deadline until October 31...
2       kwai gkwa 0964 #nnevvy applause to Thais from ...
3       So, I order this face mask to protect against ...
4       best candidate for JA 2020 joe biden Kamala ha...
                              ...                        
5547    Trump could shoot someone on the Senate floor ...
5548    MANY PEOPLE ASK ME WHY ALL MY SCHOOL RECORDS A...
5549    my bes friend my mother consclence my therapis...
5550    THE N-WORD PASS Signed and approved by Beak Ob...
5551    Biden-Obama Memes 300. Funny & Hillarious Meme...
Name: OCR, Length: 5547, dtype: object")

In [39]:
blob.tags

[('0', 'CD'),
 ('Bernie', 'NNP'),
 ('or', 'CC'),
 ('Elizabeth', 'NNP'),
 ('Be', 'NNP'),
 ('informed.Compare', 'VBP'),
 ('them', 'PRP'),
 ('1', 'CD'),
 ('Extending', 'VBG'),
 ('the', 'DT'),
 ('Brexit', 'NNP'),
 ('deadline', 'NN'),
 ('until', 'IN'),
 ('October', 'NNP'),
 ('31', 'CD'),
 ('2', 'CD'),
 ('kwai', 'NN'),
 ('gkwa', 'NN'),
 ('0964', 'CD'),
 ('nnevvy', 'JJ'),
 ('applause', 'NN'),
 ('to', 'TO'),
 ('Thais', 'VB'),
 ('from', 'IN'),
 ('3', 'CD'),
 ('So', 'NNP'),
 ('I', 'PRP'),
 ('order', 'NN'),
 ('this', 'DT'),
 ('face', 'NN'),
 ('mask', 'NN'),
 ('to', 'TO'),
 ('protect', 'VB'),
 ('against', 'IN'),
 ('4', 'CD'),
 ('best', 'JJS'),
 ('candidate', 'NN'),
 ('for', 'IN'),
 ('JA', 'NNP'),
 ('2020', 'CD'),
 ('joe', 'NN'),
 ('biden', 'NN'),
 ('Kamala', 'NNP'),
 ('ha', 'NN'),
 ('5547', 'CD'),
 ('Trump', 'NNP'),
 ('could', 'MD'),
 ('shoot', 'VB'),
 ('someone', 'NN'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('Senate', 'NNP'),
 ('floor', 'NN'),
 ('5548', 'CD'),
 ('MANY', 'JJ'),
 ('PEOPLE', 'NNP'),
 ('AS

In [40]:
pos_df = pd.DataFrame(blob.tags,columns=['words','pos'])
pos_df

Unnamed: 0,words,pos
0,0,CD
1,Bernie,NNP
2,or,CC
3,Elizabeth,NNP
4,Be,NNP
...,...,...
92,OCR,NNP
93,Length,NNP
94,5547,CD
95,dtype,NN


In [41]:
pos_df = pos_df['pos'].value_counts()

In [42]:
pos_df.iplot(kind='bar')