In [10]:
import pickle
import re
import string
import warnings
from collections import Counter, defaultdict

import cache_magic
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from langdetect import detect

pd.set_option('display.max_colwidth',100)
warnings.filterwarnings('ignore')

In [18]:
df = pd.read_excel('PM_MMS_Speech.xlsx',dtype={'title':'string',
                                               'date':'datetime64',
                                               'place':'string',
                                               'url':'string',
                                               'text':'string'})

In [19]:
df.head()

Unnamed: 0,title,date,place,url,text
0,PM's address to the Nation,2012-09-21,New Delhi,https://archivepmo.nic.in/drmanmohansingh/speech-details.php?nodeid=1226,"My dear brothers and sisters, 	I am speaking to you tonight to explain the reasons for some ..."
1,PM's remarks at the Victory over Polio Celebrations,2014-02-11,New Delhi,https://archivepmo.nic.in/drmanmohansingh/speech-details.php?nodeid=1434,“This is indeed a historic day. It is a day that we have worked for tirelessly and awaited eq...
2,PM’s speech at the Governors’ Conference,2014-02-15,New Delhi,https://archivepmo.nic.in/drmanmohansingh/speech-details.php?nodeid=1435,"Following is the text of the Prime Minister, Dr. Manmohan Singh’s address at the Governors’ C..."
3,PM's statement in Rajya Sabha on the Telangana Bill and a special package for the successor stat...,2014-02-20,New Delhi,https://archivepmo.nic.in/drmanmohansingh/speech-details.php?nodeid=1436,I have listened very carefully to the views expressed by the Leader of Opposition and all ...
4,PM’s interaction with media outside Parliament House,2014-02-05,New Delhi,https://archivepmo.nic.in/drmanmohansingh/speech-details.php?nodeid=1429,"""As you have said, this is probably the last session of Parliament, and it is my sincere appe..."


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1380 entries, 0 to 1379
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   title   1380 non-null   string        
 1   date    1380 non-null   datetime64[ns]
 2   place   1380 non-null   string        
 3   url     1380 non-null   string        
 4   text    1335 non-null   string        
dtypes: datetime64[ns](1), string(4)
memory usage: 54.0 KB


In [None]:
df.dropna(inplace= True)#Drops rows with null values inplace
df.reset_index(inplace= True,drop=True)

In [22]:
small_speech = (df.text.str.len()<500)
small_speech.value_counts()

False    1313
True       22
Name: text, dtype: Int64

In [None]:
#Dropping speeches which are small
df.drop(labels = np.flatnonzero(small_speech),inplace = True)
df.reset_index(inplace= True,drop=True)

In [30]:
#Checking the language of the speeches
df_language = df.text.apply(detect)
df_language.value_counts(dropna=False)

en    1294
hi      19
Name: text, dtype: int64

In [34]:
hindi_indices = np.flatnonzero(df_language == 'hi')

In [39]:
df.text[hindi_indices][:5]

6       
	Following is the speech of the Prime Minister, Dr. Manmohan Singh, delivered in Hindi on the ...
1010    
	Following is the address of the Prime Minister, Dr. Manmohan Singh, delivered in Hindi, at Ra...
1192    
	Following is the address of the Prime Minister, Dr. Manmohan Singh, delivered in Hindi, at th...
1202    
	Following is the address of the Prime Minister, Dr. Manmohan Singh, delivered in Hindi, at th...
1208    
	Following is the address of the Prime Minister, Dr. Manmohan Singh, delivered in Hindi, at th...
Name: text, dtype: string

In [40]:
#Dropping hindi speeches
df.drop(labels = hindi_indices,inplace = True)
df.reset_index(inplace= True,drop=True)

In [41]:
#removing unicode characters and extra spaces
def remove_unicode(text):
    return text.encode('ascii', 'ignore').decode('utf8')

def remove_tabs(text):
    return re.sub(r'[\r\n\t]', ' ', text)

In [45]:
df['clean_text'] = df.text.apply(remove_unicode).apply(remove_tabs)

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1294 entries, 0 to 1293
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   title       1294 non-null   string        
 1   date        1294 non-null   datetime64[ns]
 2   place       1294 non-null   string        
 3   url         1294 non-null   string        
 4   text        1294 non-null   string        
 5   clean_text  1294 non-null   object        
dtypes: datetime64[ns](1), object(1), string(4)
memory usage: 60.8+ KB


In [71]:
#Concatenating all speeches to form a single string corpus_text
corpus_text = df.clean_text.str.cat(sep=' ')

In [49]:
#Writing corpus_text to a file
with open("corpus_raw2.txt", "w") as file:  
    file.write(corpus_text) 

In [53]:
#Function to view dictionary items
def view_dict(dictionary,num):
    for key in list(dictionary.keys())[:num]:
        value = dictionary[key]
        print (key,value)

In [55]:
#Loading Embeddings - Code taken from kaggle notebook
def load_embed(file):
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='latin'))
    return embeddings_index

glove = "glove.840B.300d/glove.840B.300d.txt"

embed_glove = load_embed(glove)

In [72]:
print(list(embed_glove.keys())[:10])

[',', '.', 'the', 'and', 'to', 'of', 'a', 'in', '"', ':']


In [82]:
words = ['cant', "can't",'ca','nt',"n't" ,"'s",'her.','her','Her','##',' ', '\n','.', ',' ,'#.#']
print ([word in embed_glove for word in words])

[True, True, True, True, True, True, True, True, True, True, False, False, True, True, False]


In [74]:
#Code taken from kaggle notebook
#Code to check how much of vocabulary and corpus is covered by the words in glove
#returns words which are not in glove vocab in the decraesing order of their occurence count
def check_coverage(vocab, embeddings_index):
    known_words = {}
    unknown_words = {}
    num_known_words = 0
    num_unknown_words = 0
    for word in vocab.keys():
        try:
            #if a vocabulary word is in glove, then adding that word to known_words and increasing num_known_words count by 1
            known_words[word] = embeddings_index[word]
            num_known_words += vocab[word]
        except:
            #if a vocabulary word is not in glove, then adding that word to unknown_words
            # and increasing num_unknown_words count by 1
            unknown_words[word] = vocab[word]
            num_unknown_words += vocab[word]
            

    print('Found embeddings for {:.2%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(num_known_words / (num_known_words + num_unknown_words)))
    unknown_words = sorted(unknown_words.items(), key=lambda kv: kv[1], reverse=True)

    return unknown_words

In [50]:
#Splitting corpus text into words. Using string split as tokenizer
#Note from future self - Using string split is a blunder. Could've used any tokenizer
#But this blunder helped you in learning regex
corpus = corpus_text.split()
print(corpus[:10])
len(corpus)

['My', 'dear', 'brothers', 'and', 'sisters,', 'I', 'am', 'speaking', 'to', 'you', 'tonight', 'to', 'explain', 'the', 'reasons', 'for', 'some', 'important', 'economic', 'policy', 'decisions', 'the', 'government', 'has', 'recently', 'taken.', 'Some', 'political', 'parties', 'have', 'opposed', 'them.', 'You', 'have', 'a', 'right', 'to', 'know', 'the', 'truth', 'about', 'why', 'we', 'have', 'taken', 'these', 'decisions.', 'government', 'likes', 'to', 'impose', 'burdens', 'on', 'the', 'common', 'man.', 'Our', 'Government', 'has', 'been', 'voted', 'to', 'office', 'twice', 'to', 'protect', 'the', 'interests', 'of', 'the', 'aam', 'admi.', 'At', 'the', 'same', 'time,', 'it', 'is', 'the', 'responsibility', 'of', 'the', 'government', 'to', 'defend', 'the', 'national', 'interest,', 'and', 'protect', 'the', 'long', 'term', 'future', 'of', 'our', 'people.', 'This', 'means', 'that']


1511539

In [75]:
#creating a counter for words and from it vocab
word_count = Counter(corpus)
vocab = dict(word_count)
print(word_count.most_common(10))

[('the', 83156), ('of', 66132), ('and', 54129), ('to', 50481), ('in', 36860), ('a', 26523), ('our', 21120), ('that', 20152), ('is', 19145), ('I', 16399)]


In [66]:
#I think this is a long tail distribution
#All the operations performed below are intended to improve the coverage and reduce the number of unk tokens while feeding model
oov = check_coverage(vocab,embed_glove)
oov[:10]

Found embeddings for 55.18% of vocab
Found embeddings for  93.56% of all text


[("India's", 1016),
 ('However,', 673),
 ('therefore,', 621),
 ('India,', 527),
 ('years,', 463),
 ('sector.', 405),
 ('"I', 392),
 ('time,', 388),
 ('country,', 323),
 ('Today,', 311)]

In [67]:
re.findall(r'"[A-Za-z]+',corpus_text[:100000])

['"No',
 '"Yes',
 '"Let',
 '"As',
 '"I',
 '"This',
 '"Madam',
 '"I',
 '"Who',
 '"I',
 '"It',
 '"the',
 '"Social',
 '"I']

In [116]:
#Separating  quotations
corpus_text_clean = re.sub(r'(")([A-Za-z0-9])', r'\1 \2', corpus_text)#new numbers added
corpus_text_clean = re.sub(r'([A-Za-z0-9])(")', r'\1 \2', corpus_text_clean)#new entire line
corpus_clean = corpus_text_clean.split()
word_count = Counter(corpus_clean)
vocab_clean = dict(word_count)
oov = check_coverage(vocab_clean,embed_glove)
oov[:10]

Found embeddings for 56.73% of vocab
Found embeddings for  93.71% of all text


[("India's", 1018),
 ('However,', 673),
 ('therefore,', 621),
 ('India,', 527),
 ('years,', 463),
 ('".', 416),
 ('sector.', 405),
 ('time,', 388),
 ('country,', 323),
 ('Today,', 311)]

In [117]:
#Separating punctuation maarks from words
corpus_text_clean = re.sub(r'([A-Za-z]+)([\.,\?])',r'\1 \2',corpus_text_clean)
corpus_text_clean = re.sub(r'([.,])([A-Za-z]+)',r'\1 \2',corpus_text_clean)#added new
corpus_clean = corpus_text_clean.split()
word_count = Counter(corpus_clean)
vocab_clean = dict(word_count)
oov = check_coverage(vocab_clean,embed_glove)
oov[:20]

Found embeddings for 83.32% of vocab
Found embeddings for  99.16% of all text


[("India's", 1032),
 ('."', 712),
 ('".', 427),
 ("world's", 141),
 ('",', 127),
 ("country's", 126),
 ("Hon'ble", 121),
 ("today's", 114),
 ("nation's", 104),
 ("people's", 101),
 ("Government's", 86),
 ('Indiraji', 64),
 ("other's", 62),
 ("Gandhiji's", 59),
 ("year's", 58),
 ("one's", 55),
 ('Rajivji', 52),
 ("Minister's", 52),
 ('8%', 51),
 ('9%', 50)]

In [118]:
#Separting 's from the word
corpus_text_clean = re.sub(r'([a-z]+)(\'s)',r'\1 \2',corpus_text_clean)
corpus_clean = corpus_text_clean.split()
word_count = Counter(corpus_clean)
vocab_clean = dict(word_count)
oov = check_coverage(vocab_clean,embed_glove)
oov[:20]

Found embeddings for 84.41% of vocab
Found embeddings for  99.37% of all text


[('."', 712),
 ('".', 427),
 ('",', 127),
 ("Hon'ble", 121),
 ('Indiraji', 78),
 ('Rajivji', 65),
 ('8%', 51),
 ('9%', 50),
 ('2004,', 42),
 ('10%', 38),
 ('50%', 35),
 ('2008,', 33),
 ('7%', 33),
 ('BIMST-EC', 32),
 ('.,', 31),
 ('1991,', 30),
 ('2%', 30),
 ('2005,', 29),
 ('2009,', 29),
 ('2006,', 27)]

In [119]:
re.findall(r'[0-9]+[.,][0-9]?',corpus_text[:100000])

['200,0',
 '1991.',
 '1991,',
 '8.2',
 '1991.',
 '2.3',
 '5.',
 '14,',
 '2013,',
 '2013.',
 '2005.',
 '15.',
 '1814,',
 '1814,',
 '2008,',
 '2004,',
 '2014.',
 '14,',
 '1946,',
 '2006,',
 '2012,',
 '2007,',
 '2008,',
 '1.',
 '2.',
 '3.',
 '4.',
 '5.',
 '6.',
 '2004.',
 '7.',
 '8.',
 '9.',
 '10.',
 '11.',
 '12.',
 '13.',
 '2004,',
 '14.',
 '15.',
 '16.',
 '17.']

In [120]:
re.findall(r'[0-9]+[\S]+',corpus_text[:10000])

['80%',
 '40',
 '200,000',
 '1991.',
 '1991',
 '17',
 '160',
 '50%',
 '1991,',
 '8.2',
 '1991.',
 '2.3']

In [121]:
#removing numbers
#In hindsight - might be a bad choice. Might have replaced them with hashes
corpus_text_clean = re.sub(r'[0-9]+[\S]+',' ',corpus_text_clean)
corpus_clean = corpus_text_clean.split()
word_count = Counter(corpus_clean)
vocab_clean = dict(word_count)
oov = check_coverage(vocab_clean,embed_glove)
oov[:20]

Found embeddings for 86.16% of vocab
Found embeddings for  99.51% of all text


[('."', 712),
 ('".', 427),
 ('",', 127),
 ("Hon'ble", 121),
 ('G-', 90),
 ('Indiraji', 78),
 ('Rajivji', 65),
 ('BIMST-EC', 32),
 ('.,', 31),
 ('Hind!', 25),
 (".'", 24),
 ('?"', 20),
 ('Vidyutikaran', 19),
 ('mid-', 17),
 ('Sharadji', 16),
 ('Nehruji', 16),
 ('Kantji', 15),
 ('Deputy-Speaker', 15),
 ('India`s', 15),
 ("farmers'", 14)]

In [122]:
#Separating punctuation marks from each other
#[word for word,_ in oov if]
corpus_text_clean = re.sub('([\.\,\:\;\"])([\.\,\:\;\"])',r'\1 \2',corpus_text_clean)
corpus_clean = corpus_text_clean.split()
word_count = Counter(corpus_clean)
vocab_clean = dict(word_count)
oov = check_coverage(vocab_clean,embed_glove)
oov[:20]

Found embeddings for 86.24% of vocab
Found embeddings for  99.58% of all text


[('..', 137),
 ("Hon'ble", 121),
 ('G-', 90),
 ('Indiraji', 78),
 ('Rajivji', 65),
 ('BIMST-EC', 32),
 ('Hind!', 25),
 (".'", 24),
 ('?"', 20),
 ('Vidyutikaran', 19),
 ('mid-', 17),
 ('Sharadji', 16),
 ('Nehruji', 16),
 ('Kantji', 15),
 ('Deputy-Speaker', 15),
 ('India`s', 15),
 ('quote:', 14),
 ("farmers'", 14),
 ('to:', 14),
 ('are:', 13)]

In [123]:
#Separating ji from the name
corpus_text_clean = re.sub(r'([A-Z][a-z]+)ji', r'\1',corpus_text_clean)
corpus_clean = corpus_text_clean.split()
word_count = Counter(corpus_clean)
vocab_clean = dict(word_count)
oov = check_coverage(vocab_clean,embed_glove)
oov[:20]

Found embeddings for 86.53% of vocab
Found embeddings for  99.61% of all text


[('..', 137),
 ("Hon'ble", 121),
 ('G-', 90),
 ('BIMST-EC', 32),
 ('Hind!', 25),
 (".'", 24),
 ('?"', 20),
 ('Vidyutikaran', 19),
 ('mid-', 17),
 ('Deputy-Speaker', 15),
 ('India`s', 15),
 ('quote:', 14),
 ("farmers'", 14),
 ('to:', 14),
 ('are:', 13),
 ("'Sarva", 13),
 ("peoples'", 12),
 ('people;', 12),
 ("'Look", 11),
 ('development;', 10)]

In [125]:
corpus_text_clean = re.sub(r'([A-Za-z0-9]+)([:;?!/)/(//])', r'\1 \2 ', corpus_text_clean)
corpus_clean = corpus_text_clean.split()
word_count = Counter(corpus_clean)
vocab_clean = dict(word_count)
oov = check_coverage(vocab_clean,embed_glove)
oov[:20]

Found embeddings for 89.95% of vocab
Found embeddings for  99.71% of all text


[('..', 137),
 ("Hon'ble", 121),
 ('G-', 90),
 ('BIMST-EC', 32),
 ('(Interruptions', 31),
 (".'", 24),
 ('?"', 20),
 ('Vidyutikaran', 19),
 ('mid-', 17),
 ('Deputy-Speaker', 15),
 ('India`s', 15),
 ("farmers'", 14),
 ("'Sarva", 13),
 ("peoples'", 12),
 ("'Look", 11),
 ('(a', 10),
 ('(i', 10),
 ('(ii', 10),
 ('(Prevention', 10),
 ('Hembhai', 10)]

In [None]:
corpus_text_clean = re.sub(r'([A-Za-z0-9]+)([:;?!/)/(//])', r'\1 \2 ', corpus_text_clean)
corpus_clean = corpus_text_clean.split()
word_count = Counter(corpus_clean)
vocab_clean = dict(word_count)
oov = check_coverage(vocab_clean,embed_glove)
oov[:20]

In [134]:
#Separating the brace
corpus_text_clean = re.sub(r'(\()', r' \1  ', corpus_text_clean)
corpus_clean = corpus_text_clean.split()
word_count = Counter(corpus_clean)
vocab_clean = dict(word_count)
oov = check_coverage(vocab_clean,embed_glove)
oov[:20]

Found embeddings for 90.92% of vocab
Found embeddings for  99.74% of all text


[('..', 138),
 ("Hon'ble", 121),
 ('G-', 91),
 ('BIMST-EC', 32),
 (".'", 24),
 ('?"', 20),
 ('Vidyutikaran', 19),
 ('mid-', 17),
 ('Deputy-Speaker', 15),
 ('India`s', 15),
 ("farmers'", 14),
 ("'Sarva", 13),
 ("peoples'", 12),
 ('IISERs', 12),
 ("'Look", 11),
 ('Hembhai', 10),
 ('Jagvan', 9),
 ("workers'", 9),
 ('NH-', 9),
 ('Mahalonobis', 9)]

In [136]:
#Removing serial dots
corpus_text_clean = re.sub(r'\.\.', r' ', corpus_text_clean)
corpus_clean = corpus_text_clean.split()
word_count = Counter(corpus_clean)
vocab_clean = dict(word_count)
oov = check_coverage(vocab_clean,embed_glove)
oov[:20]

Found embeddings for 90.92% of vocab
Found embeddings for  99.75% of all text


[("Hon'ble", 121),
 ('G-', 91),
 ('BIMST-EC', 32),
 (".'", 24),
 ('?"', 20),
 ('Vidyutikaran', 19),
 ('mid-', 17),
 ('Deputy-Speaker', 15),
 ('India`s', 15),
 ("farmers'", 14),
 ("'Sarva", 13),
 ("peoples'", 12),
 ('IISERs', 12),
 ("'Look", 11),
 ('Hembhai', 10),
 ('Jagvan', 9),
 ("workers'", 9),
 ('NH-', 9),
 ('Mahalonobis', 9),
 ('post-', 9)]

In [138]:
#Correcting the abbreviations
corpus_text_clean = re.sub("Hon'ble",'Honourable' ,corpus_text_clean)
corpus_text_clean = re.sub('BIMST-EC','BIMSTEC' ,corpus_text_clean)
corpus_text_clean = re.sub(r'A\s?S\s?E\s?A\s?N','ASEAN',corpus_text_clean)
corpus_clean = corpus_text_clean.split()
word_count = Counter(corpus_clean)
vocab_clean = dict(word_count)
oov = check_coverage(vocab_clean,embed_glove)
oov[:20]

Found embeddings for 90.93% of vocab
Found embeddings for  99.76% of all text


[('G-', 91),
 (".'", 24),
 ('?"', 20),
 ('Vidyutikaran', 19),
 ('mid-', 17),
 ('Deputy-Speaker', 15),
 ('India`s', 15),
 ("farmers'", 14),
 ("'Sarva", 13),
 ("peoples'", 12),
 ('IISERs', 12),
 ("'Look", 11),
 ('Hembhai', 10),
 ('Jagvan', 9),
 ("workers'", 9),
 ('NH-', 9),
 ('Mahalonobis', 9),
 ('post-', 9),
 ('Ela-ben', 9),
 ("India'.", 8)]

In [140]:
#Writing cleaned corpus to a file 
with open("corpus_clean_2.txt", "w") as file:  
    file.write(corpus_text_clean) 

In [142]:
vocab_sorted = sorted(vocab_clean.items(), key=lambda kv: kv[1], reverse=True)

In [143]:
#Now we want to restrict the vocab size or else model becomes huge
#In a way we could have done this earlier instead of improving the coverage, then we wouldn't have learnt regex
# If we restrict the vocab words to 1k,5k etc...
# We want to know how much fraction of restricted vocab words are present in glove and
# how much fraction of corpus is covered by the words in restricted vocab
corpus_len = len(corpus_clean)
for i in [1000,5000,10000,15000,20000]:
    vocab_ = dict(vocab_sorted[:i])
    #view_dict(vocab_,5)
    corpus_covered = 0
    vocab_covered = 0
    for (word,count) in vocab_.items():
        if word in embed_glove:
            corpus_covered += count
            vocab_covered += 1
    vocab_coverage = round((vocab_covered/len(vocab_)),2)*100
    corpus_coverage = round((corpus_covered/corpus_len),2)*100
    print (f'For vocab of size {i}, vocabulary coverage is {vocab_coverage}% and corpus coverage is {corpus_coverage}%')

For vocab of size 1000, vocabulary coverage is 100.0% and corpus coverage is 79.0%
For vocab of size 5000, vocabulary coverage is 100.0% and corpus coverage is 94.0%
For vocab of size 10000, vocabulary coverage is 100.0% and corpus coverage is 98.0%
For vocab of size 15000, vocabulary coverage is 98.0% and corpus coverage is 99.0%
For vocab of size 20000, vocabulary coverage is 96.0% and corpus coverage is 99.0%


## Here I made a silly mistake, the 100% vocabulary coverage is misleading because it is a rounded off value. It is not exactly 100%. This created so many problems later becuase if you observe below, embed_5k has a size of 4998 but we are thinking its size is 5000. So, there is an offset of 2, which lead to feeding wrong word vectors to the model

In [149]:
def create_embed(vocab, embeddings_index):
    our_embeddings = {}
    
    for word in vocab.keys():
        try:
            our_embeddings[word] = embeddings_index[word]
        except:
            pass

    return our_embeddings

In [159]:
vocab_5k = dict(vocab_sorted[:5000])
embed_5k = create_embed(vocab_5k, embed_glove)

In [160]:
len(vocab_5k),len(embed_5k)

(5000, 4998)

In [161]:
# Removing missing keys from vocab_5k
missing_keys = list(set(vocab_5k.keys()) - set(embed_5k.keys()))
missing_keys

[".'", 'G-']

In [None]:
for word in missing_keys:
    del vocab_5k[word]

In [166]:
set(vocab_5k.keys()) - set(embed_5k.keys()), len(vocab_5k),len(embed_5k)

(set(), 4998, 4998)

In [167]:
#Pickling vocab and embeddings files
with open('embed_5k_2.pickle', 'wb') as handle:
    pickle.dump(embed_5k, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('vocab_5k_2.pickle', 'wb') as handle:
    pickle.dump(vocab_5k, handle, protocol=pickle.HIGHEST_PROTOCOL)