Reference


https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go

In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

from google.colab import drive
# Mount Google Drive into the Colab filesystem at /gdrive.
drive.mount('/gdrive')

# Project folder on the mounted drive; the CSV and embedding paths in this
# notebook are built by appending to this string.
root = '/gdrive/My Drive/ml_project'

# Load the train and test CSV files into DataFrames.
train = pd.read_csv(root + "/input/train.csv")
test = pd.read_csv(root + "/input/test.csv")
# Print the (rows, columns) shape of each frame as a quick check of the load.
print("Train shape : ",train.shape)
print("Test shape : ",test.shape)

  from pandas import Panel


Mounted at /gdrive
Train shape :  (1306122, 3)
Test shape :  (375806, 2)


Load embeddings from the zip archive

In [2]:
import zipfile
from gensim.models import KeyedVectors
import numpy as np

# Zip archive that load_embedding() opens to read the embedding files.
embeddings_path = root + "/input/embeddings/embeddings.zip"
# Member paths inside the archive, one per embedding. load_embedding() compares
# its argument against `google` to decide between the binary word2vec loader
# and the line-by-line text parser used for every other member.
glove = 'glove.840B.300d/glove.840B.300d.txt'
wiki = 'wiki-news-300d-1M/wiki-news-300d-1M.vec'
google = 'GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
para = 'paragram_300_sl999/paragram_300_sl999.txt'


def load_embedding(embedding_name):
    """Load one pretrained embedding from the archive at ``embeddings_path``.

    :param embedding_name: path of the member inside the zip archive (one of
        the module-level names ``glove``, ``wiki``, ``google``, ``para``).
    :return: for ``google`` the object returned by
        ``KeyedVectors.load_word2vec_format(..., binary=True)``; for every
        other member a ``dict`` mapping each word to a float32 numpy vector.
    :raises KeyError: (from ``zipfile``) when the member is not in the archive.

    Text members are parsed best-effort: a line is only parsed when it is
    longer than 100 characters (this drops short lines such as a
    "<count> <dim>" header), and a line that cannot be decoded as UTF-8 or
    whose values are not numeric is skipped. Skipped lines are counted and
    reported instead of being silently ignored. If a word occurs more than
    once, the last occurrence wins.
    """

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    with zipfile.ZipFile(embeddings_path) as embeddings_zip:
        print("Found embeddings as a zip file")

        # Open the member in a `with` block so its handle is closed as soon
        # as loading finishes (or fails).
        with embeddings_zip.open(embedding_name) as member:
            if embedding_name == google:
                return KeyedVectors.load_word2vec_format(member, binary=True)

            embedding = {}
            skipped = 0
            for raw_line in member:
                try:
                    line = raw_line.decode('utf-8')
                    # The length test is done on the raw decoded line
                    # (line terminator included).
                    if len(line) > 100:
                        word, vector = get_coefs(*line.rstrip('\r\n').split(" "))
                        embedding[word] = vector
                except (UnicodeDecodeError, ValueError):
                    # Only data problems are tolerated; anything else
                    # (e.g. KeyboardInterrupt, MemoryError) propagates.
                    skipped += 1

        if skipped:
            print("Skipped {} malformed line(s) in {}".format(skipped, embedding_name))

        return embedding

Build vocab and check coverage

In [3]:
import operator 
import re
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
def build_vocab(sentences, verbose =  True):
    """
    Count how often every word occurs across all sentences.

    :param sentences: iterable of sentences, each an iterable of words
    :param verbose: when False, the tqdm progress bar is disabled
    :return: dictionary mapping each word to its number of occurrences
    """
    counts = {}
    for tokens in tqdm(sentences, disable = (not verbose)):
        for token in tokens:
            # First sighting starts from 0, later sightings add to the tally.
            counts[token] = counts.get(token, 0) + 1
    return counts

def check_coverage(vocab,embeddings_index):
    """Report how much of ``vocab`` is covered by ``embeddings_index``.

    Prints two percentages: the share of distinct words that have an
    embedding, and the share of all word occurrences (weighted by the counts
    in ``vocab``) that have one. Both are reported as 0.00% when there is
    nothing to divide by (empty vocab / all counts zero).

    :param vocab: dict mapping word -> occurrence count
    :param embeddings_index: any object supporting ``index[word]`` that raises
        ``KeyError`` for unknown words
    :return: list of ``(word, count)`` pairs for the out-of-vocabulary words,
        ordered by descending count
    """
    known_words = 0      # distinct words that have an embedding
    known_tokens = 0     # occurrences belonging to those words
    oov = {}
    oov_tokens = 0
    for word in tqdm(vocab):
        try:
            # Only the lookup matters; the vector itself is not retained.
            embeddings_index[word]
        except KeyError:
            oov[word] = vocab[word]
            oov_tokens += vocab[word]
        else:
            known_words += 1
            known_tokens += vocab[word]

    total_tokens = known_tokens + oov_tokens
    vocab_share = known_words / len(vocab) if vocab else 0.0
    text_share = known_tokens / total_tokens if total_tokens else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    # Ascending stable sort followed by a reversal (not reverse=True): this
    # keeps the established order among words with equal counts.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

# Character classes used by clean_text(), in increasing order of precedence.
_CLEAN_TEXT_DROPPED = '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'
_CLEAN_TEXT_PADDED = '&'
_CLEAN_TEXT_SPACED = "/-'"

# Single translation table; later groups override earlier ones, so the
# characters in _CLEAN_TEXT_SPACED become a space even though they also
# appear in _CLEAN_TEXT_DROPPED.
_CLEAN_TEXT_TABLE = {
    **{ord(ch): None for ch in _CLEAN_TEXT_DROPPED},
    **{ord(ch): f' {ch} ' for ch in _CLEAN_TEXT_PADDED},
    **{ord(ch): ' ' for ch in _CLEAN_TEXT_SPACED},
}


def clean_text(x):
    """Normalise punctuation in ``x`` (converted with ``str`` first).

    ``/``, ``-`` and ``'`` become a space, ``&`` is surrounded by spaces, and
    the remaining listed punctuation characters (including the curly quotes)
    are removed.
    """
    return str(x).translate(_CLEAN_TEXT_TABLE)

def clean_numbers(x):
    """Mask digit groups in ``x`` with ``#`` characters.

    The substitutions run in the listed order: groups of five or more digits
    become ``#####``, then groups of four, three and two digits become the
    same number of ``#``. A lone digit matches none of the patterns and is
    left as it is.
    """
    masking_rules = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, mask in masking_rules:
        x = re.sub(pattern, mask, x)
    return x

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

def clean_short(x): #remove one-length word
    """Delete every standalone one-character word from ``x``.

    Any run of non-word characters directly in front of such a word is
    removed together with it.
    """
    return re.sub(r'\W*\b\w\b', '', x)

# Replacement table used by replace_typical_misspell(): each key found in a
# text is replaced by its value. The table mixes spelling variants
# ('colour' -> 'color'), apostrophe-less contractions ('didnt' -> 'did not')
# and product names that are mapped to a generic phrase.
# The pattern built from these keys is compiled without flags, so matching is
# case-sensitive -- hence the separate 'snapchat' and 'Snapchat' entries.
mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium',
                'Snapchat': 'social medium',

                }
# Build the lookup dict and the compiled pattern once, at cell execution time;
# replace_typical_misspell() reads both as module-level names.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Return ``text`` with every match of ``mispellings_re`` replaced.

    Each matched string is looked up in the module-level ``mispellings``
    dict and substituted by the value stored there.
    """
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)

In [7]:
# Words dropped from every tokenised question before the vocabularies are built.
to_remove = ['a','to','of','and']


def _prepare_questions(questions, mask_numbers, stop_words):
    """Clean and tokenise a Series of question texts.

    Runs clean_text -> (optionally) clean_numbers -> replace_typical_misspell
    -> clean_short on every question, then splits on whitespace and drops
    the stop words.

    :param questions: pandas Series of raw question texts
    :param mask_numbers: when True, clean_numbers() is applied as well
    :param stop_words: iterable of words to remove from each token list
    :return: ``(cleaned, tokenised)`` -- the cleaned Series of strings and a
        list of token lists, one per question
    """
    cleaned = questions.progress_apply(clean_text)
    if mask_numbers:
        cleaned = cleaned.progress_apply(clean_numbers)
    cleaned = cleaned.progress_apply(replace_typical_misspell)
    cleaned = cleaned.progress_apply(clean_short)

    split_questions = cleaned.progress_apply(lambda text: text.split())
    excluded = frozenset(stop_words)
    tokenised = [
        [word for word in sentence if word not in excluded]
        for sentence in tqdm(split_questions)
    ]
    return cleaned, tokenised


# Variant with digit masking; its vocabulary is the one checked against the
# GoogleNews embedding.
train_google, sentences_google = _prepare_questions(
    train["question_text"], mask_numbers=True, stop_words=to_remove)
vocab_google = build_vocab(sentences_google)

# Variant without digit masking, used for the other embeddings. `sentences`
# keeps these token lists, as it did before.
train_other, sentences = _prepare_questions(
    train["question_text"], mask_numbers=False, stop_words=to_remove)
vocab_other = build_vocab(sentences)


100%|██████████| 1306122/1306122 [00:11<00:00, 111367.10it/s]
100%|██████████| 1306122/1306122 [00:17<00:00, 75958.25it/s]
100%|██████████| 1306122/1306122 [00:06<00:00, 203975.01it/s]
100%|██████████| 1306122/1306122 [00:10<00:00, 129417.39it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 218110.46it/s]
100%|██████████| 1306122/1306122 [00:07<00:00, 170213.23it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 233863.22it/s]
100%|██████████| 1306122/1306122 [00:11<00:00, 110736.36it/s]
100%|██████████| 1306122/1306122 [00:06<00:00, 205618.96it/s]
100%|██████████| 1306122/1306122 [00:10<00:00, 128172.72it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 229819.80it/s]
100%|██████████| 1306122/1306122 [00:06<00:00, 198603.64it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 223966.33it/s]


In [5]:
# Embedding dimensionality; only referenced by the commented-out
# np.random.normal(...) lines in this notebook.
# NOTE(review): presumably matches the 300-d embedding files -- confirm.
embed_size = 300
# Vocabulary cap: passed to Tokenizer(num_words=...) and later lowered to the
# number of distinct words found if that is smaller.
max_features = 50000
def get_emb_stats(embeddings_index):
    """Compute summary statistics over all vectors of an embedding.

    :param embeddings_index: mapping word -> 1-D numpy vector; all vectors
        must have the same length so they can be stacked into one matrix
    :return: ``(emb_mean, emb_std, num_embs, emb_size)`` -- mean and standard
        deviation over every value of the stacked matrix, the number of
        vectors, and the length of each vector
    :raises ValueError: (from ``np.stack``) when the mapping is empty or the
        vectors differ in shape
    """
    # np.stack needs a real sequence: passing the dict view directly is
    # deprecated and rejected by newer NumPy releases, so materialise a list
    # (of references -- the vectors themselves are not copied here).
    all_embs = np.stack(list(embeddings_index.values()))

    # Statistics are taken over the whole matrix, not per dimension.
    emb_mean = all_embs.mean()
    emb_std = all_embs.std()

    num_embs = all_embs.shape[0]
    emb_size = all_embs.shape[1]

    return emb_mean, emb_std, num_embs, emb_size



Load the 4 embeddings, check vocabulary coverage, and collect embedding statistics

In [17]:
# Fit a Keras tokenizer on `sentences` as left by the preprocessing cell
# (NOTE(review): that is the `train_other` token lists if that cell ran last
# -- the execution counts in this notebook are out of order, confirm).
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
# Lower the cap to the number of distinct words when fewer were found.
max_features = min(len(word_index), max_features)

# Each embedding below is handled the same way: load it, report coverage,
# print the ten most frequent out-of-vocabulary words, collect statistics,
# then `del` it before the next one is loaded so only one embedding is
# referenced at a time.

# --- GloVe (checked against the vocabulary built without digit masking) ---
glove_embedding = load_embedding(glove)
oov_glove = check_coverage(vocab_other, glove_embedding)
print(oov_glove[:5],"\n",oov_glove[5:10])
glove_emb_mean,glove_emb_std, glove_num_embs, glove_emb_size = get_emb_stats(glove_embedding)
#glove_emb_matrix = np.random.normal(glove_emb_mean, glove_emb_std, (max_features, embed_size))

del glove_embedding
# --- wiki-news ---
wiki_embedding = load_embedding(wiki)
oov_wiki = check_coverage(vocab_other, wiki_embedding)
print(oov_wiki[:5],"\n",oov_wiki[5:10])
wiki_emb_mean,wiki_emb_std, wiki_num_embs, wiki_emb_size = get_emb_stats(wiki_embedding)
#wiki_emb_matrix = np.random.normal(wiki_emb_mean, wiki_emb_std, (max_features, embed_size))

del wiki_embedding
# --- GoogleNews (checked against the digit-masked vocabulary) ---
# load_embedding() returns the gensim KeyedVectors object here rather than a
# dict; the statistics call is commented out.
# NOTE(review): presumably because get_emb_stats() relies on `.values()` -- confirm.
google_embedding = load_embedding(google)
oov_google = check_coverage(vocab_google, google_embedding)
print(oov_google[:5],"\n",oov_google[5:10])
#google_emb_mean,google_emb_std, google_num_embs, google_emb_size = get_emb_stats(google_embedding)
#google_emb_matrix = np.random.normal(google_emb_mean, google_emb_std, (max_features, embed_size))

del google_embedding
# --- Paragram ---
para_embedding = load_embedding(para)
oov_para = check_coverage(vocab_other, para_embedding)
print(oov_para[:5],"\n",oov_para[5:10])
para_emb_mean,para_emb_std, para_num_embs, para_emb_size = get_emb_stats(para_embedding)
#para_emb_matrix = np.random.normal(para_emb_mean, para_emb_std, (max_features, embed_size))
del para_embedding

Found embeddings as a zip file


100%|██████████| 238716/238716 [00:00<00:00, 454490.87it/s]
  exec(code_obj, self.user_global_ns, self.user_ns)


Found embeddings for 72.38% of vocab
Found embeddings for  99.41% of all text
[('Quorans', 858), ('Brexit', 493), ('cryptocurrencies', 481), ('Redmi', 380), ('₹', 178)] 
 [('OnePlus', 125), ('UCEED', 124), ('GDPR', 107), ('Blockchain', 107), ('demonetisation', 106)]
Found embeddings as a zip file


100%|██████████| 238716/238716 [00:00<00:00, 656729.70it/s]


Found embeddings for 65.34% of vocab
Found embeddings for  99.23% of all text
[('Quorans', 858), ('BITSAT', 564), ('COMEDK', 352), ('KVPY', 349), ('Quoran', 308)] 
 [('mtech', 281), ('WBJEE', 231), ('bcom', 199), ('articleship', 191), ('VITEEE', 182)]
Found embeddings as a zip file


100%|██████████| 238716/238716 [00:00<00:00, 267898.68it/s]


Found embeddings for 60.98% of vocab
Found embeddings for  98.95% of all text
[('bitcoin', 987), ('Quorans', 858), ('cryptocurrency', 822), ('btech', 632), ('Brexit', 493)] 
 [('cryptocurrencies', 481), ('blockchain', 474), ('behaviour', 468), ('upvotes', 432), ('programme', 402)]
Found embeddings as a zip file


100%|██████████| 238716/238716 [00:00<00:00, 449850.26it/s]


Found embeddings for 38.85% of vocab
Found embeddings for  80.88% of all text
[('What', 430843), ('How', 263113), ('Why', 145160), ('Is', 108973), ('Can', 53043)] 
 [('Which', 47352), ('Do', 40148), ('If', 34767), ('India', 32727), ('Are', 29254)]
