<a href="https://colab.research.google.com/github/dohyun1411/Quora-Insincere-Questions-Classification/blob/preprocessing1/embedding_with_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

references:

https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go

https://www.kaggle.com/alhalimi/tokenization-and-word-embedding-compatibility

https://www.kaggle.com/canming/ensemble-mean-iii-64-36

In [1]:
import gc

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

import tensorflow as tf

  from pandas import Panel


In [2]:
import time
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [3]:
import os

# input_path = '/kaggle/input/quora-insincere-questions-classification'
input_path = '/content/drive/MyDrive/ColabNotebooks/QIQC/data'
train_path = os.path.join(input_path, 'train.csv')
test_path = os.path.join(input_path, 'test.csv')
embeddings_path = os.path.join(input_path, 'embeddings.zip')

In [4]:
if os.path.exists(train_path): train_df = pd.read_csv(train_path)
else: train_df = pd.read_csv(train_path + '.zip')

if os.path.exists(test_path): test_df = pd.read_csv(train_path)
else: test_df = pd.read_csv(train_path + '.zip')

print('train shape:', train_df.shape)
print(train_df.head())
print()
print('test shape:', test_df.shape)
print(test_df.head())

train shape: (1306122, 3)
                    qid  ... target
0  00002165364db923c7e6  ...      0
1  000032939017120e6e44  ...      0
2  0000412ca6e4628ce2cf  ...      0
3  000042bf85aa498cd78e  ...      0
4  0000455dfa3e01eae3af  ...      0

[5 rows x 3 columns]

test shape: (1306122, 3)
                    qid  ... target
0  00002165364db923c7e6  ...      0
1  000032939017120e6e44  ...      0
2  0000412ca6e4628ce2cf  ...      0
3  000042bf85aa498cd78e  ...      0
4  0000455dfa3e01eae3af  ...      0

[5 rows x 3 columns]


In [5]:
# split to train and val
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)

# some config values 
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

# fill up the missing values
train_X = train_df["question_text"].fillna("_na_").values
val_X = val_df["question_text"].fillna("_na_").values
test_X = test_df["question_text"].fillna("_na_").values

# Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

In [6]:
%%time
# Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

CPU times: user 51.3 s, sys: 492 ms, total: 51.8 s
Wall time: 51.7 s


In [7]:
%%time
# Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

CPU times: user 10.2 s, sys: 297 ms, total: 10.5 s
Wall time: 10.4 s


In [8]:
%%time
import zipfile
from gensim.models import KeyedVectors

glove = 'glove.840B.300d/glove.840B.300d.txt'
wiki = 'wiki-news-300d-1M/wiki-news-300d-1M.vec'
google = 'GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

with zipfile.ZipFile(embeddings_path) as embeddings_zip:
    print("Found embeddings as a zip file")
    glove_embedding = dict(get_coefs(*o.decode('utf-8').split(" ")) for o in embeddings_zip.open(glove))
    wiki_embedding = dict(get_coefs(*o.decode('utf-8').split(" ")) for o in embeddings_zip.open(wiki))
    google_embedding = KeyedVectors.load_word2vec_format(embeddings_zip.open(google), binary=True)

Found embeddings as a zip file
CPU times: user 6min 13s, sys: 8.22 s, total: 6min 21s
Wall time: 6min 22s


In [9]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

In [22]:
def get_emb_stats(embeddings_index):

    # Put all embeddings in a numpy matrix
    all_embs= np.stack(embeddings_index.values())

    # Get embedding stats
    emb_mean = all_embs.mean()
    emb_std = all_embs.std()
    
    num_embs = all_embs.shape[0]
    
    emb_size = all_embs.shape[1]
    
    return emb_mean,emb_std, num_embs, emb_size 

In [23]:
# get_emb_stats(glove_embedding) # The results will be constant so we will not run this function in practice to save time

  exec(code_obj, self.user_global_ns, self.user_ns)


(-0.005838499, 0.48782197, 2196016, 300)

In [49]:
glove_emb_mean, glove_emb_std = -0.005838499, 0.48782197
glove_embedding_matrix = np.random.normal(glove_emb_mean, glove_emb_std, (max_features, embed_size))
glove_embedding_matrix.shape

(50000, 300)

In [36]:
for k, v in wiki_embedding.items():
    if v.shape[0] != 300:
        print(k)
        print(v)
        print(v.shape)

999994
[300.]
(1,)


What does 999994 means? And why it only have one element, not 300?

In [45]:
del wiki_embedding['999994']
gc.collect()

219

In [46]:
# get_emb_stats(wiki_embedding)

  exec(code_obj, self.user_global_ns, self.user_ns)


(-0.0033469985, 0.109855495, 999994, 300)

In [51]:
wiki_emb_mean, wiki_emb_std = -0.0033469985, 0.109855495
wiki_embedding_matrix = np.random.normal(wiki_emb_mean, wiki_emb_std, (max_features, embed_size))
wiki_embedding_matrix.shape

(50000, 300)

In [61]:
def get_emb_stats(embeddings_index):

    # Put all embeddings in a numpy matrix
    all_embs= embeddings_index.vectors

    # Get embedding stats
    emb_mean = all_embs.mean()
    emb_std = all_embs.std()
    
    num_embs = all_embs.shape[0]
    
    emb_size = all_embs.shape[1]
    
    return emb_mean,emb_std, num_embs, emb_size 

In [62]:
# get_emb_stats(google_embedding)

(-0.003527845, 0.13315111, 3000000, 300)

In [66]:
google_emb_mean, google_emb_std = -0.003527845, 0.13315111
google_embedding_matrix = np.random.normal(google_emb_mean, google_emb_std, (max_features, embed_size))
google_embedding_matrix.shape

(50000, 300)

In [67]:
glove_oov = {}
wiki_oov = {}
google_oov = {}
for word, i in tqdm(word_index.items()):
    if i >= max_features: continue
    if word in glove_embedding:
        embedding_vector = glove_embedding[word]
        glove_embedding_matrix[i] = embedding_vector
    else:
        glove_oov[word] = i
    if word in wiki_embedding:
        embedding_vector = wiki_embedding[word]
        wiki_embedding_matrix[i] = embedding_vector
    else:
        wiki_oov[word] = i
    if word in google_embedding:
        embedding_vector = google_embedding[word]
        google_embedding_matrix[i] = embedding_vector
    else:
        google_oov[word] = i

HBox(children=(FloatProgress(value=0.0, max=209286.0), HTML(value='')))




In [74]:
print('percentage of oov of glove: {:.2f}%'.format(len(glove_oov) / len(word_index) * 100))
print('percentage of oov of wiki: {:.2f}%'.format(len(wiki_oov) / len(word_index) * 100))
print('percentage of oov of google: {:.2f}%'.format(len(google_oov) / len(word_index) * 100))

percentage of oov of glove: 1.93%
percentage of oov of wiki: 3.47%
percentage of oov of google: 5.64%
