<a href="https://colab.research.google.com/github/dohyun1411/Quora-Insincere-Questions-Classification/blob/preprocessing1/embedding_with_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

references:

https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go

https://www.kaggle.com/alhalimi/tokenization-and-word-embedding-compatibility

https://www.kaggle.com/canming/ensemble-mean-iii-64-36

In [1]:
import gc

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

  from pandas import Panel


In [4]:
import os

# input_path = '/kaggle/input/quora-insincere-questions-classification'
input_path = '/content/drive/MyDrive/ColabNotebooks/QIQC/data'
train_path = os.path.join(input_path, 'train.csv')
test_path = os.path.join(input_path, 'test.csv')
embeddings_path = os.path.join(input_path, 'embeddings.zip')

In [6]:
if os.path.exists(train_path): train_df = pd.read_csv(train_path)
else: train_df = pd.read_csv(train_path + '.zip')

if os.path.exists(test_path): test_df = pd.read_csv(train_path)
else: test_df = pd.read_csv(test_path + '.zip')

print('train shape:', train_df.shape)
print('test shape:', test_df.shape)

train shape: (1306122, 3)
test shape: (1306122, 3)


In [34]:
from sklearn.model_selection import train_test_split

# split to train and val
train_df, val_df = train_test_split(train_df, test_size=0.1)

# some config values 
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

# fill up the missing values
train_X = train_df["question_text"].fillna("_na_").values
val_X = val_df["question_text"].fillna("_na_").values
test_X = test_df["question_text"].fillna("_na_").values

# Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

In [35]:
%%time
from keras.preprocessing.text import Tokenizer

# Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

CPU times: user 50.3 s, sys: 409 ms, total: 50.7 s
Wall time: 50.6 s


In [36]:
%%time
from keras.preprocessing.sequence import pad_sequences

# Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

CPU times: user 10.7 s, sys: 230 ms, total: 10.9 s
Wall time: 10.7 s


In [106]:
import zipfile
from gensim.models import KeyedVectors

glove = 'glove.840B.300d/glove.840B.300d.txt'
wiki = 'wiki-news-300d-1M/wiki-news-300d-1M.vec'
google = 'GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
para = 'paragram_300_sl999/paragram_300_sl999.txt'


def load_embedding(embedding_name):

    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

    with zipfile.ZipFile(embeddings_path) as embeddings_zip:
        print("Found embeddings as a zip file")

        if embedding_name == google:
            return KeyedVectors.load_word2vec_format(embeddings_zip.open(google), binary=True)

        else:
            embedding = []
            for o in embeddings_zip.open(embedding_name):
                try:
                    if len(o.decode('utf-8')) > 100:
                        embedding.append(get_coefs(*o.decode('utf-8').split(" ")))
                except:
                    pass
        
        return dict(embedding)

In [107]:
%%time
glove_embedding = load_embedding(glove) # 3min 21s in Colab

Found embeddings as a zip file
CPU times: user 3min 18s, sys: 6.65 s, total: 3min 24s
Wall time: 3min 24s


In [108]:
%%time
wiki_embedding = load_embedding(wiki) # 1min 20s in Colab

Found embeddings as a zip file
CPU times: user 1min 20s, sys: 1.32 s, total: 1min 21s
Wall time: 1min 21s


In [109]:
%%time
google_embedding = load_embedding(google) # 1min 50s in Colab

Found embeddings as a zip file
CPU times: user 1min 47s, sys: 3.52 s, total: 1min 50s
Wall time: 1min 50s


In [110]:
%%time
para_embedding = load_embedding(para) # 2min 40s in Colab

Found embeddings as a zip file
CPU times: user 2min 37s, sys: 2.77 s, total: 2min 40s
Wall time: 2min 40s


In [111]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

In [121]:
has_get_emb_stats = True # set False if we have not get emb stats
if not has_get_emb_stats:

    def get_emb_stats(embedding):

        # Put all embeddings in a numpy matrix
        try:
            all_embs= np.stack(embedding.values())
        except:
            all_embs = embedding.vectors

        # Get embedding stats
        emb_mean = all_embs.mean()
        emb_std = all_embs.std()
        
        return emb_mean,emb_std
    

    glove_emb_mean, glove_emb_std = get_emb_stats(glove_embedding)
    wiki_emb_mean, wiki_emb_std = get_emb_stats(wiki_embedding)
    google_emb_mean, google_emb_std = get_emb_stats(google_embedding)
    para_emb_mean, para_emb_std = get_emb_stats(para_embedding)

    print('glove emb mean:', glove_emb_mean, 'emb_std:', glove_emb_std)
    print('wiki emb mean:', wiki_emb_mean, 'emb_std:', wiki_emb_std)
    print('google emb mean:', google_emb_mean, 'emb_std:', google_emb_std)
    print('para emb mean:', para_emb_mean, 'emb_std:', para_emb_std)

else:
    glove_emb_mean, glove_emb_std = -0.005838499, 0.48782197
    wiki_emb_mean, wiki_emb_std = -0.0033469985, 0.109855495
    google_emb_mean, google_emb_std = -0.003527845, 0.13315111
    para_emb_mean, para_emb_std = -0.0053248387, 0.49346521

In [123]:
glove_embedding_matrix = np.random.normal(glove_emb_mean, glove_emb_std, (max_features, embed_size))
wiki_embedding_matrix = np.random.normal(wiki_emb_mean, wiki_emb_std, (max_features, embed_size))
google_embedding_matrix = np.random.normal(google_emb_mean, google_emb_std, (max_features, embed_size))
para_embedding_matrix = np.random.normal(para_emb_mean, para_emb_std, (max_features, embed_size))

In [124]:
glove_oov = {}
wiki_oov = {}
google_oov = {}
para_oov = {}
for word, i in tqdm(word_index.items()):
    if i >= max_features: continue
    if word in glove_embedding:
        embedding_vector = glove_embedding[word]
        glove_embedding_matrix[i] = embedding_vector
    else:
        glove_oov[word] = i
    if word in wiki_embedding:
        embedding_vector = wiki_embedding[word]
        wiki_embedding_matrix[i] = embedding_vector
    else:
        wiki_oov[word] = i
    if word in google_embedding:
        embedding_vector = google_embedding[word]
        google_embedding_matrix[i] = embedding_vector
    else:
        google_oov[word] = i
    if word in para_embedding:
        embedding_vector = para_embedding[word]
        para_embedding_matrix[i] = embedding_vector
    else:
        para_oov[word] = i

HBox(children=(FloatProgress(value=0.0, max=197701.0), HTML(value='')))




In [126]:
print('percentage of oov of glove: {:.2f}%'.format(len(glove_oov) / len(word_index) * 100))
print('percentage of oov of wiki: {:.2f}%'.format(len(wiki_oov) / len(word_index) * 100))
print('percentage of oov of google: {:.2f}%'.format(len(google_oov) / len(word_index) * 100))
print('percentage of oov of para: {:.2f}%'.format(len(para_oov) / len(word_index) * 100))

percentage of oov of glove: 2.06%
percentage of oov of wiki: 3.71%
percentage of oov of google: 5.98%
percentage of oov of para: 1.43%
