<a href="https://colab.research.google.com/github/dohyun1411/Quora-Insincere-Questions-Classification/blob/preprocessing1/embedding_with_tweet_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gc
import random

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

  from pandas import Panel


In [None]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

In [None]:
def seed_torch(seed=31):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and all
    CUDA devices), and asks cuDNN for deterministic kernels.

    :param seed: integer seed shared by all generators
    :return: None
    """
    # Imported locally so the function does not depend on an `import os`
    # executed in some other cell.
    import os

    random.seed(seed)
    # Only affects interpreters started after this point; the hash seed of the
    # running kernel is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels between runs, so keep it off.
    torch.backends.cudnn.benchmark = False

In [None]:
seed_torch()
# Always build a torch.device so `device` has the same type on GPU and CPU runtimes.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
device

device(type='cuda')

In [None]:
import os

# input_path = '/kaggle/input/quora-insincere-questions-classification'
input_path = '/content/drive/MyDrive/ColabNotebooks/QIQC/data'
train_path = os.path.join(input_path, 'train.csv')
test_path = os.path.join(input_path, 'test.csv')
embeddings_path = os.path.join(input_path, 'embeddings.zip')

In [None]:
# Each split may be present either as a plain CSV or still zipped
# (pandas reads a single-file '.zip' archive directly).
if os.path.exists(train_path):
    train_df = pd.read_csv(train_path)
else:
    train_df = pd.read_csv(train_path + '.zip')

# Read the test split from test_path so test_df is not a second copy of the
# training data.
if os.path.exists(test_path):
    test_df = pd.read_csv(test_path)
else:
    test_df = pd.read_csv(test_path + '.zip')

print('train shape:', train_df.shape)
print('test shape:', test_df.shape)

train shape: (1306122, 3)
test shape: (1306122, 3)


In [None]:
from sklearn.model_selection import train_test_split

# split to train and val
train_df, val_df = train_test_split(train_df, test_size=0.1)

# some config values 
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

# fill up the missing values
train_X = train_df["question_text"].fillna("_na_").values
val_X = val_df["question_text"].fillna("_na_").values
test_X = test_df["question_text"].fillna("_na_").values

# Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

In [None]:
import nltk
from nltk.tokenize import TweetTokenizer

tweet_tknzr = TweetTokenizer(preserve_case=True, reduce_len=True, strip_handles=True)

In [None]:
import zipfile
from gensim.models import KeyedVectors

glove = 'glove.840B.300d/glove.840B.300d.txt'
wiki = 'wiki-news-300d-1M/wiki-news-300d-1M.vec'
google = 'GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
para = 'paragram_300_sl999/paragram_300_sl999.txt'


def _load_embedding(embedding_name):
    """Load one pre-trained embedding from the zip archive at `embeddings_path`.

    :param embedding_name: path of the embedding file inside the archive
        (one of the module-level `glove`, `wiki`, `google`, `para` names)
    :return: gensim KeyedVectors for the binary `google` file; otherwise a
        dict mapping word -> float32 numpy vector
    :raises KeyError: if `embedding_name` is not a member of the archive
    """

    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

    with zipfile.ZipFile(embeddings_path) as embeddings_zip:
        print("Found embeddings as a zip file")

        if embedding_name == google:
            with embeddings_zip.open(google) as google_file:
                return KeyedVectors.load_word2vec_format(google_file, binary=True)

        embedding = {}
        with embeddings_zip.open(embedding_name) as embedding_file:
            for raw_line in embedding_file:
                try:
                    line = raw_line.decode('utf-8')
                    # Lines of 100 characters or fewer are skipped
                    # (presumably headers / truncated rows — TODO confirm).
                    if len(line) <= 100:
                        continue
                    word, vector = get_coefs(*line.split(" "))
                except (UnicodeDecodeError, ValueError):
                    # Best effort: drop rows that are not valid UTF-8 or whose
                    # coefficients do not parse as floats.
                    continue
                # A word that appears twice keeps its last vector.
                embedding[word] = vector

    return embedding
  

def load_embeddings(emb_idx=1234):
    """
    Placeholder that is not implemented yet.

    The body consists of this docstring only, so a call returns None.
    NOTE(review): `emb_idx` is never read here; its intended meaning is not
    visible in this notebook — confirm before implementing.
    """

In [None]:
%%time
# The loader defined above is `_load_embedding`; calling it by that name keeps
# this cell runnable on a fresh kernel.
glove_embedding = _load_embedding(glove) # 3min 21s in Colab

Found embeddings as a zip file
CPU times: user 4min 10s, sys: 6.33 s, total: 4min 16s
Wall time: 4min 17s


In [None]:
from collections import defaultdict
import operator

def build_vocab(sentences, verbose=1):
    """
    Count how often each token occurs across all sentences.

    :param sentences: iterable of tokenized sentences (each a list of words)
    :param verbose: a falsy value hides the tqdm progress bar
    :return: dict mapping word -> count, ordered from most to least frequent
    """

    counts = defaultdict(int)
    progress = tqdm(sentences, disable=(not verbose))
    for tokens in progress:
        for token in tokens:
            counts[token] += 1

    # Stable sort: words with equal counts keep their first-seen order.
    by_frequency = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    return dict(by_frequency)

In [None]:
build_vocab([['hi', 'ih', 'hi']])

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




[('hi', 2), ('ih', 1)]

In [None]:
train_X

array(['What are some supernatural related things in the DCEU and MCU?',
       'What are the best master degrees in computational finance?',
       'Is changing the First Amendment something the Trump administration considers to be a viable strategy to combat "fake news"?',
       ...,
       'What might be some of the reasons for my feet to turn red while running?',
       'Is 22 too old to start learning computer engineering?',
       'Where can you find a template for a personal stock trading journal?'],
      dtype=object)

In [None]:
sentences = train_df["question_text"].fillna("_na_").progress_apply(tweet_tknzr.tokenize).values

HBox(children=(FloatProgress(value=0.0, max=1175509.0), HTML(value='')))




In [None]:
vocab = build_vocab(sentences)

HBox(children=(FloatProgress(value=0.0, max=1175509.0), HTML(value='')))




In [None]:
vocab

defaultdict(int,
            {'What': 378162,
             'are': 192523,
             'some': 48609,
             'supernatural': 161,
             'related': 2745,
             'things': 9713,
             'in': 330063,
             'the': 589611,
             'DCEU': 72,
             'and': 228687,
             'MCU': 401,
             '?': 1242841,
             'best': 56220,
             'master': 940,
             'degrees': 675,
             'computational': 122,
             'finance': 1206,
             'Is': 98108,
             'changing': 999,
             'First': 399,
             'Amendment': 285,
             'something': 7126,
             'Trump': 11213,
             'administration': 664,
             'considers': 88,
             'to': 365411,
             'be': 82281,
             'a': 363985,
             'viable': 250,
             'strategy': 1559,
             'combat': 538,
             '"': 66156,
             'fake': 2002,
             'news': 2109,
         

In [None]:
glove_emb_mean, glove_emb_std = -0.005838499, 0.48782197

In [None]:
# some config values 
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

In [None]:
glove_embedding_matrix = np.random.normal(glove_emb_mean, glove_emb_std, (max_features, embed_size))

In [None]:
# Fill the GloVe matrix row by row from the lower-cased vocabulary words and
# collect the words that have no vector.
glove_oov = []
i = 0
for word in tqdm(vocab):
    if i >= max_features:
        break
    if word.lower() not in glove_embedding:
        glove_oov.append(word.lower())
        continue
    # `i` only advances for words that were found, so rows stay contiguous.
    glove_embedding_matrix[i] = glove_embedding[word.lower()]
    i += 1

print('percentage of oov of glove: {:.2f}%'.format(len(glove_oov) / max_features * 100))

HBox(children=(FloatProgress(value=0.0, max=257897.0), HTML(value='')))

percentage of oov of glove: 17.99%


In [None]:
glove_oov

['dceu',
 "china's",
 'mahab',
 'alshamsi',
 "rasputin's",
 "cooper's",
 'astraunaut',
 'hyperloop',
 "what's",
 'rajastan',
 "government's",
 "ender's",
 "ireland's",
 "seniors's",
 'hasnflare',
 "they're",
 "i'm",
 "i'll",
 'durgesh',
 "one's",
 "eu's",
 'giichi',
 "laptop's",
 'iub',
 'mscs',
 'kissanime',
 'airbackuphelper',
 'myntra',
 'trudea',
 "won't",
 'su57',
 'kvpy',
 'taccetta',
 'ayushman',
 'tulamben',
 'bmsce',
 'itee',
 "indonesia's",
 "haven't",
 "aren't",
 'malrotated',
 'ivanca',
 'gynobasic',
 "clinton's",
 "professional's",
 'aasi',
 'acturial',
 "he's",
 'udemy',
 "shouldn't",
 "holy-shit-that's-a-monstrosity",
 'scif',
 'mclr',
 'waterbridge',
 'fadh',
 'gurkas',
 'ncell',
 'supervillain-related',
 'basavanagudi',
 'beie',
 "framer's",
 ')=',
 'arcsinx',
 '2arcsinx',
 "women's",
 'auccant',
 "elon's",
 'neurosoft',
 'cloudtms',
 "you've",
 "she's",
 'lakshmibai',
 'obcs',
 "weren't",
 'sendgrid',
 "qur'an",
 "tillman's",
 'navaj',
 '91.1-',
 'terfs',
 'toogit.com

https://wikidocs.net/21707

In [None]:
# nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import PorterStemmer
s=PorterStemmer()
from nltk.stem import LancasterStemmer
l=LancasterStemmer()
from nltk.stem import WordNetLemmatizer
n=WordNetLemmatizer()

# Spellings tried for each word, in priority order: as written, lower, upper,
# capitalized, Porter stem, Lancaster stem, WordNet lemma.
lookup_variants = (
    lambda w: w,
    str.lower,
    str.upper,
    str.capitalize,
    s.stem,
    l.stem,
    n.lemmatize,
)

glove_oov = []
i = 0
for word in tqdm(vocab):
    if i >= max_features:
        break

    for variant in lookup_variants:
        tmp = variant(word)
        if tmp in glove_embedding:
            # First matching spelling wins and fills the next free row.
            glove_embedding_matrix[i] = glove_embedding[tmp]
            i += 1
            break
    else:
        # No spelling of the word has a vector.
        glove_oov.append(word)

print('percentage of oov of glove: {:.2f}%'.format(len(glove_oov) / max_features * 100))

HBox(children=(FloatProgress(value=0.0, max=257897.0), HTML(value='')))

percentage of oov of glove: 10.59%


In [None]:
glove_oov

['DCEU',
 "China's",
 'mahab',
 'AlShamsi',
 "Rasputin's",
 "Cooper's",
 'astraunaut',
 "What's",
 "government's",
 "Ender's",
 "Ireland's",
 "seniors's",
 'Hasnflare',
 "they're",
 "one's",
 "EU's",
 'Giichi',
 "laptop's",
 'KissAnime',
 'AirBackupHelper',
 "won't",
 'su57',
 'Taccetta',
 'BMSCE',
 "Indonesia's",
 "haven't",
 "aren't",
 'malrotated',
 'Ivanca',
 'gynobasic',
 "Clinton's",
 "professional's",
 "he's",
 "shouldn't",
 "Holy-Shit-That's-A-Monstrosity",
 'Gurkas',
 'supervillain-related',
 "Framer's",
 ')=',
 'arcsinx',
 '2arcsinx',
 "women's",
 'auccant',
 "Elon's",
 'Neurosoft',
 'CloudTMS',
 "you've",
 "she's",
 "weren't",
 "Qur'an",
 "Tillman's",
 '91.1-',
 'toogit.com',
 "someone's",
 "bachelor's",
 "God's",
 "today's",
 "She's",
 "wasn't",
 'hyperIQ',
 "person's",
 "There's",
 'MU-OET',
 'budhisim',
 "son's",
 "Asperger's",
 "isn't",
 "world's",
 "Cubicolor's",
 'Bluemix',
 "Wouldn't",
 'breechloaders',
 'cryptocurrencies',
 'Fileboard',
 'Outreach.io',
 'suvject',
 '

In [None]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

In [None]:
def get_known_contractions(embed):
    """Return the contractions of `contraction_mapping` that `embed` contains.

    :param embed: any container supporting `in` (e.g. the word -> vector dict)
    :return: list of contraction strings, in `contraction_mapping` order
    """
    return [contract for contract in contraction_mapping if contract in embed]

In [None]:
get_known_contractions(glove_embedding)

["can't",
 "'cause",
 "didn't",
 "doesn't",
 "don't",
 "I'd",
 "I'll",
 "I'm",
 "I've",
 "it's",
 "ma'am",
 "o'clock",
 "that's",
 "you'll",
 "you're"]

In [None]:
import re

# (compiled pattern or None, replacement table), built on the first tokenize()
# call. Re-run this cell to rebuild it after changing `glove_embedding` or
# `contraction_mapping`.
_contraction_regex_cache = None


def _contraction_regex():
    """Build once the regex and lookup table for contractions without a vector."""
    global _contraction_regex_cache
    if _contraction_regex_cache is None:
        known = set(get_known_contractions(glove_embedding))
        replacements = {}
        for contraction, expansion in contraction_mapping.items():
            if contraction in known:
                continue
            # The three casings that are expanded: as listed, Capitalized, UPPER.
            for variant, expanded in (
                (contraction, expansion),
                (contraction.capitalize(), expansion.capitalize()),
                (contraction.upper(), expansion.upper()),
            ):
                replacements.setdefault(variant, expanded)

        pattern = None
        if replacements:
            # Longest alternatives first so "she'd've" is not cut short by "she'd".
            alternatives = sorted(replacements, key=len, reverse=True)
            # The look-arounds keep a contraction from matching inside a longer word.
            pattern = re.compile(
                r"(?<!\w)(?:" + "|".join(re.escape(alt) for alt in alternatives) + r")(?!\w)"
            )
        _contraction_regex_cache = (pattern, replacements)
    return _contraction_regex_cache


def tokenize(sent):
    """Expand contractions that have no embedding vector, then tweet-tokenize.

    Contractions already present in `glove_embedding` are left untouched so they
    can be looked up directly.

    :param sent: raw question text
    :return: list of tokens produced by `tweet_tknzr`
    """
    pattern, replacements = _contraction_regex()
    if pattern is not None:
        sent = pattern.sub(lambda match: replacements[match.group(0)], sent)
    return tweet_tknzr.tokenize(sent)

In [None]:
tokenize("You've got it")

['You', 'have', 'got', 'it']

In [None]:
"you've" in contraction_mapping

True

In [None]:
"you've".capitalize()

"You've"

In [None]:
sentences = train_df["question_text"].fillna("_na_").progress_apply(tokenize).values

HBox(children=(FloatProgress(value=0.0, max=1175509.0), HTML(value='')))




In [None]:
vocab = build_vocab(sentences)

HBox(children=(FloatProgress(value=0.0, max=1175509.0), HTML(value='')))




In [None]:
from nltk.stem import PorterStemmer
s=PorterStemmer()
from nltk.stem import LancasterStemmer
l=LancasterStemmer()
from nltk.stem import WordNetLemmatizer
n=WordNetLemmatizer()

# Spellings tried for each word, in priority order: as written, lower, upper,
# capitalized, Porter stem, Lancaster stem, WordNet lemma, and finally the
# expansion listed in `contraction_mapping` (the word itself if it has none).
lookup_variants = (
    lambda w: w,
    str.lower,
    str.upper,
    str.capitalize,
    s.stem,
    l.stem,
    n.lemmatize,
    lambda w: contraction_mapping.get(w, w),
)

glove_oov = {}
i = 0
for word in tqdm(vocab):
    if i >= max_features: break

    for variant in lookup_variants:
        tmp = variant(word)
        if tmp in glove_embedding:
            # First matching spelling wins and fills the next free row.
            glove_embedding_matrix[i] = glove_embedding[tmp]
            i += 1
            break
    else:
        # Recorded for every word that no variant resolved — including
        # contractions whose expansion has no vector — so the OOV count is
        # complete. The stored value is the number of rows filled so far, so
        # consecutive OOV words can share it.
        glove_oov[word] = i

print('percentage of oov of glove: {:.2f}%'.format(len(glove_oov) / max_features * 100))

HBox(children=(FloatProgress(value=0.0, max=257764.0), HTML(value='')))

percentage of oov of glove: 3.13%


In [None]:
vocab

In [None]:
glove_oov

{"Trump's": 1117,
 '..': 1949,
 '\u200b': 2001,
 "today's": 2091,
 "someone's": 2116,
 "one's": 2346,
 "India's": 2353,
 "people's": 2454,
 "world's": 3083,
 "women's": 3157,
 'cryptocurrencies': 3157,
 'Brexit': 3181,
 "person's": 3243,
 "China's": 3552,
 'Redmi': 3750,
 "country's": 3959,
 'ा': 4099,
 "master's": 4265,
 "America's": 4317,
 "men's": 4361,
 "friend's": 4898,
 "Quora's": 4972,
 "bachelor's": 4982,
 "Obama's": 5004,
 "Earth's": 5136,
 "God's": 5239,
 "man's": 5264,
 '₹': 5391,
 "woman's": 5395,
 "company's": 5511,
 "else's": 5667,
 "father's": 5942,
 "children's": 5947,
 "Master's": 6005,
 "child's": 6199,
 "Modi's": 6320,
 "mother's": 6374,
 "girl's": 6573,
 "dog's": 6679,
 "McDonald's": 6782,
 "Google's": 6805,
 "wife's": 7229,
 "other's": 7704,
 "Newton's": 7891,
 "Korea's": 7984,
 "husband's": 8001,
 "Russia's": 8027,
 'OnePlus': 8100,
 "driver's": 8204,
 "Hitler's": 8239,
 'ि': 8441,
 "Asperger's": 8450,
 "Israel's": 8454,
 '(8': 8478,
 "earth's": 8507,
 "year's": 8

In [None]:
from nltk.stem import PorterStemmer
s=PorterStemmer()
from nltk.stem import LancasterStemmer
l=LancasterStemmer()
from nltk.stem import WordNetLemmatizer
n=WordNetLemmatizer()

# Counting-only pass over the whole vocabulary: the embedding matrix is not
# written and there is no max_features cut-off.
# Spellings tried in priority order: as written, lower, upper, capitalized,
# Porter stem, Lancaster stem, WordNet lemma, text before the first apostrophe.
lookup_variants = (
    lambda w: w,
    str.lower,
    str.upper,
    str.capitalize,
    s.stem,
    l.stem,
    n.lemmatize,
    lambda w: w.split("'")[0],
)

glove_oov = {}
i = 0
for word in tqdm(vocab):
    for variant in lookup_variants:
        tmp = variant(word)
        if tmp in glove_embedding:
            i += 1
            break
    else:
        # Value is the number of words resolved before this one.
        glove_oov[word] = i

print('percentage of oov of glove: {:.2f}%'.format(len(glove_oov) / len(vocab) * 100))

NameError: ignored

In [None]:
'scikit-learn' in glove_embedding

True

In [None]:
'sklearn' in glove_embedding

False

In [None]:
'Airpods' in glove_embedding

False

In [None]:
n.lemmatize("Airpods")

'Airpods'

In [None]:
s.stem("Airpods")

'airpod'

In [None]:
l.stem("Airpods")

'airpod'

In [None]:
glove_oov

{'..': 1950,
 '\u200b': 2002,
 'cryptocurrencies': 3165,
 'Brexit': 3189,
 'Redmi': 3760,
 'ा': 4110,
 '₹': 5412,
 'OnePlus': 8140,
 'ि': 8483,
 '(8': 8522,
 'demonetisation': 9373,
 'GDPR': 9584,
 '्': 9662,
 'BNBR': 9675,
 'Coinbase': 9708,
 'े': 9820,
 'Adityanath': 10324,
 'Boruto': 10608,
 'IIEST': 10805,
 'DCEU': 11061,
 'alt-right': 11116,
 'Machedo': 11736,
 'anti-Trump': 11942,
 'र': 11955,
 ')=': 12254,
 'Qoura': 12535,
 'LNMIIT': 12540,
 'न': 12622,
 'ी': 12671,
 'Zerodha': 12713,
 'Upwork': 12837,
 '}=': 13044,
 'क': 13088,
 'Doklam': 13707,
 '):': 13802,
 'Unacademy': 13848,
 'NICMAR': 13899,
 'Vajiram': 13976,
 'MUOET': 14247,
 'ल': 14261,
 'म': 14368,
 'Litecoin': 14880,
 'HackerRank': 15344,
 "Kavalireddi's": 15429,
 'altcoin': 15539,
 'Baahubali': 15571,
 'Awdhesh': 15685,
 'eLitmus': 15751,
 'ो': 15906,
 'AlShamsi': 15935,
 'Ryzen': 15958,
 'altcoins': 16064,
 'r-aping': 16277,
 'coinbase': 16318,
 ':(': 16323,
 ');': 16545,
 'MU-OET': 16653,
 'ं': 16762,
 'SGSITS': 1

In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [None]:
'aBc'.capitalize()

'Abc'

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
s = train_df["question_text"].values[312]

In [None]:
s

'Why does Israel act as if it is an independent country? Isn’t it a client state of the USA just as it’s predecessor from 2000 years ago was a client Kingdom of the Romans and before that Greeks, before that the Persians and before that Babylonia?'

In [None]:
tknzr = Tokenizer(lower=False)
tknzr.fit_on_texts([s])

In [None]:
tknzr.word_index

{'2000': 26,
 'Babylonia': 34,
 'Greeks': 32,
 'Isn’t': 19,
 'Israel': 12,
 'Kingdom': 30,
 'Persians': 33,
 'Romans': 31,
 'USA': 21,
 'Why': 10,
 'a': 6,
 'act': 13,
 'ago': 28,
 'an': 16,
 'and': 9,
 'as': 4,
 'before': 2,
 'client': 7,
 'country': 18,
 'does': 11,
 'from': 25,
 'if': 14,
 'independent': 17,
 'is': 15,
 'it': 5,
 'it’s': 23,
 'just': 22,
 'of': 8,
 'predecessor': 24,
 'state': 20,
 'that': 3,
 'the': 1,
 'was': 29,
 'years': 27}

In [None]:
nltk.word_tokenize(train_df["question_text"].values[312])

['Why',
 'does',
 'Israel',
 'act',
 'as',
 'if',
 'it',
 'is',
 'an',
 'independent',
 'country',
 '?',
 'Isn',
 '’',
 't',
 'it',
 'a',
 'client',
 'state',
 'of',
 'the',
 'USA',
 'just',
 'as',
 'it',
 '’',
 's',
 'predecessor',
 'from',
 '2000',
 'years',
 'ago',
 'was',
 'a',
 'client',
 'Kingdom',
 'of',
 'the',
 'Romans',
 'and',
 'before',
 'that',
 'Greeks',
 ',',
 'before',
 'that',
 'the',
 'Persians',
 'and',
 'before',
 'that',
 'Babylonia',
 '?']

In [None]:
"'" in glove_embedding

True

https://www.kaggle.com/bkkaggle/pytorch-determinism-test

In [None]:
import torchtext
from torchtext.legacy import data

https://torchtext.readthedocs.io/en/latest/data.html#field

https://wikidocs.net/60314

In [None]:
q_field = data.Field(tokenize=tweet_tknzr.tokenize, batch_first=True)

In [None]:
q_field

torchtext.legacy.data.field.Field

In [None]:
from torchtext.legacy.data import TabularDataset

In [None]:
question_field = data.Field(tokenize=tweet_tknzr.tokenize, lower=True, batch_first=True, include_lengths=True)
target_field = data.Field(sequential=False, use_vocab=False, batch_first=True)

train_fields = [
    ('id', None),
    ('qid', None),
    ('question_text', question_field),
    ('target', target_field)
]

test_fields = [
    ('qid', None),
    ('question_text', question_field)
]

In [None]:
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

In [None]:
SEED = 31
FOLD = 0
NOTIFY_EACH_EPOCH = False

WORKERS = 0
BATCH_SIZE = 512

N_SPLITS = 10

In [None]:
kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

train_idx, val_idx = list(kfold.split(train_df))[FOLD]
x_train, x_val = train_df.iloc[train_idx], train_df.iloc[val_idx]

x_train.to_csv('train.csv')
x_val.to_csv('val.csv')

In [None]:
list(kfold.split(train_df))[1]

(array([      0,       1,       2, ..., 1175505, 1175506, 1175508]),
 array([      6,      26,      34, ..., 1175495, 1175502, 1175507]))

In [None]:
train, val = TabularDataset.splits(path='./',
                                   train='train.csv',
                                   validation='val.csv',
                                   format='CSV',
                                   fields=train_fields,
                                   skip_header=True
                                   )

In [None]:
v = question_field.build_vocab(train)

In [None]:
question_field.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fcc8655eb90>>,
            {'<unk>': 0,
             '<pad>': 1,
             '?': 2,
             'the': 3,
             'what': 4,
             'is': 5,
             'a': 6,
             'to': 7,
             'in': 8,
             'of': 9,
             'i': 10,
             'how': 11,
             'and': 12,
             'do': 13,
             'are': 14,
             ',': 15,
             'for': 16,
             'you': 17,
             'can': 18,
             'why': 19,
             'it': 20,
             '.': 21,
             'my': 22,
             'that': 23,
             'if': 24,
             'with': 25,
             'or': 26,
             'on': 27,
             'have': 28,
             'be': 29,
             'does': 30,
             '"': 31,
             'from': 32,
             'your': 33,
             'an': 34,
             'which': 35,
             'should': 36,
             'when': 37,


In [None]:
vectors = torchtext.vocab.Vectors('../input/embeddings/glove.840B.300d/glove.840B.300d.txt')

RuntimeError: ignored

In [None]:
question_field.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)

NameError: ignored

references:

https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go

https://www.kaggle.com/alhalimi/tokenization-and-word-embedding-compatibility

https://www.kaggle.com/canming/ensemble-mean-iii-64-36

In [None]:
import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize

In [None]:
s =  "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"

In [None]:
# nltk.download('punkt')
word_tokenize(s)

['This',
 'is',
 'a',
 'cooool',
 '#',
 'dummysmiley',
 ':',
 ':',
 '-',
 ')',
 ':',
 '-P',
 '<',
 '3',
 'and',
 'some',
 'arrows',
 '<',
 '>',
 '-',
 '>',
 '<',
 '--']

In [None]:
t = TweetTokenizer(reduce_len=True)

In [None]:
s1 = 'heyyyy nice to meet you!!'

In [None]:
t.tokenize(s1)

['heyyy', 'nice', 'to', 'meet', 'you', '!', '!']

In [None]:
import gc

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

  from pandas import Panel


In [None]:
import os

# input_path = '/kaggle/input/quora-insincere-questions-classification'
input_path = '/content/drive/MyDrive/ColabNotebooks/QIQC/data'
train_path = os.path.join(input_path, 'train.csv')
test_path = os.path.join(input_path, 'test.csv')
embeddings_path = os.path.join(input_path, 'embeddings.zip')

In [None]:
# Each split may be present either as a plain CSV or still zipped
# (pandas reads a single-file '.zip' archive directly).
if os.path.exists(train_path):
    train_df = pd.read_csv(train_path)
else:
    train_df = pd.read_csv(train_path + '.zip')

# Read the test split from test_path so test_df is not a second copy of the
# training data.
if os.path.exists(test_path):
    test_df = pd.read_csv(test_path)
else:
    test_df = pd.read_csv(test_path + '.zip')

print('train shape:', train_df.shape)
print('test shape:', test_df.shape)

train shape: (1306122, 3)
test shape: (1306122, 3)


In [None]:
from sklearn.model_selection import train_test_split

# split to train and val
train_df, val_df = train_test_split(train_df, test_size=0.1)

# some config values 
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

# fill up the missing values
train_X = train_df["question_text"].fillna("_na_").values
val_X = val_df["question_text"].fillna("_na_").values
test_X = test_df["question_text"].fillna("_na_").values

# Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

In [None]:
%%time
from keras.preprocessing.text import Tokenizer

# Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

CPU times: user 50.2 s, sys: 508 ms, total: 50.7 s
Wall time: 50.6 s


In [None]:
%%time
from keras.preprocessing.sequence import pad_sequences

# Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

CPU times: user 10.3 s, sys: 304 ms, total: 10.6 s
Wall time: 10.4 s


In [None]:
import zipfile
from gensim.models import KeyedVectors

glove = 'glove.840B.300d/glove.840B.300d.txt'
wiki = 'wiki-news-300d-1M/wiki-news-300d-1M.vec'
google = 'GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
para = 'paragram_300_sl999/paragram_300_sl999.txt'


def load_embedding(embedding_name):
    """Load one pre-trained embedding from the zip archive at `embeddings_path`.

    :param embedding_name: path of the embedding file inside the archive
        (one of the module-level `glove`, `wiki`, `google`, `para` names)
    :return: gensim KeyedVectors for the binary `google` file; otherwise a
        dict mapping word -> float32 numpy vector
    :raises KeyError: if `embedding_name` is not a member of the archive
    """

    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')

    with zipfile.ZipFile(embeddings_path) as embeddings_zip:
        print("Found embeddings as a zip file")

        if embedding_name == google:
            with embeddings_zip.open(google) as google_file:
                return KeyedVectors.load_word2vec_format(google_file, binary=True)

        embedding = {}
        with embeddings_zip.open(embedding_name) as embedding_file:
            for raw_line in embedding_file:
                try:
                    line = raw_line.decode('utf-8')
                    # Lines of 100 characters or fewer are skipped
                    # (presumably headers / truncated rows — TODO confirm).
                    if len(line) <= 100:
                        continue
                    word, vector = get_coefs(*line.split(" "))
                except (UnicodeDecodeError, ValueError):
                    # Best effort: drop rows that are not valid UTF-8 or whose
                    # coefficients do not parse as floats.
                    continue
                # A word that appears twice keeps its last vector.
                embedding[word] = vector

    return embedding

In [None]:
%%time
glove_embedding = load_embedding(glove) # 3min 21s in Colab

Found embeddings as a zip file
CPU times: user 3min 13s, sys: 3.82 s, total: 3min 17s
Wall time: 3min 18s


In [None]:
%%time
wiki_embedding = load_embedding(wiki) # 1min 20s in Colab

Found embeddings as a zip file
CPU times: user 1min 19s, sys: 2.1 s, total: 1min 21s
Wall time: 1min 21s


In [None]:
%%time
google_embedding = load_embedding(google) # 1min 50s in Colab

Found embeddings as a zip file
CPU times: user 1min 46s, sys: 3.65 s, total: 1min 50s
Wall time: 1min 50s


In [None]:
%%time
para_embedding = load_embedding(para) # 2min 40s in Colab

Found embeddings as a zip file
CPU times: user 2min 34s, sys: 2.81 s, total: 2min 36s
Wall time: 2min 36s


In [None]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

In [None]:
def get_emb_stats(embedding):
    """Return the mean and standard deviation over all coefficients of an embedding.

    :param embedding: either a dict mapping word -> vector, or an object that
        exposes the whole matrix as `.vectors`
    :return: tuple (emb_mean, emb_std)
    """
    if hasattr(embedding, 'vectors'):
        all_embs = embedding.vectors
    else:
        # np.stack needs a real sequence, so materialize the dict view first.
        all_embs = np.stack(list(embedding.values()))

    return all_embs.mean(), all_embs.std()


has_get_emb_stats = True # set False to recompute the stats from the loaded embeddings
if not has_get_emb_stats:
    glove_emb_mean, glove_emb_std = get_emb_stats(glove_embedding)
    wiki_emb_mean, wiki_emb_std = get_emb_stats(wiki_embedding)
    google_emb_mean, google_emb_std = get_emb_stats(google_embedding)
    para_emb_mean, para_emb_std = get_emb_stats(para_embedding)

    print('glove emb mean:', glove_emb_mean, 'emb_std:', glove_emb_std)
    print('wiki emb mean:', wiki_emb_mean, 'emb_std:', wiki_emb_std)
    print('google emb mean:', google_emb_mean, 'emb_std:', google_emb_std)
    print('para emb mean:', para_emb_mean, 'emb_std:', para_emb_std)

else:
    # Values recorded from an earlier run of the branch above.
    glove_emb_mean, glove_emb_std = -0.005838499, 0.48782197
    wiki_emb_mean, wiki_emb_std = -0.0033469985, 0.109855495
    google_emb_mean, google_emb_std = -0.003527845, 0.13315111
    para_emb_mean, para_emb_std = -0.0053248387, 0.49346521

In [None]:
glove_embedding_matrix = np.random.normal(glove_emb_mean, glove_emb_std, (max_features, embed_size))
wiki_embedding_matrix = np.random.normal(wiki_emb_mean, wiki_emb_std, (max_features, embed_size))
google_embedding_matrix = np.random.normal(google_emb_mean, google_emb_std, (max_features, embed_size))
para_embedding_matrix = np.random.normal(para_emb_mean, para_emb_std, (max_features, embed_size))

In [None]:
glove_oov = {}
wiki_oov = {}
google_oov = {}
para_oov = {}

# One (embedding, matrix to fill, OOV record) triple per pre-trained embedding.
embedding_targets = (
    (glove_embedding, glove_embedding_matrix, glove_oov),
    (wiki_embedding, wiki_embedding_matrix, wiki_oov),
    (google_embedding, google_embedding_matrix, google_oov),
    (para_embedding, para_embedding_matrix, para_oov),
)

for word, i in tqdm(word_index.items()):
    # Only words whose tokenizer index fits in the matrix are considered.
    if i >= max_features:
        continue
    for embedding, embedding_matrix, oov in embedding_targets:
        if word in embedding:
            embedding_vector = embedding[word]
            embedding_matrix[i] = embedding_vector
        else:
            oov[word] = i

HBox(children=(FloatProgress(value=0.0, max=209512.0), HTML(value='')))




In [None]:
print('percentage of oov of glove: {:.2f}%'.format(len(glove_oov) / max_features * 100))
print('percentage of oov of wiki: {:.2f}%'.format(len(wiki_oov) / max_features * 100))
print('percentage of oov of google: {:.2f}%'.format(len(google_oov) / max_features * 100))
print('percentage of oov of para: {:.2f}%'.format(len(para_oov) / max_features * 100))

percentage of oov of glove: 8.02%
percentage of oov of wiki: 14.56%
percentage of oov of google: 23.63%
percentage of oov of para: 5.55%


We will clean the numbers.

In [None]:
import re

def clean_numbers(x):
    """Mask runs of ASCII digits in ``x`` with ``#`` placeholders.

    Runs of five or more digits become ``#####``; runs of exactly four, three
    or two digits become ``####``, ``###`` and ``##``. Single digits are left
    untouched. The substitutions run longest-first, so a shorter pattern never
    splits a longer run.

    Args:
        x: The text to clean.

    Returns:
        The text with its digit runs masked.
    """
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, placeholder in masks:
        x = re.sub(pattern, placeholder, x)
    return x

In [None]:
# Number-masked copy of the training questions: missing values become '__na__',
# then digit runs are masked by clean_numbers.
no_num_questions = (
    train_df['question_text']
    .fillna('__na__')
    .progress_apply(clean_numbers)
)

# Separate tokenizer fitted on the number-masked text.
no_num_tokenizer = Tokenizer()
no_num_tokenizer.fit_on_texts(list(no_num_questions.values))

In [None]:
import random

# Draw ten random integers in [0, 20000] and report which embeddings have no
# entry for their decimal string. google_embedding is intentionally not checked here.
number_checks = (
    (glove_embedding, 'is not in glove embedding'),
    (wiki_embedding, 'is not in wiki embedding'),
    (para_embedding, 'is not in para embedding'),
)
for attempt in range(10):
    n = random.randint(0, 20000)
    token = str(n)
    for embedding, message in number_checks:
        if token not in embedding:
            print(n, message)

The google embedding contains only a few numbers, so we will clean all the numbers for it.

The other embeddings seem to contain numbers up to 10,000, so we will not clean the numbers for them.

In [None]:
no_num_word_index = no_num_tokenizer.word_index

# no_num_word_index comes from a separately fitted tokenizer, so its indices need
# not match word_index. Rows written by the earlier word_index-based fill would
# be stale (a vector for a different word), so start again from fresh noise.
# NOTE(review): whatever consumes google_embedding_matrix must encode text with
# no_num_tokenizer for these rows to line up; confirm downstream.
google_embedding_matrix = np.random.normal(
    google_emb_mean, google_emb_std, (max_features, embed_size)
)

# word -> index for tokens (after number masking) with no google vector.
google_oov = {}
for word, i in tqdm(no_num_word_index.items()):
    # Only indices below max_features have a row in the matrix.
    if i >= max_features:
        continue
    if word in google_embedding:
        google_embedding_matrix[i] = google_embedding[word]
    else:
        google_oov[word] = i

In [None]:
# Report each embedding's OOV count as a percentage of max_features.
oov_report = (
    ('percentage of oov of glove: {:.2f}%', glove_oov),
    ('percentage of oov of wiki: {:.2f}%', wiki_oov),
    ('percentage of oov of google: {:.2f}%', google_oov),
    ('percentage of oov of para: {:.2f}%', para_oov),
)
for template, oov in oov_report:
    print(template.format(len(oov) / max_features * 100))

percentage of oov of google: 22.63% -> 22.13%

Some experiments to improve performance:

In [None]:
# Membership check: is the contraction "don't" a key of google_embedding?
"don't" in google_embedding

In [None]:
# Display the whole google_oov dict (word -> index) as the cell output.
# NOTE(review): this can be a very large output; consider showing only a sample.
google_oov

In [None]:
# Membership check: was the token "i'm" recorded as out-of-vocabulary for GloVe?
"i'm" in glove_oov

In [None]:
# Display the whole glove_oov dict (word -> index) as the cell output.
# NOTE(review): this can be a very large output; consider showing only a sample.
glove_oov

In [None]:
# Membership check: is "you’re" (written with a typographic apostrophe) a key of glove_embedding?
"you’re" in glove_embedding

In [None]:
# Punctuation to probe: the ASCII set plus the typographic quotes/apostrophe.
ascii_puncts = '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~'
curly_quotes = '“”’'

# Print every probed character that is not a key of glove_embedding.
missing_puncts = [punct for punct in ascii_puncts + curly_quotes if punct not in glove_embedding]
for punct in missing_puncts:
    print(punct)

# Although “”’ are in glove_embedding, we will replace them for convenience.

In [None]:
# Small experiment: split a possessive such as "someone’s" into stem and suffix.
word = "someone’s"
# Normalise the typographic apostrophe to the ASCII one before matching.
word = word.replace("’", "'")
if word.endswith(("'s", "' s")):
    # Split on the last apostrophe only, so a word with several apostrophes
    # still unpacks into exactly two parts.
    w1, w2 = word.rsplit("'", 1)
    # Print inside the branch: w1/w2 only exist when the suffix matched.
    print(w1)
    print(w2)

In [None]:
def _glove_lookup_candidates(word):
    """Yield progressively more normalised spellings of ``word`` to look up.

    The candidates are produced lazily, in this order:
      1. the token as-is;
      2. typographic quotes/apostrophe replaced by their ASCII forms;
      3. additionally, all spaces removed;
      4. the part of (3) before the first apostrophe;
      5. (3) with every ASCII punctuation character removed.
    """
    yield word

    word = word.replace("’", "'").replace('”', '"').replace('“', '"')
    yield word

    word = re.sub(' +', '', word)
    yield word

    # e.g. "someone's" -> "someone"
    yield word.split("'")[0]

    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~':
        word = word.replace(punct, '')
    yield word


# word -> index for tokens that have no GloVe vector under any candidate spelling.
glove_oov = {}
for word, i in tqdm(word_index.items()):
    # Only indices below max_features have a row in the matrix.
    if i >= max_features:
        continue

    for candidate in _glove_lookup_candidates(word):
        if candidate in glove_embedding:
            glove_embedding_matrix[i] = glove_embedding[candidate]
            break
    else:
        # Key by the ORIGINAL token. The normalised form can collapse distinct
        # tokens onto one key (a punctuation-only token becomes ''), which would
        # overwrite entries and under-count the OOV rate.
        glove_oov[word] = i

    # TODO (idea noted by the author, not implemented): quorans -> quora contributors

In [None]:
# GloVe OOV count as a percentage of max_features.
glove_oov_pct = len(glove_oov) / max_features * 100
print('percentage of oov of glove: {:.2f}%'.format(glove_oov_pct))

8.15% -> 4.18%

In [None]:
# Membership check: is the capitalised token 'Quora' a key of glove_embedding?
'Quora' in glove_embedding

In [None]:
# Reverse lookup: print every token whose vocabulary index is 49990.
for token, token_index in word_index.items():
    if token_index == 49990:
        print(token)

In [None]:
# Look up the vocabulary index of the token "'call" (note the leading apostrophe).
word_index["'call"]

In [None]:
# Bare empty-string literal; the cell just evaluates to ''.
# NOTE(review): looks like leftover scratch; confirm whether this cell can be removed.
''

In [None]:
# Display the whole glove_oov dict as the cell output.
# NOTE(review): this can be a very large output; consider showing only a sample.
glove_oov