<a href="https://colab.research.google.com/github/dohyun1411/Quora-Insincere-Questions-Classification/blob/preprocessing1/clean_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Clean text

reference: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings#

In the reference, he found embeddings(Goolgle News) for 60.43% of vocab and 98.96% of all text.

additional reference: https://www.kaggle.com/wowfattie/3rd-place

In [1]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"dohyun141","key":"20ec72dbef42ea6e82f13fb6c1d11b74"}'}

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c quora-insincere-questions-classification

Downloading test.csv.zip to /content
 57% 9.00M/15.8M [00:00<00:00, 26.6MB/s]
100% 15.8M/15.8M [00:00<00:00, 35.7MB/s]
Downloading train.csv.zip to /content
 84% 46.0M/54.9M [00:00<00:00, 37.2MB/s]
100% 54.9M/54.9M [00:01<00:00, 54.0MB/s]
Downloading sample_submission.csv.zip to /content
100% 4.09M/4.09M [00:00<00:00, 27.9MB/s]

Downloading embeddings.zip to /content
100% 5.95G/5.96G [01:12<00:00, 76.4MB/s]
100% 5.96G/5.96G [01:13<00:00, 87.5MB/s]


In [1]:
import gc

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

import tensorflow as tf

  from pandas import Panel


Please change 'input_path' to where the input data is.

In [2]:
import os

# input_path = '/kaggle/input/quora-insincere-questions-classification'
input_path = './'
train_path = os.path.join(input_path, 'train.csv')
test_path = os.path.join(input_path, 'test.csv')
embeddings_path = os.path.join(input_path, 'embeddings.zip')

In [3]:
if os.path.exists(train_path): train = pd.read_csv(train_path)
else: train = pd.read_csv(train_path + '.zip')

if os.path.exists(test_path): test = pd.read_csv(train_path)
else: test = pd.read_csv(train_path + '.zip')

print('train shape:', train.shape)
print(train.head())
print()
print('test shape:', test.shape)
print(test.head())

train shape: (1306122, 3)
                    qid  ... target
0  00002165364db923c7e6  ...      0
1  000032939017120e6e44  ...      0
2  0000412ca6e4628ce2cf  ...      0
3  000042bf85aa498cd78e  ...      0
4  0000455dfa3e01eae3af  ...      0

[5 rows x 3 columns]

test shape: (1306122, 3)
                    qid  ... target
0  00002165364db923c7e6  ...      0
1  000032939017120e6e44  ...      0
2  0000412ca6e4628ce2cf  ...      0
3  000042bf85aa498cd78e  ...      0
4  0000455dfa3e01eae3af  ...      0

[5 rows x 3 columns]


In [4]:
train_text = train['question_text']
test_text = test['question_text']

In [5]:
%%time
import zipfile
from gensim.models import KeyedVectors

google = 'GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

with zipfile.ZipFile(embeddings_path) as embeddings_zip:
    print("Found embeddings as a zip file")
    google_embeddings = KeyedVectors.load_word2vec_format(embeddings_zip.open(google), binary=True)

Found embeddings as a zip file
CPU times: user 2min 10s, sys: 5.02 s, total: 2min 15s
Wall time: 2min 15s


In [6]:
from collections import defaultdict

def build_vocab(sentences, verbose=1):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """

    vocab = defaultdict(int)
    for sentence in tqdm(sentences, disable=(not verbose)):
        for word in sentence:
            try:
                word = word.text
            except:
                pass
            vocab[word] += 1
    return vocab

In [7]:
import re

def clean_numbers(x):

    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In [8]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'
                }
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)

In [9]:
train_text = train_text.progress_apply(replace_typical_misspell)

HBox(children=(FloatProgress(value=0.0, max=1306122.0), HTML(value='')))




In [10]:
import operator

def check_coverage(vocab, embeddings, verbose=1):
    
    inv = {}
    oov = {}
    count_inv = 0
    count_oov = 0
    
    for word in tqdm(vocab, disable=(not verbose)):
        
        if word in embeddings:
            inv[word] = embeddings[word]
            count_inv += vocab[word]
        
        else:
            oov[word] = vocab[word]
            count_oov += vocab[word]

    print('Found embeddings for {:.2%} of vocab.'.format(len(inv) / len(vocab)))
    print('Found embeddings for {:.2%} of all text.'.format(count_inv / (count_inv + count_oov)))

    return sorted(oov.items(), key=operator.itemgetter(1))[::-1]

Using spaCy

In [11]:
import spacy

# !pip install -U spacy[cuda100]
gpu = spacy.prefer_gpu()
print('GPU:', gpu)

GPU: True


In [19]:
# !pip install -U pip setuptools wheel
# !pip install -U spacy
# !python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

In [21]:
docs = nlp.pipe(train_text)

sentences = []
for doc in tqdm(docs): # 
    sentence = []
    for token in doc:
        if token.is_alpha:
            sentence.append(token.text)
        else:
            sentence.append(clean_numbers(token.text))
    sentences.append(sentence)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

KeyboardInterrupt: ignored

In [None]:
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

In [None]:
oov = check_coverage(vocab, google_embeddings)

Stuck in time out! (more than 1 hours..)

Below code is just copy of https://www.kaggle.com/wowfattie/3rd-place.

In [None]:
print("Spacy NLP ...")
nlp = spacy.load('en_core_web_lg', disable=['parser','ner','tagger'])
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
word_dict = {}
word_index = 1
lemma_dict = {}
docs = nlp.pipe(train_text)
word_sequences = []
for doc in tqdm(docs):
    word_seq = []
    for token in doc:
        if (token.text not in word_dict) and (token.pos_ is not "PUNCT"):
            word_dict[token.text] = word_index
            word_index += 1
            lemma_dict[token.text] = token.lemma_
        if token.pos_ is not "PUNCT":
            word_seq.append(word_dict[token.text])
    word_sequences.append(word_seq)
del docs
gc.collect()