In [1]:
import pandas as pd
import numpy as np
import spacy
import time
from tqdm import tqdm
import gc
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lc = LancasterStemmer()
from nltk.stem import SnowballStemmer
sb = SnowballStemmer("english")

In [2]:
start_time = time.time()
train = pd.read_hdf('../input/train.h5')[:100]
test = pd.read_hdf('../input/test.h5')[:100]
train_text = train['comment_text']
test_text = test['comment_text']
text_list = pd.concat([train_text, test_text])
y = train['target'].values
num_train_data = y.shape[0]
print("--- %s seconds ---" % (time.time() - start_time))

--- 2.2707815170288086 seconds ---


In [3]:
print(len(train))
print(len(test))

100
100


In [4]:
# Third place solution
# seems like the kind of tokenizer you use doesn't matter too much
start_time = time.time()
print("Spacy NLP ...")
nlp = spacy.load('en_core_web_lg', disable=['parser','ner','tagger'])
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
word_dict = {}
word_index = 1
lemma_dict = {}
docs = nlp.pipe(text_list, n_threads = 2)
word_sequences = []
for doc in tqdm(docs):
    word_seq = []
    for token in doc:
        if (token.text not in word_dict) and (token.pos_ is not "PUNCT"):
            word_dict[token.text] = word_index
            word_index += 1
            lemma_dict[token.text] = token.lemma_
        if token.pos_ is not "PUNCT":
            word_seq.append(word_dict[token.text])
    word_sequences.append(word_seq)
del docs
gc.collect()
# test and train_word_sequences are mapped numeric values for the words/characters
train_word_sequences = word_sequences[:num_train_data]
test_word_sequences = word_sequences[num_train_data:]
print("--- %s seconds ---" % (time.time() - start_time))

Spacy NLP ...


200it [00:00, 1364.86it/s]

--- 7.462452411651611 seconds ---





In [5]:
lemma_dict

{'This': 'This',
 'is': 'be',
 'so': 'so',
 'cool': 'cool',
 '.': '.',
 'It': '-PRON-',
 "'s": 'have',
 'like': 'like',
 ',': ',',
 "'": "'",
 'would': 'would',
 'you': 'you',
 'want': 'want',
 'your': 'your',
 'mother': 'mother',
 'to': 'to',
 'read': 'read',
 'this': 'this',
 '?': '?',
 'Really': 'Really',
 'great': 'great',
 'idea': 'idea',
 'well': 'good',
 'done': 'do',
 '!': '!',
 'Thank': 'Thank',
 'make': 'make',
 'my': 'my',
 'life': 'life',
 'a': 'a',
 'lot': 'lot',
 'less': 'little',
 'anxiety': 'anxiety',
 '-': '-',
 'inducing': 'induce',
 'Keep': 'Keep',
 'it': 'it',
 'up': 'up',
 'and': 'and',
 'do': 'do',
 "n't": 'not',
 'let': 'let',
 'anyone': 'anyone',
 'get': 'get',
 'in': 'in',
 'way': 'way',
 'such': 'such',
 'an': 'a',
 'urgent': 'urgent',
 'design': 'design',
 'problem': 'problem',
 ';': ';',
 'kudos': 'kudo',
 'for': 'for',
 'taking': 'take',
 'on': 'on',
 'Very': 'Very',
 'impressive': 'impressive',
 'Is': 'Is',
 'something': 'something',
 'I': '-PRON-',
 "'ll"

In [17]:
from gensim.models import Word2Vec
def load_glove(word_dict, lemma_dict):
    EMBEDDING_FILE = '../../quora/input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    
    print("loading embedding file")
    start_time = time.time()
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in Word2Vec.load(EMBEDDING_FILE))
    print("--- %s seconds ---" % (time.time() - start_time))
    embed_size = 300
    nb_words = len(word_dict)+1
    embedding_matrix = np.zeros((nb_words, embed_size), dtype=np.float32)
    unknown_vector = np.zeros((embed_size,), dtype=np.float32) - 1.
    print(unknown_vector[:5])
    for key in tqdm(word_dict):
        word = key
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = key.lower()
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = key.upper()
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = key.capitalize()
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = ps.stem(key)
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = lc.stem(key)
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = sb.stem(key)
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        word = lemma_dict[key]
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        if len(key) > 1:
            #word = correction(key)
            word = key
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[word_dict[key]] = embedding_vector
                continue
        embedding_matrix[word_dict[key]] = unknown_vector                    
    return embedding_matrix, nb_words 


# takes 81 seconds with open
start_time = time.time()
print("Loading embedding matrix ...")
embedding_matrix_glove, nb_words = load_glove(word_dict, lemma_dict)
#embedding_matrix_fasttext, nb_words = load_fasttext(word_dict, lemma_dict)
#embedding_matrix = np.concatenate((embedding_matrix_glove, embedding_matrix_fasttext), axis=1)
print("--- %s seconds ---" % (time.time() - start_time))

Loading embedding matrix ...
loading embedding file


UnpicklingError: invalid load key, ','.

In [14]:
from gensim import utils

def load_word2vec(fname, encoding='utf8', unicode_errors='strict',datatype=np.float32, max_vocab=3000000, word_index=None):
    #emb_mean,emb_std = -0.0051106834, 0.18445626
    #embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, 300))
    embedding_index = {}
    start_time = time.time()
    with utils.smart_open(fname) as fin:
        print("--- %s seconds ---" % (time.time() - start_time))
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())
        binary_len = np.dtype(datatype).itemsize * vector_size
        
        for _ in tqdm(range(min(vocab_size,max_vocab))):
            # mixed text and binary: read text first, then binary
            word = []
            while True:
                ch = fin.read(1)
                if ch == b' ':
                    break
                if ch == b'':
                    raise EOFError("unexpected end of input")
                if ch != b'\n':
                    word.append(ch)
            word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
            weights = np.fromstring(fin.read(binary_len), dtype=datatype).astype(np.float16)
            embedding_index[word] = weights
    return embedding_index
start_time = time.time()
load_word2vec('../../quora/input/embeddings/glove.840B.300d/glove.840B.300d.txt')


--- 0.0007839202880859375 seconds ---


ValueError: invalid literal for int() with base 10: ','

In [13]:
print("asd")

asd
