https://www.kaggle.com/lystdo/quora-question-pairs/lstm-with-word2vec-embeddings

In [31]:
# import packages
import pandas as pd
import logging
import re

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from gensim.models import word2vec

In [16]:
# Input locations. DATA_DIR keeps its trailing slash because the file names
# are appended to it by plain string concatenation when the CSVs are read.
DATA_DIR = '../Data/'
TRAIN_DATA_FILE, TEST_DATA_FILE = 'train.csv', 'test.csv'

## Prepare the data

In [17]:
def question_to_words(question, remove_stopwords=False, stem_words=False):
    """Split a question into a list of lowercase word tokens.

    Parameters
    ----------
    question : str
        Raw question text. A value without a ``lower`` method (for example
        ``None`` or the float NaN that pandas uses for a missing cell) is
        treated as an empty question.
    remove_stopwords : bool, optional
        If True, drop tokens that appear in NLTK's English stop-word list.
    stem_words : bool, optional
        If True, replace each remaining token with its English Snowball stem.
        This works independently of ``remove_stopwords``.

    Returns
    -------
    list of str
        The tokens in their original order. The list is empty when the
        question has no word characters, is missing, or consists only of
        stop words while ``remove_stopwords`` is True.
    """
    # Missing cells are not strings; return an empty token list instead of
    # raising AttributeError in the middle of a DataFrame.apply.
    try:
        lowered = question.lower()
    except AttributeError:
        return []

    # \w+ keeps runs of word characters, so punctuation is dropped and acts
    # as a separator; re.UNICODE makes \w match non-ASCII letters as well.
    words = re.findall(r'\w+', lowered, flags=re.UNICODE)

    if remove_stopwords:
        # A set gives constant-time membership tests.
        stops = set(stopwords.words('english'))
        words = [word for word in words if word not in stops]

    if stem_words:
        stemmer = SnowballStemmer('english')
        words = [stemmer.stem(word) for word in words]

    return words

In [50]:
# Read the first 1000 rows of each split and keep only the two question-text columns.
df_train = pd.read_csv(DATA_DIR + TRAIN_DATA_FILE, nrows=1000).loc[:, ['question1', 'question2']]
df_test = pd.read_csv(DATA_DIR + TEST_DATA_FILE, nrows=1000).loc[:, ['question1', 'question2']]

In [51]:
# Stack the train and test questions so Word2Vec sees the vocabulary of both.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of both inputs are kept, so every label would appear twice and
# label-based lookups on `df` would be ambiguous.
df = pd.concat([df_train, df_test], ignore_index=True)

In [52]:
# Tokenise with the function's defaults (no stop-word removal, no stemming):
# for training Word2Vec it is better to keep the stop words.
df['words1'] = df['question1'].apply(question_to_words)
df['words2'] = df['question2'].apply(question_to_words)
df.head(20)

Unnamed: 0,question1,question2,words1,words2
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,"[what, is, the, step, by, step, guide, to, inv...","[what, is, the, step, by, step, guide, to, inv..."
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,"[what, is, the, story, of, kohinoor, koh, i, n...","[what, would, happen, if, the, indian, governm..."
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,"[how, can, i, increase, the, speed, of, my, in...","[how, can, internet, speed, be, increased, by,..."
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,"[why, am, i, mentally, very, lonely, how, can,...","[find, the, remainder, when, math, 23, 24, mat..."
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,"[which, one, dissolve, in, water, quikly, suga...","[which, fish, would, survive, in, salt, water]"
5,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...","[astrology, i, am, a, capricorn, sun, cap, moo...","[i, m, a, triple, capricorn, sun, moon, and, a..."
6,Should I buy tiago?,What keeps childern active and far from phone ...,"[should, i, buy, tiago]","[what, keeps, childern, active, and, far, from..."
7,How can I be a good geologist?,What should I do to be a great geologist?,"[how, can, i, be, a, good, geologist]","[what, should, i, do, to, be, a, great, geolog..."
8,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?","[when, do, you, use, シ, instead, of, し]","[when, do, you, use, instead, of, and]"
9,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,"[motorola, company, can, i, hack, my, charter,...","[how, do, i, hack, motorola, dcx3400, for, fre..."


## Word embedding

In [53]:
# Gather every tokenised question into one flat list of sentences:
# all of `words1` first, followed by all of `words2`.
sentences = df['words1'].tolist() + df['words2'].tolist()

print(len(sentences))
print(sentences[0])

4000
['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india']


In [54]:
# Word2Vec hyper-parameters (the keyword each one is passed as is noted alongside)
num_features = 300     # `size`: number of dimensions of each word vector
min_word_count = 40    # `min_count`: minimum frequency for a word to be kept
num_workers = 4        # `workers`
context = 20           # `window`
downsampling = 1e-3    # `sample`

# File name for the saved model; it records the main settings.
model_name = 'w2v_{}features_{}minwords_{}context'.format(num_features, min_word_count, context)

In [55]:
# Show gensim's INFO-level progress messages in the cell output.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')

# Passing the sentences to the constructor scans the vocabulary and then
# trains the embeddings in the same call.
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# Write the trained model to disk under the descriptive file name.
model.save(model_name)

2017-05-03 13:00:19,494 : INFO : collecting all words and their counts
2017-05-03 13:00:19,496 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-05-03 13:00:19,522 : INFO : collected 6395 word types from a corpus of 44609 raw words and 4000 sentences
2017-05-03 13:00:19,523 : INFO : Loading a fresh vocabulary
2017-05-03 13:00:19,531 : INFO : min_count=40 retains 118 unique words (1% of original 6395, drops 6277)
2017-05-03 13:00:19,534 : INFO : min_count=40 leaves 24710 word corpus (55% of original 44609, drops 19899)
2017-05-03 13:00:19,537 : INFO : deleting the raw counts dictionary of 6395 items
2017-05-03 13:00:19,540 : INFO : sample=0.001 downsamples 74 most-common words
2017-05-03 13:00:19,541 : INFO : downsampling leaves estimated 9622 word corpus (38.9% of prior 24710)
2017-05-03 13:00:19,542 : INFO : estimated required memory for 118 words and 300 dimensions: 342200 bytes
2017-05-03 13:00:19,545 : INFO : resetting layer weights
2017-05-03 13:00:19

In [56]:
# Words whose vectors are closest to 'best'. The query goes through
# `model.wv`, the object holding the trained word vectors: calling
# most_similar on the model itself is deprecated and was removed in gensim 4.
model.wv.most_similar('best')

2017-05-03 13:00:27,072 : INFO : precomputing L2-norms of word weight vectors


[('of', 0.9998316168785095),
 ('by', 0.9998179078102112),
 ('in', 0.9998096823692322),
 ('is', 0.9998082518577576),
 ('most', 0.9998064041137695),
 ('s', 0.9998056888580322),
 ('all', 0.999797523021698),
 ('are', 0.9997949600219727),
 ('some', 0.9997909069061279),
 ('which', 0.9997897148132324)]