In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import re
import pickle
import nltk

from gensim.models import word2vec

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [3]:
# Work on a 50,000-row random sample of the training file; random_state pins
# the sample so a re-run selects the same rows.
data = pd.read_csv('../input/train.csv').sample(50000, random_state=23)
# Called without a language argument, so the stop words of every language in
# the NLTK stopwords corpus are included. Stored as a frozenset so that the
# per-word `in STOP_WORDS` test is a hash lookup instead of a list scan.
# NOTE(review): the sample rows shown below look English-only -- consider
# stopwords.words('english') if that holds for the whole file.
STOP_WORDS = frozenset(nltk.corpus.stopwords.words())

In [4]:
def clean_sentence(val, stop_words=None):
    """Strip punctuation, lowercase, and drop stop words from one sentence.

    Every run of characters that is neither whitespace nor a word character
    is deleted (underscores are deleted too), the text is lowercased, and the
    tokens that are not stop words are re-joined with single spaces.

    Args:
        val: the raw sentence (a string).
        stop_words: optional container of lowercase words to drop. When
            omitted, the module-level STOP_WORDS is used.

    Returns:
        The cleaned sentence as one space-separated string; an empty string
        when nothing survives the filtering.
    """
    if stop_words is None:
        stop_words = STOP_WORDS

    # Raw string: in a plain literal '\s' and '\w' are invalid escape
    # sequences, which newer Python versions warn about.
    stripped = re.sub(r'([^\s\w]|_)+', '', val).lower()

    # split() without an argument splits on runs of whitespace, so deleting a
    # free-standing punctuation mark ("a - b") no longer leaves an
    # empty-string token behind the way split(" ") did.
    kept = [word for word in stripped.split() if word not in stop_words]
    return " ".join(kept)

def clean_dataframe(data):
    """Drop incomplete rows and clean both question columns.

    Args:
        data: DataFrame with 'question1' and 'question2' string columns.

    Returns:
        A new DataFrame without the rows that had a NaN in any column, whose
        'question1' and 'question2' values have been passed through
        clean_sentence.
    """
    # dropna returns a new frame, so the column assignments below are made on
    # that result rather than on the frame the caller passed in.
    data = data.dropna(how="any")

    for col in ['question1', 'question2']:
        # Call form of print: the statement form is a SyntaxError on Python 3,
        # and the rest of the notebook already uses print(...).
        print('Cleaning col ' + col)
        data[col] = data[col].apply(clean_sentence)

    return data

# Rebind `data` to its cleaned version (NaN rows dropped, both question
# columns normalised by clean_dataframe); the prints bracket the slow step.
print('Cleaning data...')
data = clean_dataframe(data)
print('Data cleaning done...')


Cleaning data...
Cleaning col question1
Cleaning col question2
Data cleaning done...


In [5]:
# Preview the first five rows after cleaning.
data[:5]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
237921,237921,9732,79801,sex necessary relationship,sex important good relationship,1
181001,181001,277377,277378,inspiring start stories,inspirational stories ever,0
294691,294691,150129,93109,best way digital marketing,best unique ways digital marketing,1
104145,104145,171986,171987,best way grow facebook fan page,get followers facebook page,1
357893,357893,487310,487311,suppose host sends two tcp segments back back ...,suppose host sends two tcp segments back back ...,1


In [6]:
def build_corpus(data):
    """Collect the tokenised sentences of both question columns.

    Args:
        data: DataFrame with 'question1' and 'question2' columns holding
            whitespace-separated strings.

    Returns:
        A list of token lists: one entry per 'question1' value in row order,
        followed by one entry per 'question2' value. A sentence without any
        token contributes an empty list.
    """
    corpus = []
    for col in ['question1', 'question2']:
        # Iterating a Series yields its values directly; Series.iteritems()
        # is no longer available in pandas 2.x.
        for sentence in data[col]:
            # split() skips runs of whitespace, so no '' token reaches the
            # corpus (split(" ") produced one per doubled or leading space).
            corpus.append(sentence.split())
    return corpus

In [7]:
# Tokenise both question columns into the list-of-token-lists that Word2Vec
# consumes.
print('Building corpus for word2vec...')
corpus = build_corpus(data)
print('Corpus ready...')

# Train the embeddings on the cleaned questions. In this gensim API `size` is
# the vector dimensionality, `window` the context width and `workers` the
# number of training threads.
# NOTE(review): no seed is passed and training is multi-threaded, so the
# vectors are presumably not reproducible between runs -- confirm whether
# that matters downstream.
print('Running word2vec...')
model = word2vec.Word2Vec(corpus, size=100, window=20, workers=4)
print('Word2Vec done...')


Building corpus for word2vec...
Corpus ready...
Running word2vec...
Word2Vec done...


In [8]:
def build_training_data(data):
    """Pair up the tokenised questions of every row.

    Args:
        data: DataFrame with 'question1' and 'question2' columns holding
            whitespace-separated strings.

    Returns:
        A list with one entry per row, in row order; each entry is the
        two-element list [question1_tokens, question2_tokens].
    """
    # Zip the column values directly (Series.iteritems() is gone in
    # pandas 2.x); split() keeps empty-string tokens out of the token lists.
    return [
        [sentence1.split(), sentence2.split()]
        for sentence1, sentence2 in zip(data['question1'], data['question2'])
    ]

# Token-list pairs for every row.
# NOTE(review): training_corpus is not referenced again in the visible part
# of this notebook.
training_corpus = build_training_data(data)

In [9]:
# Inspect the cleaned question1 strings as a NumPy array.
data.question1.values

array(['sex necessary relationship', 'inspiring start stories',
       'best way digital marketing', ...,
       'cant uninstall cm security app mobile uninstall app pc',
       'calculate load ups', 'examples taproot plants'], dtype=object)

In [10]:
def add_col(data, model, col_name, new_col_name, length=0):
    """Add a word-vector column, or an in-vocabulary token count, to ``data``.

    Each sentence in ``data[col_name]`` is split on whitespace and only the
    tokens known to ``model.wv`` are used.

    Args:
        data: DataFrame to extend; the new column is set on this object.
        model: object exposing its trained vectors as ``model.wv``, which
            must support ``word in model.wv`` and ``model.wv[word]``.
        col_name: name of the column holding the sentences.
        new_col_name: name of the column to create or overwrite.
        length: when truthy, store the number of in-vocabulary tokens of
            each sentence instead of the vectors themselves.

    Returns:
        The same ``data`` object with ``new_col_name`` set. In vector mode
        each cell holds one array per sentence with one row per
        in-vocabulary token; a sentence without any known token gets an
        empty array.
    """
    # Membership and lookup both go through model.wv; the model-level
    # `vocab` attribute used before is not available in current gensim.
    vectors = model.wv
    sentences = data[col_name].values

    if length:
        # Only the count is needed here, so no vectors are materialised.
        data[new_col_name] = np.array([
            sum(1 for word in sentence.split() if word in vectors)
            for sentence in sentences
        ])
        return data

    # Fill an object array slot by slot: handing the list of per-sentence
    # arrays to np.array() fails on ragged input in recent NumPy and turns
    # into one 3-D array when every sentence has the same token count.
    column = np.empty(len(sentences), dtype=object)
    for i, sentence in enumerate(sentences):
        column[i] = np.array(
            [vectors[word] for word in sentence.split() if word in vectors])

    data[new_col_name] = column
    return data

In [11]:
# vec1: per-row array of word vectors for the tokens of question1.
data = add_col(data, model, 'question1', 'vec1') 

In [12]:
# vec2: per-row array of word vectors for the tokens of question2.
data = add_col(data, model, 'question2', 'vec2') 

In [13]:
# len1 / len2: with length=1 add_col stores the number of vectors per
# question instead of the vectors themselves.
data = add_col(data, model, 'question1', 'len1', length=1) 
data = add_col(data, model, 'question2', 'len2', length=1) 

In [14]:
# Preview the first five rows with the new vec1/vec2/len1/len2 columns.
data[:5]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,vec1,vec2,len1,len2
237921,237921,9732,79801,sex necessary relationship,sex important good relationship,1,"[[-0.703642, 0.0172268, -0.999159, -1.07369, 1...","[[-0.703642, 0.0172268, -0.999159, -1.07369, 1...",3,4
181001,181001,277377,277378,inspiring start stories,inspirational stories ever,0,"[[0.0174034, 0.0591483, -0.168834, -0.172296, ...","[[-0.0313487, 0.073989, -0.304423, -0.321687, ...",3,3
294691,294691,150129,93109,best way digital marketing,best unique ways digital marketing,1,"[[0.679356, -0.274707, -0.104487, -1.28028, -1...","[[0.679356, -0.274707, -0.104487, -1.28028, -1...",4,5
104145,104145,171986,171987,best way grow facebook fan page,get followers facebook page,1,"[[0.679356, -0.274707, -0.104487, -1.28028, -1...","[[0.332248, 0.943302, 0.209303, -0.0485773, 1....",6,4
357893,357893,487310,487311,suppose host sends two tcp segments back back ...,suppose host sends two tcp segments back back ...,1,"[[-0.0194233, 0.104621, -0.211908, -0.235154, ...","[[-0.0194233, 0.104621, -0.211908, -0.235154, ...",19,19


In [15]:
# Pull the two vector columns and the duplicate flag out of the frame as
# NumPy arrays, in that order.
vec1, vec2, labels = (
    data[column].values for column in ('vec1', 'vec2', 'is_duplicate')
)

In [168]:
class SimpleDataIterator():
    """Serve shuffled mini-batches of (vec1, vec2, is_duplicate) columns.

    The frame is reshuffled on construction and again every time the rows
    left in the current pass cannot fill the requested batch.
    """

    def __init__(self, df):
        """Keep ``df``, record its row count and shuffle it once.

        Args:
            df: DataFrame with 'vec1', 'vec2' and 'is_duplicate' columns.
        """
        self.df = df
        self.size = len(self.df)
        # Number of times the data has been reshuffled by next_batch.
        self.epochs = 0
        self.shuffle()

    def shuffle(self):
        """Randomly reorder the rows, renumber them from 0, rewind the cursor."""
        self.df = self.df.sample(frac=1).reset_index(drop=True)
        self.cursor = 0

    def next_batch(self, n):
        """Return the next ``n`` rows as three Series.

        Args:
            n: requested batch size.

        Returns:
            A tuple (vec1, vec2, is_duplicate) of Series sliced from the
            shuffled frame. If ``n`` exceeds the total number of rows the
            batch holds every row and is therefore shorter than ``n``.
        """
        # Reshuffle as soon as fewer than n rows remain. The previous test
        # (cursor + n - 1 > size) let through the case where exactly one row
        # was missing and handed back a batch of n - 1 rows.
        if self.cursor + n > self.size:
            self.epochs += 1
            self.shuffle()
        # Positional slice; after reset_index(drop=True) this selects the
        # same rows as the inclusive label slice cursor .. cursor + n - 1
        # that the removed `.ix` indexer was used for.
        res = self.df.iloc[self.cursor:self.cursor + n]
        self.cursor += n
        return res['vec1'], res['vec2'], res['is_duplicate']

In [169]:
# Wrap the feature frame in the batch iterator (this shuffles a copy of it).
it = SimpleDataIterator(data)

In [170]:
# Draw one batch of three rows; d is the tuple (vec1, vec2, is_duplicate).
d = it.next_batch(3)

In [173]:
# Show the vec1 Series of that batch.
print(d[0])

0    [[0.0405326, 0.257707, -0.410567, -0.326391, -...
1    [[0.341673, 0.422072, 0.00380491, -0.0532508, ...
2    [[0.315736, 0.215481, 0.130308, -0.117091, 0.2...
Name: vec1, dtype: object


In [178]:
class PaddedDataIterator(SimpleDataIterator):
    """Batch iterator that zero-pads both questions to one common length."""

    def next_batch(self, n):
        """Return the next ``n`` rows with both questions zero-padded.

        Args:
            n: requested batch size.

        Returns:
            A tuple (x1, x2, labels, len1, len2). x1 and x2 are float32
            arrays of shape (rows, maxlen, dim) holding the 'vec1' and 'vec2'
            vectors followed by zero rows, where maxlen is the largest value
            of 'len1'/'len2' in the batch. labels, len1 and len2 are the
            'is_duplicate', 'len1' and 'len2' Series of the batch.
        """
        if self.cursor + n > self.size:
            self.epochs += 1
            self.shuffle()
        # Positional slice; equivalent to the inclusive label slice the
        # removed `.ix` indexer performed on the renumbered index.
        res = self.df.iloc[self.cursor:self.cursor + n]
        self.cursor += n

        # Pad sequences with 0s so they are all the same length
        maxlen = int(max(max(res['len1']), max(res['len2'])))
        x1 = self._pad(res['vec1'].values, res['len1'].values, maxlen)
        x2 = self._pad(res['vec2'].values, res['len2'].values, maxlen)
        return x1, x2, res['is_duplicate'], res['len1'], res['len2']

    @staticmethod
    def _pad(vectors, lengths, maxlen):
        """Stack per-sentence vector arrays into one zero-padded 3-D array.

        Assumes each non-empty entry of ``vectors`` is a 2-D array with
        ``lengths[i]`` rows, as produced by add_col.
        """
        # Take the embedding width from the first non-empty sentence; it
        # stays 0 when every sentence in the batch is empty.
        dim = 0
        for vec in vectors:
            if np.ndim(vec) == 2:
                dim = vec.shape[1]
                break

        # float32, not int32: an integer buffer would truncate the
        # real-valued word vectors.
        padded = np.zeros([len(vectors), maxlen, dim], dtype=np.float32)
        for i, (vec, count) in enumerate(zip(vectors, lengths)):
            if count:
                padded[i, :count] = vec
        return padded

In [16]:
# Persist the two vector arrays and the labels, one pickle file each,
# written in the order listed.
for filename, payload in [
        ('column1_vec_representations.pkl', vec1),
        ('column2_vec_representations.pkl', vec2),
        ('labels.pkl', labels),
]:
    with open(filename, 'wb') as output:
        pickle.dump(payload, output, pickle.HIGHEST_PROTOCOL)
    

In [None]:
# Write the whole frame as comma-separated text.
# NOTE(review): if `data` still carries the vec1/vec2 columns at this point,
# their cells are NumPy arrays, which CSV can only hold as text -- presumably
# not loadable back as arrays; confirm before relying on this file.
data.to_csv('../input/train_preprocessed', sep=',')

In [17]:
# Save the trained Word2Vec model to disk.
model.save('word2vec_model.model')

In [18]:
# Load the model back from the file written by model.save.
a = word2vec.Word2Vec.load('word2vec_model.model')

In [19]:
# Look the vector up through the model's `.wv` attribute, as add_col does;
# indexing the model object itself is deprecated and no longer supported in
# gensim 4.
a.wv['trump']

array([-1.72673047,  1.97488725, -0.73923886, -0.70534426,  2.37984324,
        0.4549852 , -1.58750951, -0.71555549,  1.8154124 , -0.36023772,
        3.04060483, -1.8609041 , -0.99346739,  0.65570825,  0.38605696,
       -0.42184335,  1.56381726,  0.92177492, -1.6276325 , -0.38147801,
       -1.3439889 ,  1.12840378,  1.30797195,  1.71170878,  0.23091172,
       -0.54669994,  0.69720364, -0.4015339 ,  1.56002724, -0.23599894,
        1.50938451, -1.61150014,  1.2631675 ,  0.15949842,  1.0134598 ,
       -1.02654815, -2.31452346, -0.6969772 , -1.24678707,  2.37837386,
       -1.92408228,  1.33331895, -1.91136456,  0.13270813, -0.40521616,
       -0.33205706,  0.59571409, -0.93542624,  0.34086499, -0.16417426,
       -0.13299473, -1.73086333,  0.05750199,  0.41896284, -1.44527817,
       -0.81394243,  1.24727678, -1.08212066,  1.58733022, -1.39493787,
        0.42367527, -1.25193429, -0.61892807,  4.32673168,  0.80436039,
       -0.58575404, -1.12379098,  1.18451834, -0.73776513, -1.56