In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import re
import pickle
import nltk

import operator
from sklearn.cluster import KMeans

In [2]:
# Load the pickled word-vector lookup (used below via `word in glove_model`
# and `glove_model[word]`). The context manager closes the file handle that
# the previous bare open() call left dangling.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('reduce_glove_wiki.pkl', 'rb') as glove_file:
    glove_model = pickle.load(glove_file)

In [3]:
# Global tally filled in by count_words(): word -> number of occurrences.
word_count = {}

In [4]:
def count_words(data, counts=None, vocab=None):
    """Tally how often each in-vocabulary word occurs in the question columns.

    Every entry of ``data['question1']`` and ``data['question2']`` is split on
    single spaces; each token found in ``vocab`` has its counter incremented.

    Args:
        data: Mapping-like object (e.g. a DataFrame) whose ``'question1'`` and
            ``'question2'`` entries are iterables of sentences.
        counts: Dict updated in place with the tallies. Defaults to the
            module-level ``word_count``.
        vocab: Container supporting ``in``; only tokens it contains are
            counted. Defaults to the module-level ``glove_model``.

    Returns:
        None. Results are accumulated in ``counts``.
    """
    if counts is None:
        counts = word_count
    if vocab is None:
        vocab = glove_model

    for col in ['question1', 'question2']:
        print('Searching column: %s' % col)
        for i, sentence in enumerate(data[col]):
            if i % 50000 == 0:
                print("Sentence #%s" % i)

            # A single non-string entry (e.g. NaN) used to hit a bare
            # `except` + `break`, silently dropping the rest of the column.
            # Skip just that row and keep counting.
            if not isinstance(sentence, str):
                print('Skipping non-string entry at row %s: %r' % (i, sentence))
                continue

            for word in sentence.split(" "):
                if word in vocab:
                    counts[word] = counts.get(word, 0) + 1
                

In [5]:
# Read the cleaned training set and discard rows containing missing values.
data = pd.read_csv('../input/train_cleaned.csv').dropna()

In [6]:
# Accumulate word frequencies from the training questions into word_count.
count_words(data)

Searching column: question1
Sentence #0
Sentence #50000
Sentence #100000
Sentence #150000
Sentence #200000
Sentence #250000
Sentence #300000
Sentence #350000
Sentence #400000
Searching column: question2
Sentence #0
Sentence #50000
Sentence #100000
Sentence #150000
Sentence #200000
Sentence #250000
Sentence #300000
Sentence #350000
Sentence #400000


In [7]:
# Read the cleaned test set and discard rows containing missing values.
# Note: this rebinds `data`, replacing the training frame loaded earlier.
data = pd.read_csv('../input/test_cleaned.csv').dropna()

In [8]:
# Add the test questions' word frequencies on top of the training tallies
# already stored in word_count.
count_words(data)

Searching column: question1
Sentence #0
Sentence #50000
Sentence #100000
Sentence #150000
Sentence #200000
Sentence #250000
Sentence #300000
Sentence #350000
Sentence #400000
Sentence #450000
Sentence #500000
Sentence #550000
Sentence #600000
Sentence #650000
Sentence #700000
Sentence #750000
Sentence #800000
Sentence #850000
Sentence #900000
Sentence #950000
Sentence #1000000
Sentence #1050000
Sentence #1100000
Sentence #1150000
Sentence #1200000
Sentence #1250000
Sentence #1300000
Sentence #1350000
Sentence #1400000
Sentence #1450000
Sentence #1500000
Sentence #1550000
Sentence #1600000
Sentence #1650000
Sentence #1700000
Sentence #1750000
Sentence #1800000
Sentence #1850000
Sentence #1900000
Sentence #1950000
Sentence #2000000
Sentence #2050000
Sentence #2100000
Sentence #2150000
Sentence #2200000
Sentence #2250000
Sentence #2300000
Searching column: question2
Sentence #0
Sentence #50000
Sentence #100000
Sentence #150000
Sentence #200000
Sentence #250000
Sentence #300000
Sentence #3

In [9]:
# Number of distinct in-vocabulary words seen across both datasets.
len(word_count)

78887

In [10]:
# Despite the name this is a list of (word, count) pairs, most frequent first.
sorted_dict = sorted(word_count.items(), key=lambda pair: pair[1], reverse=True)

In [11]:
# Keep the 20000 most frequent (word, count) pairs.
top20k = sorted_dict[:20000]

In [12]:
# Drop the counts; the position in this list is the word's vocabulary index.
corpus_words = [token for token, _count in top20k]

In [13]:
# Size of the frequent-word vocabulary before the 'unk' token is appended.
len(corpus_words)

20000

In [14]:
# Append an 'unk' token after the frequent words, so it takes the next
# position in corpus_words (index 20000).
# NOTE(review): if 'unk' is already among the top words it now appears twice;
# also, re-running this cell appends it again. Confirm this is acceptable.
corpus_words.append('unk')

In [15]:
# Vocabulary size including the appended 'unk' token.
len(corpus_words)

20001

In [16]:
# Everything after the top 20000 entries: the less frequent (word, count) pairs.
remaining_words = sorted_dict[20000:]

In [17]:
# Keep only the word of each (word, count) pair.
remaining_words = [token for token, _count in remaining_words]

In [18]:
# Number of columns in the k-means training matrix; its rows are filled with
# vectors taken from glove_model.
dim = 50

In [19]:
# One row per infrequent word, `dim` columns, initially all zeros.
n_remaining = len(remaining_words)
kmeans_train_data = np.zeros(shape=(n_remaining, dim))

In [20]:
# Copy each infrequent word's vector into its row of the training matrix;
# row order matches remaining_words.
for i, word in enumerate(remaining_words):
    word_vector = glove_model[word]
    kmeans_train_data[i, :] = word_vector

In [21]:
# Sanity check: (number of remaining words, dim).
kmeans_train_data.shape

(58887, 50)

In [22]:
# Group the infrequent-word vectors into 20 clusters. random_state is fixed so
# the run is repeatable; verbose=1 prints the per-iteration log.
clusterer = KMeans(
    n_clusters=20,
    init='k-means++',
    random_state=0,
    verbose=1,
)
kmeans = clusterer.fit(kmeans_train_data)

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 1025791.29626
start iteration
done sorting
end inner loop
Iteration 1, inertia 985335.430646
start iteration
done sorting
end inner loop
Iteration 2, inertia 970366.780847
start iteration
done sorting
end inner loop
Iteration 3, inertia 961757.680964
start iteration
done sorting
end inner loop
Iteration 4, inertia 955886.243144
start iteration
done sorting
end inner loop
Iteration 5, inertia 952063.062496
start iteration
done sorting
end inner loop
Iteration 6, inertia 949706.529402
start iteration
done sorting
end inner loop
Iteration 7, inertia 948041.168209
start iteration
done sorting
end inner loop
Iteration 8, inertia 946862.25672
start iteration
done sorting
end inner loop
Iteration 9, inertia 946018.650684
start iteration
done sorting
end inner loop
Iteration 10, inertia 945389.664991
start iteration
done sorting
end inner loop
Iteration 11, inertia 944884.084096
start iteration
done sortin

In [24]:
# Cluster id assigned to each row of kmeans_train_data (same order as
# remaining_words).
kmeans.labels_

array([ 2, 15, 10, ..., 19, 11,  7], dtype=int32)

In [25]:
# Vocabulary index -> embedding vector.
map_index_vec = {}

In [26]:
# Frequent words (and the trailing 'unk') keep their own glove_model vector,
# keyed by their position in corpus_words.
map_index_vec.update(
    (index, glove_model[token]) for index, token in enumerate(corpus_words)
)

In [27]:
# Distinct cluster ids shifted by one, i.e. 1-based labels.
shifted_labels = kmeans.labels_ + 1
labels = list(set(shifted_labels))

In [28]:
# Store each cluster centroid after the word vectors: the 1-based label L
# lands at index 20000 + L and holds cluster_centers_[L - 1].
for label in labels:
    centroid = kmeans.cluster_centers_[label - 1]
    map_index_vec[label + 20000] = list(centroid)

In [28]:
# Persist the index -> vector table.
with open('map_index_vec.pkl', 'wb') as pkl_file:
    pickle.dump(map_index_vec, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [31]:
# Word -> vocabulary index.
map_word_index = {}

In [32]:
# Frequent words (and 'unk') map to their position in corpus_words.
map_word_index.update(
    (token, index) for index, token in enumerate(corpus_words)
)

In [33]:
# Infrequent words share the embedding row of their k-means cluster.
# map_index_vec stores cluster c's centroid at index 20001 + c (index 20000 is
# the 'unk' row), so the offset here must be 20001. The previous `+ 20000`
# pointed cluster 0 at the 'unk' vector and every other cluster at its
# neighbour's centroid.
# int() turns the numpy label into a plain Python int for the pickled mapping.
for i, word in enumerate(remaining_words):
    map_word_index[word] = int(kmeans.labels_[i]) + 20001

In [33]:
# Persist the word -> index table.
with open('map_word_index.pkl', 'wb') as pkl_file:
    pickle.dump(map_word_index, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [34]:
# Encoded question1 sentences of the training set (one index list per row).
train_question_1 = list()

In [35]:
# Reload the cleaned training set (`data` held the test set after counting)
# and discard rows containing missing values.
data = pd.read_csv('../input/train_cleaned.csv').dropna()

In [36]:
# Encode every question1 sentence as a list of vocabulary indices.
# Tokens missing from map_word_index (count_words only registers tokens that
# are in glove_model, so e.g. the empty token from a double space is absent)
# fall back to the 'unk' index instead of raising KeyError.
# The list is rebuilt rather than appended to, so re-running the cell does
# not duplicate rows.
unk_index = map_word_index['unk']
train_question_1 = [
    [map_word_index.get(word, unk_index) for word in sentence.split(" ")]
    for sentence in data['question1']
]

In [37]:
# Encoded question2 sentences of the training set (one index list per row).
train_question_2 = list()

In [38]:
# Encode every question2 sentence as a list of vocabulary indices.
# Tokens missing from map_word_index (count_words only registers tokens that
# are in glove_model, so e.g. the empty token from a double space is absent)
# fall back to the 'unk' index instead of raising KeyError.
# The list is rebuilt rather than appended to, so re-running the cell does
# not duplicate rows.
unk_index = map_word_index['unk']
train_question_2 = [
    [map_word_index.get(word, unk_index) for word in sentence.split(" ")]
    for sentence in data['question2']
]

In [39]:
# Persist the encoded training question1 sentences.
with open('train_question_1.pkl', 'wb') as pkl_file:
    pickle.dump(train_question_1, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [40]:
# Persist the encoded training question2 sentences.
with open('train_question_2.pkl', 'wb') as pkl_file:
    pickle.dump(train_question_2, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
# Values of the `is_duplicate` column of the frame currently bound to `data`
# (expected to be the training set here).
# Note: this rebinds `labels`, which earlier held the shifted cluster ids.
duplicate_flags = data['is_duplicate']
labels = duplicate_flags.values

In [7]:
# Persist the training targets.
with open('train_labels.pkl', 'wb') as pkl_file:
    pickle.dump(labels, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [34]:
# Switch `data` to the cleaned test set, discarding rows with missing values.
data = pd.read_csv('../input/test_cleaned.csv').dropna()

In [35]:
# Encode every test question1 sentence as a list of vocabulary indices.
# Tokens missing from map_word_index (count_words only registers tokens that
# are in glove_model, so e.g. the empty token from a double space is absent)
# fall back to the 'unk' index instead of raising KeyError.
unk_index = map_word_index['unk']
test_question_1 = [
    [map_word_index.get(word, unk_index) for word in sentence.split(" ")]
    for sentence in data['question1']
]

In [36]:
# Encode every test question2 sentence as a list of vocabulary indices.
# Tokens missing from map_word_index (count_words only registers tokens that
# are in glove_model, so e.g. the empty token from a double space is absent)
# fall back to the 'unk' index instead of raising KeyError.
unk_index = map_word_index['unk']
test_question_2 = [
    [map_word_index.get(word, unk_index) for word in sentence.split(" ")]
    for sentence in data['question2']
]

In [37]:
# Persist the encoded test question1 sentences.
with open('test_question_1.pkl', 'wb') as pkl_file:
    pickle.dump(test_question_1, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [38]:
# Persist the encoded test question2 sentences.
with open('test_question_2.pkl', 'wb') as pkl_file:
    pickle.dump(test_question_2, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)