In [104]:
#this notebook takes the Kaggle question data and processes it,
#removing rare words from the questions and replacing them with
#'unknown word' tokens

#the 'unknown' tokens are numbered within each question pair, so we
#can tell if the unknown words are the same or different

import numpy as np 
import pandas as pd
import os, copy

from keras.preprocessing import text
from sklearn.model_selection import train_test_split

from keras.preprocessing import sequence
from keras.models import Sequential

from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
from keras.optimizers import RMSprop
from keras.engine.topology import Merge

# Load the pre-processed training pairs; missing questions become empty strings.
train_df = pd.read_csv('train_proc.csv', encoding='utf-8').fillna("")

X = train_df[['question1', 'question2']]
y = train_df['is_duplicate']

# Questions are encoded to UTF-8 byte strings
# (presumably because process_qs uses the byte-string form of translate -- confirm).
q1s = X['question1'].map(lambda q: q.encode('utf-8')).tolist()
q2s = X['question2'].map(lambda q: q.encode('utf-8')).tolist()
all_questions = q1s + q2s

# Fit the tokenizer on every question from both columns to obtain word counts.
tok = text.Tokenizer()
tok.fit_on_texts(all_questions)

In [105]:
# A word is kept verbatim only if the tokenizer counted it strictly more than
# this many times; every other word is treated as rare.
MIN_WORD_COUNT = 100

# Stored as a set: process_qs tests `word not in freq_words` for every token of
# every question, which is a linear scan on a list but a hash lookup on a set.
freq_words = {word for word in tok.word_index if tok.word_counts[word] > MIN_WORD_COUNT}

In [110]:
import string

# Characters deleted from questions before they are split into words: all ASCII
# punctuation except apostrophes (for the word "'s") and slashes.
# Built by filtering rather than with the two-argument str.translate(None, ...),
# which only exists on Python 2 byte strings; the resulting value is identical.
punc = ''.join(ch for ch in string.punctuation if ch not in "'/")
punc

'!"#$%&()*+,-.:;<=>?@[\\]^_`{|}~'

In [111]:
def _mask_rare_words(tokens, unknown_ids):
    """Return a new list where every token outside ``freq_words`` is replaced.

    ``unknown_ids`` maps each rare word seen so far to its 1-based placeholder
    number. It is updated in place, so passing the same dict for both questions
    of a pair keeps the numbering consistent between them.
    """
    masked = []
    for word in tokens:
        if word in freq_words:
            masked.append(word)
        else:
            # First sighting gets the next free number; repeats reuse theirs.
            number = unknown_ids.setdefault(word, len(unknown_ids) + 1)
            masked.append('unknownword' + str(number))
    return masked


def process_qs(x1,x2):
    """Replace the rare words of a question pair with numbered placeholders.

    Each question is lower-cased, stripped of the characters in the global
    ``punc`` and split on single spaces. Tokens that are not in the global
    ``freq_words`` become ``'unknownword<N>'``. N starts at 1 and is shared by
    both questions, in order of first appearance (x1 first, then x2), so the
    same rare word maps to the same placeholder everywhere in the pair.

    Args:
        x1: first question. Must support ``translate(None, punc)``, i.e. a
            Python 2 byte string.
        x2: second question, same type as ``x1``.

    Returns:
        A tuple ``(tokens1, tokens2)`` of token lists.
    """
    # NOTE(review): split(' ') yields '' for consecutive spaces (e.g. where a
    # free-standing punctuation mark was deleted) and for an empty question.
    # '' is presumably never in freq_words, so it is masked like a rare word.
    # Kept as-is because the generated CSVs depend on it -- confirm intent.
    tokens1 = x1.lower().translate(None,punc).split(' ')
    tokens2 = x2.lower().translate(None,punc).split(' ')

    unknown_ids = {}
    masked1 = _mask_rare_words(tokens1, unknown_ids)
    masked2 = _mask_rare_words(tokens2, unknown_ids)

    return masked1, masked2

In [112]:
# Smoke test with invented words: the same made-up word should receive the same
# placeholder number in both questions.
test_qs = [
    'What is the meaning of fnarps? Where can a snergle buy fnarps?',
    'Why do snergles talk about fnarps so much? Why are snergles so sqift?',
]
print (process_qs(test_qs[0], test_qs[1]))

(['what', 'is', 'the', 'meaning', 'of', 'unknownword1', 'where', 'can', 'a', 'unknownword2', 'buy', 'unknownword1'], ['why', 'do', 'unknownword3', 'talk', 'about', 'unknownword1', 'so', 'much', 'why', 'are', 'unknownword3', 'so', 'unknownword4'])


In [113]:
# Scratch check: list.index() returns a 0-based position ('fnarps' -> 0).
# This binds a global named `seen`; it is separate from any local variable
# inside process_qs.
seen = ['fnarps','snergle']
seen.index('fnarps')

0

In [114]:
# Mask rare words in every training pair; each entry is a (tokens1, tokens2) tuple.
processed_qs = [process_qs(first, second) for first, second in zip(q1s, q2s)]

In [115]:
# Inspect the masked tokens of the pair at index 1.
processed_qs[1]

(['what',
  'is',
  'the',
  'story',
  'of',
  'unknownword1',
  'unknownword1',
  'diamond'],
 ['what',
  'would',
  'happen',
  'if',
  'the',
  'indian',
  'government',
  'unknownword2',
  'the',
  'unknownword1',
  'unknownword1',
  'diamond',
  'back'])

In [116]:
# Show the raw questions at index 1 for comparison with the masked tokens.
print(q1s[1],q2s[1])

('What is the story of Kohinoor (Koh-i-Noor) Diamond?', 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?')


In [117]:
# Re-join each token list into a space-separated string: one [q1, q2] list per pair.
processed_qs_str = [[' '.join(tokens1), ' '.join(tokens2)] for tokens1, tokens2 in processed_qs]

In [118]:
# Inspect the re-joined strings of the first pair.
processed_qs_str[0]

['what is the step by step guide to invest in share market in india',
 'what is the step by step guide to invest in share market']

In [119]:
# One row per question pair, with the two masked questions as columns.
df = pd.DataFrame(processed_qs_str, columns=['q1', 'q2'])

In [120]:
# Preview the first rows of the masked-question frame.
df.head()

Unnamed: 0,q1,q2
0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...
1,what is the story of unknownword1 unknownword1...,what would happen if the indian government unk...
2,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...
3,why am i mentally very lonely how can i solve it,find the remainder when unknownword1 unknownwo...
4,which one unknownword1 in water unknownword2 s...,which fish would survive in salt water


In [121]:
# Masked questions ('*_uk' columns) joined on the row index with the original
# training columns. The frame is rebuilt from processed_qs_str rather than
# renamed and joined in place, so this cell can be re-run safely: the in-place
# version raised on a second run, when two column names were assigned to the
# already-joined frame.
df = pd.DataFrame(processed_qs_str, columns=['question1_uk', 'question2_uk']).join(train_df)
df.head()

Unnamed: 0,question1_uk,question2_uk,id,qid1,qid2,question1,question2,is_duplicate
0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,what is the story of unknownword1 unknownword1...,what would happen if the indian government unk...,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,why am i mentally very lonely how can i solve it,find the remainder when unknownword1 unknownwo...,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[ / math]...,0
4,which one unknownword1 in water unknownword2 s...,which fish would survive in salt water,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [122]:
# Persist the masked questions with their labels. No index=False is passed, so
# the row index is written as the first CSV column.
df.loc[:, ['question1_uk', 'question2_uk', 'is_duplicate']].to_csv('training_unknowns.csv')

In [123]:
# Number of words kept verbatim (those not replaced by placeholders).
len(freq_words)

5578

In [132]:
#now do test set

# Same preparation as for the training data. Note that X, q1s, q2s and
# processed_qs are rebound here and refer to the test set from now on.
test_df = pd.read_csv('test_proc.csv', encoding='utf-8').fillna("")
X = test_df[['question1', 'question2']]

q1s = X['question1'].map(lambda q: q.encode('utf-8')).tolist()
q2s = X['question2'].map(lambda q: q.encode('utf-8')).tolist()

processed_qs = [process_qs(first, second) for first, second in zip(q1s, q2s)]

In [133]:
# Re-join the masked test-set tokens into space-separated strings, one [q1, q2] per pair.
processed_qs_str = [[' '.join(tokens1), ' '.join(tokens2)] for tokens1, tokens2 in processed_qs]

In [135]:
# Masked test questions, written without the row index.
df = pd.DataFrame(processed_qs_str, columns=['q1', 'q2'])

df.to_csv('test_unknowns.csv', index=False)

In [131]:
# Number of processed question pairs.
# NOTE(review): this cell's execution count (In [131]) precedes the test-set
# cell (In [132]), so the value shown was presumably computed on the training
# pairs -- confirm by re-running the notebook top to bottom.
len(processed_qs)

404290