In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('nlp_hackathon_search.csv')

In [3]:
df.columns

Index(['document', 'question', 'title', 'document_id'], dtype='object')

In [4]:
df.sample(10)

Unnamed: 0,document,question,title,document_id
60077,The Victorian era in particular became notorio...,Mid-18th century London had how many domestic ...,Child_labour,428e353a1bfe11ea8f2b656571b1b549
34497,"The governments in Berlin, Bremen and Hamburg ...",The parliament in Berlin is called what?,States_of_Germany,4291083c1bfe11ea8f2b656571b1b549
20242,The Allies offered peace terms in the Frankfur...,The Allies suggested peace terms in which set ...,Napoleon,428aa9421bfe11ea8f2b656571b1b549
12238,"By 1878, because of the growing popularity of ...",What were the names of the four avenues that s...,"Atlantic_City,_New_Jersey",4271fb541bfe11ea8f2b656571b1b549
51698,"Before the war, many observers believed the US...",What equipment did the Iraqi army possess?,Military_history_of_the_United_States,4271001e1bfe11ea8f2b656571b1b549
61219,"In November 1945, Eisenhower returned to Washi...",What city did Eisenhower notably visit in 1945?,Dwight_D._Eisenhower,427c16521bfe11ea8f2b656571b1b549
61894,"Dante Alighieri's Divine Comedy, written in th...",What is one of Boccaccio's works that helped p...,Late_Middle_Ages,4273c6aa1bfe11ea8f2b656571b1b549
39772,The 1903 advent of heavier-than-air fixed-wing...,What did the Imperial Japanese Navy Wakamiya c...,Aircraft_carrier,428a6cac1bfe11ea8f2b656571b1b549
61864,The origins of the people of Tuvalu are addres...,How many years ago did migrations of people ha...,Tuvalu,42929cba1bfe11ea8f2b656571b1b549
56269,YouTube entered into a marketing and advertisi...,"When did youtube launch the version of ""shows""...",YouTube,4298b9741bfe11ea8f2b656571b1b549


In [5]:
from tensorflow.contrib import learn

  from ._conv import register_converters as _register_converters


In [6]:
import numpy as np

In [7]:
text = df['question']
document_id = df['document_id']

In [8]:
import re
def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9 ]", " ", string)
    string = re.sub(r"\'s", "", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [9]:
text = [clean_str(str(t)) for t in text]

In [10]:
text

['what kind of scores did twilight princess receive from many video game review sources',
 'what is the date that cyrpus attained independence',
 'how long did lord salisbury remain as prime minister',
 'what is the purpose of antibiotic treatment',
 'what other species can be seen close to the shores of norfolk island',
 'whit what donors does zinc form stable complexes',
 'how many companies did a judge say infringed on dr moustakas s prior blue light patent in 2015',
 'when was peterson born',
 'in what country is butrint',
 'what are british prime ministers part of that grants them the title right honourable',
 'in what pattern is data stored on a cd',
 'what was believed would happen if the chinese entered the conflict',
 'what was the result of a child having an african mother',
 'the bombers used what out of desperation',
 'which street runs from fdr drive to eleventh avenue',
 'what type of motor was used in trailway traction applications',
 'what serves as the collective head 

In [11]:
max_doc_length = max([len(x.split(' ')) for x in text])

In [12]:
max_doc_length

40

In [13]:
vocab_processor = learn.preprocessing.VocabularyProcessor(max_doc_length)

Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.


In [14]:
vocab = np.array(list(vocab_processor.fit_transform(text)))

Instructions for updating:
Please use tensorflow/transform or tf.data.


In [15]:
vocab_dictionary = vocab_processor.vocabulary_._mapping

In [16]:
list(vocab_dictionary.values())[-1]

31305

In [17]:
class Node:
    def __init__(self, size):
        self.doc_id = None
        self.lst = None
        
class Trie:
    def __init__(self, dictionary):
        self.vocab_size = list(vocab_dictionary.values())[-1]
        self.trienode = [Node(self.vocab_size)] * self.vocab_size
        self.vocab_dictionary = dictionary
        
    def getIdxs(self,string):
        idxs = []
        for w in string.split():
            idxs.append(self.vocab_dictionary[w])
        return idxs
        
    def insert(self, idxs, document_id):
        trienode = self.trienode
        for i in range(len(idxs)):
            if i == len(idxs) - 1:
                trienode[idxs[i]].doc_id = document_id
            trienode[idxs[i]].lst =  [Node(self.vocab_size)] * self.vocab_size
            trienode = trienode[idxs[i]].lst
    
    def find(self,idxs):
        trienode = self.trienode
        for i in range(len(idxs)):
            if i == len(idxs) - 1:
                doc_id = trienode[idxs[i]].doc_id
                return doc_id
            else:
                trienode = trienode[idxs[i]].lst

In [18]:
trie = Trie(vocab_dictionary)

In [19]:
for t,d in zip(text[:100],document_id[:100]):
    print(t)
    idxs = trie.getIdxs(t)
    print(idxs)
    trie.insert(idxs,d)

what kind of scores did twilight princess receive from many video game review sources
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
what is the date that cyrpus attained independence
[1, 15, 16, 17, 18, 19, 20, 21]
how long did lord salisbury remain as prime minister
[22, 23, 5, 24, 25, 26, 27, 28, 29]
what is the purpose of antibiotic treatment
[1, 15, 16, 30, 3, 31, 32]
what other species can be seen close to the shores of norfolk island
[1, 33, 34, 35, 36, 37, 38, 39, 16, 40, 3, 41, 42]
whit what donors does zinc form stable complexes
[43, 1, 44, 45, 46, 47, 48, 49]
how many companies did a judge say infringed on dr moustakas s prior blue light patent in 2015
[22, 10, 50, 5, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64]
when was peterson born
[65, 66, 67, 68]
in what country is butrint
[63, 1, 69, 15, 70]
what are british prime ministers part of that grants them the title right honourable
[1, 71, 72, 28, 73, 74, 3, 18, 75, 76, 16, 77, 78, 79]
in what pattern is data stor

In [20]:
test_text = ['Who assumed total control of Egypt?','what are british prime ministers part of that grants them the title right honourable']

In [23]:
for t in test_text:
    t = clean_str(t)
    print(t)
    print(trie.find(trie.getIdxs(t)))

who assumed total control of egypt
None
what are british prime ministers part of that grants them the title right honourable


TypeError: 'NoneType' object is not subscriptable

In [None]:
doc