In [1]:
import csv
from nltk.corpus import stopwords
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
import keras
from scipy import spatial

Using TensorFlow backend.


In [2]:
class CsvLoader(object):
    """Loads the 'Summary' column of a header-less CSV and fits a Keras Tokenizer on it.

    The CSV is read with the fixed column names Category, Title, Summary.
    The file is opened when the loader is constructed and is closed again as
    soon as getCodedWords() has finished reading it.
    """

    def __init__(self,filename):
        """Open `filename` for reading.

        Args:
            filename: Path of the CSV file to load.
        """
        # Remember the path so getCodedWords() can reopen the file on a repeat call.
        self.filename = filename
        # newline='' is what the csv module expects so it can handle line endings itself.
        self.file = open(filename, mode='r', newline='')

    def tokenizeCsv(self,sentences):
        """Fit a fresh Keras Tokenizer on `sentences` and return it.

        Args:
            sentences: List of strings to build the vocabulary from.

        Returns:
            The fitted Tokenizer.
        """
        t = Tokenizer()
        t.fit_on_texts(sentences)
        return t

    def getCodedWords(self):
        """Read every CSV row and return the fitted tokenizer plus the sentences.

        Each row's Summary is split by text_to_word_sequence and re-joined with
        a single space after every word (so non-empty sentences end in a space).
        No stop-word filtering is applied.

        Returns:
            A tuple (tokenizer, sentences): the Tokenizer fitted on all
            sentences, and the list of normalised sentence strings in file order.
        """
        # A previous call closes the handle; reopen so repeated calls return the
        # same corpus instead of reading from an exhausted/closed file.
        if self.file.closed:
            self.file = open(self.filename, mode='r', newline='')

        sentences = []
        # The with-block guarantees the handle is released once reading is done.
        with self.file:
            csv_reader = csv.DictReader(self.file,fieldnames=['Category','Title','Summary'])
            for row in csv_reader:
                # DictReader fills missing trailing columns with None; treat a
                # missing Summary as empty text rather than crashing.
                words = text_to_word_sequence(row['Summary'] or "")
                sentences.append("".join(w + " " for w in words))

        return self.tokenizeCsv(sentences), sentences

In [3]:
# Load the training CSV. `docs` is the list of normalised summary strings and
# `token` is the Tokenizer that the loader fitted on them.
csvTrain = CsvLoader('./smallTrain.csv')
token, docs = csvTrain.getCodedWords()

In [4]:
# Peek at the first four normalised summaries
print(docs[:4])

["reuters short sellers wall street's dwindling band of ultra cynics are seeing green again ", 'reuters private investment firm carlyle group which has a reputation for making well timed and occasionally controversial plays in the defense industry has quietly placed its bets on another part of the market ', 'reuters soaring crude prices plus worries about the economy and the outlook for earnings are expected to hang over the stock market next week during the depth of the summer doldrums ', 'reuters authorities have halted oil export flows from the main pipeline in southern iraq after intelligence showed a rebel militia could strike infrastructure an oil official said on saturday ']


In [5]:
# Create Tokenizer
# NOTE(review): getCodedWords() already returned a Tokenizer (`token`) fitted on
# these same docs, so this second one looks redundant — confirm before removing.
t = Tokenizer()
# Create unique indexes for words
t.fit_on_texts(docs)
# Vocab size (+1 presumably because Keras word indexes start at 1, leaving 0 for padding — confirm)
vSize = len(t.word_index) + 1
# Turn each document into a list of indexes that map to words
encoded_docs = t.texts_to_sequences(docs)

In [6]:
# Peek at the word-index sequences of the first four documents
print(encoded_docs[:4])

[[14, 522, 1632, 400, 2182, 1633, 1003, 4, 2183, 2184, 22, 1004, 1005, 256], [14, 707, 1634, 357, 3362, 222, 68, 30, 2, 3363, 8, 596, 460, 3364, 6, 3365, 1267, 3366, 5, 1, 827, 223, 30, 3367, 1635, 19, 3368, 7, 401, 358, 4, 1, 127], [14, 828, 325, 87, 461, 326, 40, 1, 195, 6, 1, 708, 8, 462, 22, 327, 3, 1268, 46, 1, 196, 127, 107, 72, 168, 1, 1006, 4, 1, 463, 1269], [14, 235, 33, 3369, 59, 3370, 2185, 21, 1, 709, 3371, 5, 710, 208, 44, 829, 1007, 2, 3372, 2186, 42, 328, 1008, 17, 59, 329, 24, 7, 209]]


In [7]:
# Knobs
# Sequence length passed to pad_sequences (maxlen) and to the Embedding layer (input_length)
MAX_SENTENCE_LENGTH = 20
# Embedding vocabulary size, taken from the fitted tokenizer
EMBED_INPUT_DIM = vSize
# Length of the embedding vector produced for each word position
EMBED_OUTPUT_DIM = 32

In [8]:
# Pad the docs to maintain input shape
# NOTE(review): with maxlen set, pad_sequences presumably also truncates longer docs
# (the Keras default truncating='pre' would drop their leading words) — confirm this is intended.
padded_docs = keras.preprocessing.sequence.pad_sequences(encoded_docs, maxlen=MAX_SENTENCE_LENGTH, padding='post')

# Create model
model = keras.Sequential()
# Embedding layer: maps each of the MAX_SENTENCE_LENGTH word indexes to an EMBED_OUTPUT_DIM vector
model.add(keras.layers.Embedding(EMBED_INPUT_DIM,EMBED_OUTPUT_DIM,input_length=MAX_SENTENCE_LENGTH))
# Compile only — this does NOT train the model. No cell shown here calls model.fit(),
# so the embeddings used below come from the layer's initial weights.
model.compile('rmsprop','mse')

In [9]:
def encode_docs(in_docs):
    """Turn padded word-index sequences into per-word embeddings.

    Args:
        in_docs: Padded index sequences to feed to the notebook-level `model`.

    Returns:
        Whatever `model.predict` returns for `in_docs`.
    """
    return model.predict(in_docs)

In [10]:
# Embed every padded document (change the input here to add docs), then key each
# embedding by its document text. Identical texts keep the last embedding seen.
embedded_docs_by_words = encode_docs(padded_docs)
doc_codes = dict(zip(docs, embedded_docs_by_words))

In [11]:
# Print the embedding of the first word position of the first document
print(doc_codes[docs[0]][0])

[-0.01026853 -0.01127439  0.03110701  0.03032035  0.04276967  0.02096802
 -0.02921199  0.02517955 -0.02183627  0.03298967 -0.00423329  0.04074765
 -0.01566428 -0.04449521  0.00276446 -0.03780801 -0.04217818  0.00611117
 -0.01618359  0.03942638  0.02700111  0.03197521  0.0487999  -0.04115957
  0.04092618 -0.00229242  0.0067974  -0.03362022 -0.00906729 -0.02366866
 -0.0288954   0.00842815]


In [12]:
# Turns an entire document into a single vector so it can be compared to a query embedding
def minimize_doc_embedding(doc_dict):
    """Mean-pool each document's per-word embeddings into one vector.

    The number of word positions and the embedding width are taken from the
    data itself rather than from notebook-level constants, so the result stays
    correct even if those constants are edited after the embeddings were made.

    Args:
        doc_dict: Mapping of document key -> 2-D sequence indexed as
            [word_position][dimension] (nested lists or an array).

    Returns:
        Dict with the same keys; each value is a list of Python floats holding
        the per-dimension average over all word positions. Every row is
        included in the average. A document with no rows maps to [].
    """
    new_dict = {}
    for key, word_vectors in doc_dict.items():
        n_rows = float(len(word_vectors))
        # zip(*rows) walks the matrix column by column, i.e. one embedding
        # dimension at a time; it yields nothing for an empty document, so the
        # division below is never reached with n_rows == 0.
        new_dict[key] = [float(sum(column)) / n_rows for column in zip(*word_vectors)]
    return new_dict

In [13]:
# Collapse every document's per-word embeddings into a single averaged vector
min_doc_codes = minimize_doc_embedding(doc_codes)

In [14]:
# Show the averaged vector of the first document
print(min_doc_codes[docs[0]])

[0.002910548448562622, 0.006346642877906561, 0.0018008933402597904, -0.001895968709141016, -0.0011425351724028588, 0.004518786631524563, -0.015242862701416015, 0.010690431483089924, 0.013549551274627447, 0.002522165235131979, 0.014493837021291256, 0.0167920027859509, -0.008100176695734262, 0.0021769657731056215, 0.0025395315140485765, -0.0002859458327293396, -0.016686940658837558, 0.007020661421120167, 0.004684101603925228, 0.006036215927451849, -0.010053078923374415, 0.009036272112280131, 0.010586696024984122, -0.009656254947185517, -0.007826289162039757, -0.00018333010375499726, 0.0060262314043939115, 0.008215367514640092, 0.006242496147751808, 0.006888321042060852, -0.022521489672362803, -0.006254567671567202]


In [15]:
def indexQuery(model,que,t):
    """Turn raw query strings into padded sequences of word indexes.

    Args:
        model: Not used by this function.
        que: Iterable of query strings.
        t: Tokenizer whose texts_to_sequences() maps words to indexes.

    Returns:
        The pad_sequences result for the queries, using
        maxlen=MAX_SENTENCE_LENGTH and padding='post'.
    """
    # Normalise each query the same way the documents were: split into words,
    # then re-join with one space after every word.
    normalised = [
        "".join(word + " " for word in text_to_word_sequence(raw_query))
        for raw_query in que
    ]
    index_sequences = t.texts_to_sequences(normalised)
    return keras.preprocessing.sequence.pad_sequences(
        index_sequences, maxlen=MAX_SENTENCE_LENGTH, padding='post')

In [16]:
# Queries to match against the documents
qList = ["retail sales bounced back claims jobless benefits fell week economy is improving slump"]
# Embed query: index + pad it with the document tokenizer, then run it through the same model
# NOTE(review): query words missing from the tokenizer's vocabulary are presumably dropped
# by texts_to_sequences (no oov_token is configured) — confirm.
padQ = indexQuery(model,qList,t)
coded_qs = model.predict(padQ)

In [17]:
# Map each query string to its per-word embeddings
q_dict = dict(zip(qList, coded_qs))

In [18]:
def minimize_query_embedding(q_dict):
    """Mean-pool each query's per-word embeddings into one vector.

    Performs the same pooling as minimize_doc_embedding, applied to queries.
    Sizes are read from the data itself instead of notebook-level constants,
    so the result does not depend on those constants still matching the
    embeddings that were computed earlier.

    Args:
        q_dict: Mapping of query string -> 2-D sequence indexed as
            [word_position][dimension] (nested lists or an array).

    Returns:
        Dict with the same keys; each value is a list of Python floats holding
        the per-dimension average over all word positions. A query with no
        rows maps to [].
    """
    new_dict = {}
    for q, word_vectors in q_dict.items():
        n_rows = float(len(word_vectors))
        # zip(*rows) yields one tuple per embedding dimension; for an empty
        # query it yields nothing, so no division by zero can occur.
        new_dict[q] = [float(sum(column)) / n_rows for column in zip(*word_vectors)]
    return new_dict

In [19]:
# Create query embedding: one averaged vector per query
min_q_dict = minimize_query_embedding(q_dict)

In [20]:
def get_most_sim(q_sim_dict,d_sim_dict):
    """Print, for every query, the document with the smallest cosine distance.

    The comparison uses scipy's cosine distance, so the smallest distance wins.
    On ties the first document encountered is kept. If no document beats the
    initial bound, "None" is reported.

    Args:
        q_sim_dict: Mapping of query -> 1-D vector.
        d_sim_dict: Mapping of document -> 1-D vector of the same length.

    Returns:
        None; the result is written with print().
    """
    for query, query_vec in q_sim_dict.items():
        closest_doc = "None"
        closest_dist = 100000000
        for doc, doc_vec in d_sim_dict.items():
            dist = abs(spatial.distance.cosine(query_vec, doc_vec))
            # Strict comparison: an equal distance does not replace the current best
            if dist < closest_dist:
                closest_dist, closest_doc = dist, doc
        print('Most similar to query:',query,"is document:",closest_doc)

In [21]:
# Report the closest document for each query
get_most_sim(min_q_dict,min_doc_codes)

Most similar to query: retail sales bounced back claims jobless benefits fell week economy is improving slump is document: celebrity fashion is booming these webpreneurs are bringing it to main street 
