In [None]:
import gensim
from nltk import word_tokenize
from nltk.tokenize import casual_tokenize
from nltk.corpus import stopwords
from pprint import pprint
import pickle
import re

In [None]:
#future work?
# Build a lookup from the first '|'-separated field of each line of
# 'english_contractions.txt' to the second field.
contractions = {}
with open('english_contractions.txt', 'r') as infile:
    for line in infile:
        entry = line.strip()
        if not entry:
            # A blank line has no '|' and would make the unpacking below raise ValueError.
            continue
        contraction, word = entry.split('|')
        contractions[contraction] = word
contractions

## Tokenizer methods

TODO:
- add a parameter to choose between the different tokenization methods
- add stemming or lemmatization

In [None]:
# English stop words from NLTK's stopwords corpus, held in a set;
# tokenize_question drops every token that is a member of it.
stoplist = set(stopwords.words('english'))
def is_english(word):
    """Return True when `word` can be encoded as ISO-8859-1, False otherwise.

    This is only an approximation of "is English": accented Latin characters
    that exist in ISO-8859-1 are accepted as well.
    """
    encodable = True
    try:
        word.encode('ISO-8859-1')
    except UnicodeEncodeError:
        encodable = False
    return encodable

def is_number(word):
    """Return a match object if `word` looks numeric, otherwise None.

    Three shapes count as numeric: anything starting with '0x', a token made
    only of digits and the characters '-', '.', ':', 'x', 'v', and a single
    word character followed by digits only.
    """
    number_like = re.compile(r'(^0x)|(^[0-9\-\.:xv]+$)|(^\w\d+$)')
    return number_like.search(word)

def text_from_number(word):
    """Match tokens made of digits followed by lowercase letters (e.g. '100km').

    Returns the match object, whose first group is the letter suffix, or None
    when `word` does not have that shape.
    """
    digits_then_letters = re.compile(r'^\d+([a-z]+)$')
    return digits_then_letters.search(word)

# Domain suffixes used to recognise URL-like tokens, which are kept in one piece.
_URL_SUFFIX_RE = re.compile(r'\.com?\b|\.in\b|\.org\b|\.be\b|\.xyz\b|\.net\b|\.us\b')
# Characters deleted wherever they occur inside a term.
_DROPPED_CHARS = str.maketrans('', '', "!#$%&'()*+^,")
# Runs of '+', '_', '-' or '.' at the very start or very end of a term.
# Written as a raw string: the non-raw form contained the invalid escapes '\-' and '\.'.
_EDGE_CHARS_RE = re.compile(r'^[+_\-\.]+|[+_\-\.]+$')

def remove_punctuation(word):
    """Split a token on '/' and strip punctuation from each resulting term.

    A token containing one of the recognised domain suffixes is treated as a
    URL: it is returned unsplit, with only a trailing '/' removed.

    Args:
        word: a single token (str).

    Returns:
        A list of strings; entries may be empty when a term consisted only of
        punctuation.
    """
    if _URL_SUFFIX_RE.search(word):
        return [re.sub(r'/$', '', word)]
    return [_EDGE_CHARS_RE.sub('', term.translate(_DROPPED_CHARS))
            for term in word.split('/')]

def tokenize_question(doc):
    """Turn a question string into a list of cleaned tokens.

    Steps, in order: NLTK word tokenization, removal of tokens found in
    `stoplist`, punctuation stripping / splitting on '/', removal of terms that
    are one character or shorter, number-like, or not ISO-8859-1 encodable,
    and finally replacement of digit-prefixed terms by their letter suffix.
    """
    tokens = []
    for raw in word_tokenize(doc):
        if raw in stoplist:
            continue
        for term in remove_punctuation(raw): #remove some /
            if len(term) <= 1 or is_number(term) or not is_english(term):
                continue
            suffix_match = text_from_number(term)
            tokens.append(suffix_match.groups()[0] if suffix_match else term)
    return tokens

## Reading questions and building vocabulary

In [None]:
#run Preprocess notebook first!
# pickle can execute arbitrary code while loading; only open files you produced yourself.
# NOTE(review): the evaluation section loads 'data/question_pairs.pkl' instead —
# confirm which file name the Preprocess notebook actually writes.
# The file must be opened in binary mode for pickle, and the with-block closes it afterwards.
with open('data/question_pairs.list.pkl', 'rb') as pairs_file:
    question_pairs = pickle.load(pairs_file)

In [None]:
documents = []
# Question text -> question id of its first occurrence, so a question that shows up in
# several pairs contributes to the vocabulary and the corpus only once.
doc2qid = {} #question to question id, to correct question ids (check corrected data file)
vocabulary = set()
qt_filename = 'data/questions_tokenized.txt'

with open(qt_filename, 'w') as qt_file:
    for pair in question_pairs:
        # Assumes the layout unpacked in the evaluation cells:
        # (pair_id, qid1, qid2, doc1, doc2, label) — confirm against the Preprocess notebook.
        qid1, qid2, doc1, doc2 = pair[1:5]

        token_doc1 = tokenize_question(doc1)
        token_doc2 = tokenize_question(doc2)

        # Four lines per pair: each raw question followed by its space-joined tokens.
        qt_file.write('{}\n{}\n{}\n{}\n'.format(doc1, ' '.join(token_doc1),
                                                doc2, ' '.join(token_doc2)))

        for doc, qid, token_doc in ((doc1, qid1, token_doc1), (doc2, qid2, token_doc2)):
            if doc not in doc2qid:
                doc2qid[doc] = qid
                vocabulary.update(token_doc)
                # NOTE(review): this tests the raw question text, not the token list, so a
                # question whose tokens are all filtered out still adds an empty document.
                if doc:
                    documents.append(token_doc)

# The with-block closes the pickle file even if dumping fails.
with open('data/lsa_vocabulary.set.pkl', 'wb') as pkl_file:
    pickle.dump(vocabulary, pkl_file)

### Display purposes

In [None]:
# Dump the vocabulary to a text file, one word per line (order follows set iteration).
with open('data/vocabulary.txt','w') as outfile:
    outfile.writelines(word+'\n' for word in vocabulary)

In [None]:
# Summary counts, then a small sorted sample rather than the full vocabulary,
# which would flood the cell output.
print('documents:', len(documents))
print('vocabulary:', len(vocabulary))
pprint(sorted(vocabulary)[:50])

## Save files!

In [None]:
# Build the gensim Dictionary from the tokenized questions collected in `documents`,
# save it to 'data/questions.dict' and print its summary representation.
dictionary = gensim.corpora.Dictionary(documents)
dictionary.save('data/questions.dict')
print(dictionary)

In [None]:
# Convert every tokenized question to its bag-of-words vector through the dictionary
# and serialize the resulting corpus to 'data/questions.mm'.
vectors = [dictionary.doc2bow(document) for document in documents]
gensim.corpora.MmCorpus.serialize('data/questions.mm', vectors)
# Show only the first 100 vectors.
print(vectors[:100])

### Example of usage

In [None]:
# Bag-of-words vector for a new question that is not part of the corpus.
# NOTE(review): the text is split on whitespace here instead of going through
# tokenize_question, unlike the other example in this notebook — confirm that is intended.
new_doc = "how can I stay motivated to learn a new language"
new_vec = dictionary.doc2bow(new_doc.split())
print(new_vec)

## Loading files

If the files were saved previously, they can be loaded from here.

In [None]:
# Reload the dictionary and the bag-of-words corpus from the paths written in the
# "Save files!" section.
dictionary = gensim.corpora.Dictionary.load('data/questions.dict')
corpus = gensim.corpora.MmCorpus('data/questions.mm')

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same corpus.
tfidf = gensim.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

### Saving files

In [None]:
# Persist the TF-IDF model so it can be reloaded without refitting.
tfidf.save('data/questions.tfidf')

In [None]:
# Fit and save one LSI model per dimensionality on the TF-IDF corpus.
# After the loop `lsa` refers to the last (300-topic) model.
for num_topics in (10, 100, 200, 300):
    lsa = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    lsa.save('data/questions.{}d.lsa'.format(num_topics))
#corpus_lsa = lsa[corpus_tfidf]

In [None]:
# Reload the saved TF-IDF model and the 10-topic LSI model from disk.
tfidf = gensim.models.TfidfModel.load('data/questions.tfidf')
lsa = gensim.models.LsiModel.load('data/questions.10d.lsa')

In [None]:
# pickle can execute arbitrary code while loading; only open files you produced yourself.
# NOTE(review): the vocabulary section loads 'data/question_pairs.list.pkl' instead —
# confirm which file name the Preprocess notebook actually writes.
# The with-block closes the file handle once the pairs are loaded.
with open('data/question_pairs.pkl','rb') as pairs_file:
    question_pairs = pickle.load(pairs_file)
lsa10 = gensim.models.LsiModel.load('data/questions.10d.lsa')

### Example

In [None]:
# Worked example: tokenize two questions, project both into the 10-topic LSI space
# and print their cosine similarity.
doc1 = "what is npt? how would signing the npt affect india?"
doc2 = "what is npt?"
print(doc1, doc2, sep='\n')

token_doc1, token_doc2 = tokenize_question(doc1), tokenize_question(doc2)
print(token_doc1, token_doc2, sep='\n')

doc1_vec, doc2_vec = (lsa10[dictionary.doc2bow(tokens)]
                      for tokens in (token_doc1, token_doc2))
print(doc1_vec, doc2_vec, sep='\n')

print(gensim.matutils.cossim(doc1_vec, doc2_vec))

## Evaluation

For each question pair, the cosine similarity between the two questions is computed and written to a file, to be evaluated later.

In [None]:
from time import gmtime, strftime
#just to debug time
#TODO: check magic commands

### Serial implementation (kinda slow)

In [None]:
# For every LSI dimensionality, write one 'pair_id|label|similarity' line per question
# pair to 'data/lsa_similarities_<ndim>d.txt'.
for ndim in [10, 100, 200, 300]:
    print('Evaluating {} dimensions'.format(ndim))
    filename = 'data/lsa_similarities_{}d.txt'.format(ndim)
    print('{} - Loading model...'.format(strftime('%H:%M:%S')))
    lsa = gensim.models.LsiModel.load('data/questions.{}d.lsa'.format(ndim))
    print('{} - Starting evaluation...'.format(strftime('%H:%M:%S')))

    # The with-block closes the output file even if tokenization or projection raises.
    with open(filename, 'w') as outfile:
        for pair_id, qid1, qid2, doc1, doc2, label in question_pairs:
            token_doc1 = tokenize_question(doc1)
            token_doc2 = tokenize_question(doc2)

            # Project both bag-of-words vectors into the LSI space before comparing them.
            doc1_vec = lsa[dictionary.doc2bow(token_doc1)]
            doc2_vec = lsa[dictionary.doc2bow(token_doc2)]
            similarity = gensim.matutils.cossim(doc1_vec, doc2_vec)

            outfile.write('{}|{}|{}\n'.format(pair_id, label, similarity))

    print('{} - Evaluation finished.'.format(strftime('%H:%M:%S')))

### Parallel implementation

TODO: finish this

In [None]:
import multiprocessing as mp

# NOTE: this still runs serially; `mp` is imported for the parallel version that the
# TODO above refers to. It writes one 'pair_id|label|similarity' line per question pair
# to 'data/lsa_similarities_<ndim>d.txt' for every LSI dimensionality.
for ndim in [10, 100, 200, 300]:
    print('Evaluating {} dimensions'.format(ndim))
    filename = 'data/lsa_similarities_{}d.txt'.format(ndim)
    print('{} - Loading model...'.format(strftime('%H:%M:%S')))
    lsa = gensim.models.LsiModel.load('data/questions.{}d.lsa'.format(ndim))
    print('{} - Starting evaluation...'.format(strftime('%H:%M:%S')))

    # The with-block closes the output file even if tokenization or projection raises.
    with open(filename, 'w') as outfile:
        for pair_id, qid1, qid2, doc1, doc2, label in question_pairs:
            token_doc1 = tokenize_question(doc1)
            token_doc2 = tokenize_question(doc2)

            doc1_vec = lsa[dictionary.doc2bow(token_doc1)]
            doc2_vec = lsa[dictionary.doc2bow(token_doc2)]
            similarity = gensim.matutils.cossim(doc1_vec, doc2_vec)

            # The write belongs inside the per-pair loop; outside it, only the last
            # pair of each dimensionality would reach the file.
            outfile.write('{}|{}|{}\n'.format(pair_id, label, similarity))

    print('{} - Evaluation finished.'.format(strftime('%H:%M:%S')))

## Results here