In [None]:
import gensim
from nltk import word_tokenize
from nltk.tokenize import casual_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from pprint import pprint
import pickle
import re

In [None]:
# Future work? Load a contraction -> expansion lookup from a '|'-separated file
# (one "contraction|expansion" entry per line) and display it.
with open('english_contractions.txt', 'r') as infile:
    contractions = dict(entry.strip().split('|') for entry in infile)
contractions

## Tokenizer methods

TODO:
- add a parameter to select among the different tokenizer methods
- add stemming or lemmatization

In [None]:
# English stopwords from NLTK, kept in a set for the membership test done in tokenize().
stoplist = set(stopwords.words('english'))
def is_english(word):
    """Return True when `word` can be encoded as ISO-8859-1, otherwise False.

    This is only an approximation of "is English": accented Latin characters
    are encodable as well, so words containing them are accepted too.
    """
    encodable = True
    try:
        word.encode('ISO-8859-1')
    except UnicodeEncodeError:
        encodable = False
    return encodable

def is_number(word):
    """Return a match object if `word` looks numeric, otherwise None.

    A word is treated as a number when any of these holds:
      * it starts with '0x' (hexadecimal literal),
      * it consists only of digits and the characters '-', '.', ':', 'x', 'v',
      * it is a single word character followed by one or more digits (e.g. 'a12').
    """
    numeric_pattern = re.compile(r'(^0x)|(^[0-9\-\.:xv]+$)|(^\w\d+$)')
    return numeric_pattern.search(word)

def text_from_number(word):
    """Match words made of digits followed by lowercase letters (e.g. '10th').

    Returns the match object, whose first group holds the trailing letters,
    or None when `word` does not have that shape.
    """
    digits_then_letters = re.compile(r'^\d+([a-z]+)$')
    return digits_then_letters.search(word)

def remove_punctuation(word):
    """Strip punctuation from a token, splitting it on '/'.

    Tokens that look like a domain name (they contain '.co', '.com', '.in',
    '.org', '.be', '.xyz', '.net' or '.us' ending at a word boundary) are kept
    whole; only leading '/', "'" and '+' characters and one trailing '/' are
    removed from them.

    Any other token is split on '/', and for each piece:
      * the characters !#$%&'()*+^, are removed wherever they occur,
      * leading and trailing runs of '+', '_', '-' and '.' are stripped,
      * a leading '¿' directly followed by a letter or digit is dropped.
    Pieces that end up empty are discarded.

    Returns:
        list of str: the cleaned pieces (possibly empty).
    """
    if re.search(r'\.com?\b|\.in\b|\.org\b|\.be\b|\.xyz\b|\.net\b|\.us\b', word):
        return [re.sub(r'^[/\'\+]+|/$', '', word)]

    # Built once per call rather than once per '/'-separated piece.
    drop_table = str.maketrans('', '', "!#$%&'()*+^,")
    # Raw string: the previous "+_\-\." literal relied on invalid escape
    # sequences ("\-" and "\."), which newer Python versions warn about.
    edge_pattern = r'^[+_\-\.]+|[+_\-\.]+$'

    new_words = []
    for term in word.split('/'):
        new_word = term.translate(drop_table)
        new_word = re.sub(edge_pattern, '', new_word)
        new_word = re.sub(r'^¿([^\W_])', r'\1', new_word)  # drop a leading '¿'
        if new_word:
            new_words.append(new_word)
    return new_words

def tokenize(doc, remove_stopwords=True):
    """Split `doc` into a list of cleaned tokens.

    With `remove_stopwords` (the default) the text is split with
    casual_tokenize, each token is stripped of surrounding whitespace and
    tokens present in `stoplist` are dropped. Otherwise word_tokenize is used
    and nothing is dropped at this stage.

    Every token then goes through remove_punctuation(); resulting pieces are
    kept only if they are longer than one character, are not matched by
    is_number() and pass is_english(). Pieces shaped like digits followed by
    lowercase letters are reduced to the letters.
    """
    if remove_stopwords:
        candidates = []
        for raw in casual_tokenize(doc):
            stripped = raw.strip()
            if stripped not in stoplist:
                candidates.append(stripped)
    else:
        candidates = list(word_tokenize(doc))

    tokens = []
    for candidate in candidates:
        for piece in remove_punctuation(candidate):
            if len(piece) <= 1 or is_number(piece) or not is_english(piece):
                continue
            suffix = text_from_number(piece)
            tokens.append(suffix.groups()[0] if suffix else piece)
    return tokens

In [None]:
# Run the Preprocess notebook first: this cell expects data/question_pairs.list.pkl to exist.
# NOTE: pickle.load can execute arbitrary code; only load files you generated yourself.
# The context manager closes the file handle, which the bare open() call never did.
with open('data/question_pairs.list.pkl', 'rb') as pairs_file:
    question_pairs = pickle.load(pairs_file)

## Reading questions and building vocabulary

In [None]:
#read unique questions from pairs
def process_questions(output_filename):
    """Tokenize every question in `question_pairs` and collect the distinct ones.

    For each pair, four lines are written to `output_filename`: the first raw
    question, its space-joined tokens, the second raw question and its
    space-joined tokens. Repeated questions are written every time they occur.

    Args:
        output_filename: path of the text file to (over)write.

    Returns:
        (questions, vocabulary): `questions` is a list of token lists, one per
        distinct question text that produced at least one token, in first-seen
        order; `vocabulary` is the set of tokens of all distinct questions.
    """
    questions = []
    seen_questions = set()  # raw question texts already processed
    vocabulary = set()

    # Explicit UTF-8: raw questions are written unchanged, so do not depend on
    # the platform's default encoding being able to represent them.
    with open(output_filename, 'w', encoding='utf-8') as qt_file:
        for pair in question_pairs:
            question1, question2 = pair[3:5]

            token_q1 = tokenize(question1)
            token_q2 = tokenize(question2)

            qt_file.write('{}\n{}\n{}\n{}\n'.format(question1, ' '.join(token_q1),
                                                    question2, ' '.join(token_q2)))

            for question, tokens in ((question1, token_q1), (question2, token_q2)):
                if question in seen_questions:
                    continue
                seen_questions.add(question)
                vocabulary.update(tokens)
                if tokens:  # questions with no surviving tokens are not kept
                    questions.append(tokens)

    return questions, vocabulary

%time questions, vocabulary = process_questions('data/questions_tokenized.txt')

pkl_file = open('data/lsa_vocabulary.set.pkl', 'wb')
pickle.dump(vocabulary, pkl_file)
pkl_file.close()

### Display purposes

In [None]:
# Human-readable dump of the vocabulary, one word per line. The words are sorted
# so the file is the same on every run (set iteration order is not stable across
# interpreter sessions), and the encoding is explicit rather than platform-dependent.
with open('data/vocabulary.txt', 'w', encoding='utf-8') as outfile:
    outfile.writelines('{}\n'.format(word) for word in sorted(vocabulary))

In [None]:
# Size summary: number of token lists kept and number of distinct tokens.
for caption, collection in (('questions:', questions), ('vocabulary:', vocabulary)):
    print(caption, len(collection))

## Save files!

In [None]:
# Build the gensim dictionary from the tokenized questions, save it to disk and show its summary.
dictionary = gensim.corpora.Dictionary(questions)
dictionary.save('data/questions.dict')
print(dictionary)

In [None]:
# Convert every tokenized question to its bag-of-words vector, write the whole
# corpus to data/questions.mm and preview the first 50 vectors.
questionsvectors = list(map(dictionary.doc2bow, questions))
gensim.corpora.MmCorpus.serialize('data/questions.mm', questionsvectors)
print(questionsvectors[:50])

### Example of usage

In [None]:
# Vectorize a new sentence with the dictionary built above.
# NOTE(review): this splits on whitespace instead of calling tokenize(), unlike
# the questions the dictionary was built from — confirm that is intended.
new_doc = "how can I stay motivated to learn a new language"
new_vec = dictionary.doc2bow(new_doc.split())
print(new_vec)

## Loading files

If the files were saved previously, they can be loaded from here instead of being rebuilt.

In [None]:
# Reload the dictionary and the bag-of-words corpus written by the "Save files!" cells.
dictionary = gensim.corpora.Dictionary.load('data/questions.dict')
corpus = gensim.corpora.MmCorpus('data/questions.mm')

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus, then wrap the corpus with it;
# corpus_tfidf is what the LSI models below are trained on.
tfidf = gensim.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

### Saving files

In [None]:
# Persist the fitted TF-IDF model so it can be reloaded without refitting.
tfidf.save('data/questions.tfidf')

In [None]:
%time lsa = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
lsa.save('data/questions.10d.lsa')
%time lsa = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=100)
lsa.save('data/questions.100d.lsa')
%time lsa = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)
lsa.save('data/questions.200d.lsa')
%time lsa = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
lsa.save('data/questions.300d.lsa')

In [None]:
# Reload the saved TF-IDF model and the 10-dimensional LSI model from disk.
tfidf = gensim.models.TfidfModel.load('data/questions.tfidf')
lsa = gensim.models.LsiModel.load('data/questions.10d.lsa')

### Example

In [None]:
# Compare two example questions in the 10-dimensional LSI space.
dictionary = gensim.corpora.Dictionary.load('data/questions.dict')
tfidf = gensim.models.TfidfModel.load('data/questions.tfidf')
lsa10 = gensim.models.LsiModel.load('data/questions.10d.lsa')
doc1 = "what is npt? how would signing the npt affect india?"
doc2 = "what is npt?"
print(doc1)
print(doc2)
token_doc1 = tokenize(doc1)
token_doc2 = tokenize(doc2)
print(token_doc1)
print(token_doc2)
# The LSI models were trained on TF-IDF vectors, so a query has to be TF-IDF
# weighted before it is projected; raw counts live in a different input space.
doc1_vec = lsa10[tfidf[dictionary.doc2bow(token_doc1)]]
doc2_vec = lsa10[tfidf[dictionary.doc2bow(token_doc2)]]
print(doc1_vec)
print(doc2_vec)
print(gensim.matutils.cossim(doc1_vec, doc2_vec))

## Evaluation

For each question pair, the cosine similarity between the two questions is computed and written to a file, which is evaluated later.

In [None]:
from time import gmtime, strftime
#just to debug time

In [None]:
# For each LSI dimensionality, score every question pair and write one
# "pair_id|label|similarity" line per pair to data/lsa_similarities_<ndim>d.txt.
for ndim in [10, 100, 200, 300]:
    print('Evaluating {} dimensions'.format(ndim))
    filename = 'data/lsa_similarities_{}d.txt'.format(ndim)
    print('{} - Loading model...'.format(strftime('%H:%M:%S')))
    lsa = gensim.models.LsiModel.load('data/questions.{}d.lsa'.format(ndim))
    print('{} - Starting evaluation...'.format(strftime('%H:%M:%S')))

    # The context manager closes the output file even if a pair fails midway.
    with open(filename, 'w') as outfile:
        for pair in question_pairs:
            pair_id, qid1, qid2, doc1, doc2, label = pair
            token_doc1 = tokenize(doc1)
            token_doc2 = tokenize(doc2)

            # The LSI models were trained on TF-IDF vectors (corpus_tfidf), so the
            # queries are TF-IDF weighted with `tfidf` before being projected.
            doc1_vec = lsa[tfidf[dictionary.doc2bow(token_doc1)]]
            doc2_vec = lsa[tfidf[dictionary.doc2bow(token_doc2)]]
            similarity = gensim.matutils.cossim(doc1_vec, doc2_vec)

            outfile.write('{}|{}|{}\n'.format(pair_id, label, similarity))

    print('{} - Evaluation finished.'.format(strftime('%H:%M:%S')))

## Results here

In [None]:
# Pairs whose cosine similarity reaches this value get predicted label 1.
SIMILARITY_THRESHOLD = 0.7


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# For each dimensionality: read the similarity file, build the confusion matrix,
# dump per-pair details to data/lsa_<ndim>d_values.txt and append the summary
# (confusion matrix, accuracy, precision, recall) to data/lsa_results.txt.
with open('data/lsa_results.txt', 'w') as outfile:
    for ndim in [10, 100, 200, 300]:
        filename = 'data/lsa_similarities_{}d.txt'.format(ndim)

        # Both files are closed on exit; the values file previously stayed open.
        # UTF-8 is explicit because the raw questions are written unchanged.
        with open(filename) as infile, \
                open('data/lsa_{}d_values.txt'.format(ndim), 'w', encoding='utf-8') as values_file:
            tp = fp = tn = fn = 0
            for line in infile:
                pair_id, label, similarity = line.split('|')
                pair_id = int(pair_id)
                label = int(label)
                similarity = float(similarity)

                predicted_label = 0 if similarity < SIMILARITY_THRESHOLD else 1

                if label == 1: #positive
                    if label == predicted_label: #true positive
                        tp += 1
                    else: #false negative
                        fn += 1
                else: #negatives
                    if label == predicted_label: #true negative
                        tn += 1
                    else: #false positive
                        fp += 1

                # NOTE(review): assumes pair_id equals the pair's position in
                # question_pairs — confirm against the Preprocess notebook.
                q1 = question_pairs[pair_id][3]
                q2 = question_pairs[pair_id][4]
                values_file.write('{} - {} - {} - {}\n{}\n{}\n==================\n'.format(
                    pair_id, label, predicted_label, similarity, q1, q2))

        wstr = '\nLSA - {} dimensions\n'.format(ndim) +\
               '\t\t\tpredicted_no\t\tpredicted_yes\n'+\
               'actual_no\t\t    {}\t\t    {}\n'.format(tn, fp)+\
               'actual_yes\t\t    {}\t\t    {}\n\n'.format(fn, tp)
        outfile.write(wstr)

        # Guarded divisions: an empty file or no predicted/actual positives
        # would otherwise raise ZeroDivisionError.
        accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
        precision = _safe_ratio(tp, tp + fp)
        recall = _safe_ratio(tp, tp + fn)

        # Positional indexes 0/1/2: the previous string used {0} three times and
        # therefore printed the accuracy as precision and recall too.
        outfile.write('accuracy: {0:.3f}\nprecision: {1:.3f}\nrecall: {2:.3f}\n'.format(
            accuracy, precision, recall))