# Latent Semantic Analysis

Latent Semantic Analysis (LSA) was basically the top of the line as far as word vectors were concerned before the word2vec model [cite] was developed. The long and short of it is that you do a term/document matrix and do a Singular-Value Decomposition (SVD) [cite] on it. The result removes variation from the character representations of words. 

## Baseline

Right now it doesn't make sense to compare it to anything with a concrete minimum matching. But if we hit an F1 score of 60, then I will consider this a viable option for detecting variation sets.

In [28]:
import utterances
import evaluation
import sys
import difflib
import collections
import codecs
from math import log
from itertools import islice
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import varseta_accuracy_tester as vat

In [3]:
args = ("anch", 3, 2)

to_dos = [
        ("DATA/Swedish_MINGLE_dataset/plain/1", "DATA/Swedish_MINGLE_dataset/GOLD/1"),
        ("DATA/Swedish_MINGLE_dataset/plain/2", "DATA/Swedish_MINGLE_dataset/GOLD/2"),
        ("DATA/Swedish_MINGLE_dataset/plain/3", "DATA/Swedish_MINGLE_dataset/GOLD/3"),
        ("DATA/Swedish_MINGLE_dataset/plain/4", "DATA/Swedish_MINGLE_dataset/GOLD/4")]


### Reading in data

Things to note:
I'm using only the lowercase versions, considering the corpus size, this can change

In [9]:
all_utterances = []
for to_do in to_dos:
    print("Reading in: " + to_do[0])
    u = utterances.Utterances(to_do[0], to_do[1])
    gold_utterances = u._goldutterances

    utterances_reformatted = []
    ids = []

    for utterance in u._utterances:
        new_utt = utterance[2].split()
        new_utt = [i.lower() for i in new_utt]
        utterances_reformatted.append(new_utt)
        ids.append((utterance[0], utterance[1]))
        
    all_utterances = all_utterances + utterances_reformatted

Reading in: DATA/Swedish_MINGLE_dataset/plain/1
Reading in: DATA/Swedish_MINGLE_dataset/plain/2
Reading in: DATA/Swedish_MINGLE_dataset/plain/3
Reading in: DATA/Swedish_MINGLE_dataset/plain/4


## Tf-idf

The first step is to build a tf-idf matrix. For our purposes, each utterance will be a document. This may change down the line.

In [5]:
def _dummy_preprocessor(to_return):
    """This is a workaround for the TfidfVectorizer's tokenizer"""
    return to_return

In [7]:
tf_idf = TfidfVectorizer(analyzer='word',
                         tokenizer=_dummy_preprocessor,
                         preprocessor=_dummy_preprocessor,
                         token_pattern=None)

In [78]:
tf_idf_features = tf_idf.fit_transform(all_utterances)
tf_idf.get_feature_names()

[u's\xe5', u'!']


### Create the SVD

Note, we can put all this in a pipeline, but I think it's a little more explecit if we just go through each one.

In [16]:
lsa = TruncatedSVD(n_components=100, 
                   algorithm='randomized',
                   n_iter=10, random_state=69)

lsa.fit_transform(tf_idf_features)

array([[ 9.51364892e-01, -2.31301547e-01,  1.16082492e-01, ...,
         1.25349359e-03, -1.37946403e-03,  5.15523863e-04],
       [ 1.46198434e-01,  2.32317490e-01,  7.58477293e-03, ...,
         5.23653005e-03,  1.57735245e-03, -1.13509976e-03],
       [ 4.49652796e-02,  7.92418302e-02,  9.63618728e-03, ...,
         1.67115632e-03,  3.24197397e-02,  1.27179032e-02],
       ...,
       [ 3.59692288e-01,  1.90604597e-02, -8.16234413e-02, ...,
        -1.79658585e-02, -1.33032014e-02, -5.86318483e-03],
       [ 4.20428058e-02,  1.31244813e-01, -1.60543821e-01, ...,
        -1.80693766e-03, -1.69740224e-01, -2.24321818e-02],
       [ 5.82134770e-02,  2.08614156e-01, -2.59514807e-01, ...,
         2.21563892e-02, -1.54063723e-01, -4.38398196e-03]])

In [73]:
# yes
test_a = lsa.transform(tf_idf.transform(["ja"]))
print(test_a)

# no
test_b = lsa.transform(tf_idf.transform([u'n\xe4']))

# maybe (according to an online dictionary)
test_c = lsa.transform(tf_idf.transform([u'kanske']))

print(cosine_similarity(test_a, test_b))
print(cosine_similarity(test_a, test_c))
print(cosine_similarity(test_b, test_c))
print(cosine_similarity(test_b, test_b))

[[ 1.29011349e-03  2.52727211e-03 -6.34374583e-04  5.55326881e-03
  -2.39245892e-04 -1.48188405e-03 -8.16205523e-04  8.42298315e-04
  -5.04169499e-03 -2.30353940e-06 -2.36536407e-03 -2.89080734e-03
  -1.18691833e-03 -3.04637112e-03  2.09389575e-03 -2.18931325e-04
  -3.96857868e-03  1.26094287e-02  6.75262106e-03  3.08682846e-03
   3.75317972e-03 -2.40704326e-03 -1.56736199e-03  2.42793815e-03
   8.20006228e-04 -1.04234446e-03  8.62725497e-03  2.07789017e-03
   2.97357059e-03 -5.56152844e-03 -4.86702876e-03 -2.18053561e-03
  -8.87283736e-04 -7.30840720e-04 -1.10542776e-03  2.74007910e-04
  -4.39758640e-03 -2.51087103e-04 -1.23747322e-03 -1.52310575e-03
  -4.99929199e-03 -7.37718676e-03 -1.84183931e-03  3.44890793e-03
   3.63043851e-03 -2.83421161e-03 -2.31805620e-03 -2.04820630e-03
   2.62852920e-02 -6.32734432e-03  8.95271579e-04  1.41285686e-03
  -4.29371697e-03 -7.01372590e-04 -1.13206355e-03 -6.24236235e-03
  -3.60416703e-03  5.59606565e-04 -7.31797130e-03  2.91480704e-03
   1.49369

In [86]:
# yes
test_a = lsa.transform(tf_idf.transform([u'ja', u'ja']))
print(test_a)

# no
test_b = lsa.transform(tf_idf.transform([u'n\xe4', u'n\xe4']))

[[ 1.29011349e-03  2.52727211e-03 -6.34374583e-04  5.55326881e-03
  -2.39245892e-04 -1.48188405e-03 -8.16205523e-04  8.42298315e-04
  -5.04169499e-03 -2.30353940e-06 -2.36536407e-03 -2.89080734e-03
  -1.18691833e-03 -3.04637112e-03  2.09389575e-03 -2.18931325e-04
  -3.96857868e-03  1.26094287e-02  6.75262106e-03  3.08682846e-03
   3.75317972e-03 -2.40704326e-03 -1.56736199e-03  2.42793815e-03
   8.20006228e-04 -1.04234446e-03  8.62725497e-03  2.07789017e-03
   2.97357059e-03 -5.56152844e-03 -4.86702876e-03 -2.18053561e-03
  -8.87283736e-04 -7.30840720e-04 -1.10542776e-03  2.74007910e-04
  -4.39758640e-03 -2.51087103e-04 -1.23747322e-03 -1.52310575e-03
  -4.99929199e-03 -7.37718676e-03 -1.84183931e-03  3.44890793e-03
   3.63043851e-03 -2.83421161e-03 -2.31805620e-03 -2.04820630e-03
   2.62852920e-02 -6.32734432e-03  8.95271579e-04  1.41285686e-03
  -4.29371697e-03 -7.01372590e-04 -1.13206355e-03 -6.24236235e-03
  -3.60416703e-03  5.59606565e-04 -7.31797130e-03  2.91480704e-03
   1.49369

## Try it out

Now that we have our processor, we can start to take a look at the sentences.

In [39]:
args = ("anch", 3, 2)

In [98]:
def cosine_similarity_matcher(a_vectors, b_vectors, similarity, minimum_matches):
    matches = 0
    
    for vector_a in a_vectors:
        for vector_b in b_vectors:
            if cosine_similarity(vector_a.reshape(-1, 1), vector_b.reshape(-1, 1))[0][0] > similarity:
                matches += 1
                
    if matches >= minimum_matches:
        return True
                
    return False
                

In [88]:
def matches_anchor_lsa(it, minimum_matches, match_type, overlap, return_count=True, ids=None):
    """Returns varation set matches using anchor method"""

    matches = 0
    matches_list = []

    for count, i in enumerate(it):
        utterances = iter(i)
        first = next(utterances)
        first_vector = lsa.transform(tf_idf.transform(first))
        
        for utterance in utterances:
            utterance_vector = lsa.transform(tf_idf.transform(utterance))
            if cosine_similarity_matcher(first_vector, utterance_vector, overlap, args[2]):
                matches += 1
                if ids:
                    matches_list.append((ids[count], i))
                else:
                    matches_list.append(i)

    if return_count:
        return matches
    else:
        return matches_list

In [101]:
fuzzy_precisions, strict_precisions, fuzzy_recalls, strict_recalls,\
            fuzzy_f1s, strict_f1s = [], [], [], [], [], []

similarity = 0.7

for to_do in to_dos:
    print("Finding variation sets in" + to_do[0])
    u = utterances.Utterances(to_do[0], to_do[1])
    gold_utterances = u._goldutterances

    utterances_reformatted = []
    ids = []

    for utterance in u._utterances:
        new_utt = utterance[2].split()
        
        # lowered again
        new_utt = [i.lower() for i in new_utt]
        utterances_reformatted.append(new_utt)
        ids.append((utterance[0], utterance[1]))

    utt_iter = vat.window(utterances_reformatted, args[2])
    id_iter = vat.window(ids, args[2])
    ids = [i for i in id_iter]
    ids_and_matches = matches_anchor_lsa(utt_iter, args[2], None, similarity, return_count=False, ids=ids)
    combined = vat.convert_varseta_format(ids_and_matches)

    varseta_eval = evaluation.Evaluation(combined, gold_utterances)

    fuzzy_precisions.append(varseta_eval.fuzzy_precision)
    strict_precisions.append(varseta_eval.strict_precision)
    fuzzy_recalls.append(varseta_eval.fuzzy_recall)
    strict_recalls.append(varseta_eval.strict_recall)
    fuzzy_f1s.append(varseta_eval.fuzzy_f1)
    strict_f1s.append(varseta_eval.strict_f1)

    print('\tFuzzy Precision: {:0.2f}'.format(varseta_eval.fuzzy_precision))
    print('\tFuzzy Recall: {:0.2f}'.format(varseta_eval.fuzzy_recall))
    print('\tFuzzy F1: {:0.2f}'.format(varseta_eval.fuzzy_f1))
    print('')
    print('\tStrict Precision: {:0.2f}'.format(varseta_eval.strict_precision))
    print('\tStrict Recall: {:0.2f}'.format(varseta_eval.strict_recall))
    print('\tStrict F1: {:0.2f}'.format(varseta_eval.strict_f1))
    print('\n')

Finding variation sets inDATA/Swedish_MINGLE_dataset/plain/1
Strict match F-score = 	0.0
Fuzzy match F-score = 	0.00585930517244


AttributeError: Evaluation instance has no attribute 'fuzzy_precision'