In [29]:
from __future__ import division
from math import log,sqrt
import operator

# Hack to shut up deprecation warning wrt something in the stemmer
import sys, importlib
sys.modules['sklearn.externals.six'] = importlib.import_module('six')

from nltk.stem import *
from nltk.stem.porter import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from load_map import *
from asgn2 import *

from scipy.spatial.distance import cosine as weighted_cosine_dist

# Paths of the positive and negative sentiment lexicons.
file_positive = 'positive.txt'
file_negative = 'negative.txt'

# Read both lexicons as arrays of unicode strings.
positive_words = np.loadtxt(file_positive, dtype='U')
negative_words = np.loadtxt(file_negative, dtype='U')

# Stem every entry with tw_stemmer; distinct words can collapse onto the same
# stem, so the results are kept in sets to drop the duplicates.
positive_words = set(map(tw_stemmer, positive_words))
negative_words = set(map(tw_stemmer, negative_words))

# Translate the stems into word ids, skipping stems that word2wid does not know.
positive_word_ids = [word2wid[stem] for stem in positive_words if stem in word2wid]
negative_word_ids = [word2wid[stem] for stem in negative_words if stem in word2wid]

In [30]:
# Instantiate a module-level Porter stemmer (the PorterStemmer class itself is
# brought in by the nltk.stem star imports in the first cell).
STEMMER = PorterStemmer()

#Define the cosine similarity measure
def cos_sim_hpca(v0,v1):
    '''Compute the cosine similarity between two dense vectors.

    The inputs are converted with ``np.asarray``, so numpy arrays, lists and
    tuples of numbers are all accepted (they must have the same length).

    :type v0: array-like of float
    :type v1: array-like of float
    :param v0: first vector
    :param v1: second vector
    :rtype: float
    :return: cosine between v0 and v1; 0.0 when either vector has zero norm
    '''
    v0 = np.asarray(v0, dtype=float)
    v1 = np.asarray(v1, dtype=float)

    # Product of the two Euclidean norms, written as one square root.
    denom = np.sqrt(np.sum(v0**2) * np.sum(v1**2))
    if denom == 0:
        # The cosine is undefined for a zero vector; report "no similarity"
        # instead of letting 0/0 produce nan and a RuntimeWarning.
        return 0.0

    return np.dot(v0, v1) / denom

# Helper Functions

In [31]:
def read_counts(filename, wids):
    '''Reads the counts from file. It returns occurrence counts for all
    words, but to save memory it only returns cooccurrence counts for the
    words whose ids are listed in wids.

    Expected layout, as parsed below: the first line is the total number of
    observations; every following line is tab-separated, holding a word id,
    its occurrence count, and then zero or more "other_id count" entries
    (the two numbers separated by a single space). Blank lines are skipped.

    :type filename: string
    :type wids: iterable of int
    :param filename: where to read info from
    :param wids: the word ids whose cooccurrence counts should be kept
    :returns: occurence counts, cooccurence counts, and tot number of observations
    '''
    wanted = set(wids)  # constant-time membership test for every line read
    o_counts = {} # Occurence counts
    co_counts = {} # Cooccurence counts
    # The context manager closes the file even if a line fails to parse.
    with open(filename) as fp:
        N = float(next(fp))
        for line in fp:
            if not line.strip():
                continue
            fields = line.strip().split("\t")
            wid0 = int(fields[0])
            o_counts[wid0] = int(fields[1])
            if wid0 in wanted:
                neighbours = {}
                for entry in fields[2:]:
                    wid1, count = entry.split(" ")
                    neighbours[int(wid1)] = int(count)
                co_counts[wid0] = neighbours

    return (o_counts, co_counts, N)

def get_o_counts(filename):
    '''Reads only the occurrence counts from a counts file.

    The first line of the file (the total number of observations) is
    consumed and discarded. Every following non-blank line is split on tabs:
    the first field is a word id and the second its occurrence count; any
    further fields on the line are ignored.

    :type filename: string
    :param filename: where to read info from
    :rtype: dict
    :returns: occurence counts, keyed by word id
    '''
    o_counts = {} # Occurence counts
    # The context manager closes the file even if a line fails to parse.
    with open(filename) as fp:
        float(next(fp))  # header line: parsed only to skip (and sanity-check) it
        for line in fp:
            if not line.strip():
                continue
            fields = line.strip().split("\t")
            o_counts[int(fields[0])] = int(fields[1])

    return o_counts


def print_sorted_pairs(similarities, o_counts, first=0, last=100):
    '''Prints word pairs ranked from most to least similar.

    Each printed line holds the similarity score, the pair of words (looked
    up through wid2word) and the occurrence count of each word. Only the
    slice [first:last] of the ranking is printed; a negative first prints
    from that offset through to the end of the ranking.

    :type similarities: dict
    :type o_counts: dict
    :type first: int
    :type last: int
    :param similarities: the word id pairs (keys) with similarity scores (values)
    :param o_counts: the counts of each word id
    :param first: index to start printing from
    :param last: index to stop printing
    :return: none
    '''
    if first < 0:
        last = len(similarities)

    ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    for pair, score in ranked[first:last]:
        wid_a, wid_b = pair[0], pair[1]
        label = str((wid2word[wid_a], wid2word[wid_b]))
        print("{:.2f}\t{:30}\t{}\t{}".format(score, label, o_counts[wid_a], o_counts[wid_b]))

def freq_v_sim(sims, counts=None):
    '''Plots similarity against word frequency and prints their Spearman
    rank correlation.

    For every pair in sims the x value is the occurrence count of the rarer
    of the two words and the y value is the pair's similarity. The points
    are drawn on the current pyplot figure (cleared first) with a log-scaled
    x axis; the figure is not shown here.

    :type sims: dict
    :type counts: dict or None
    :param sims: the word id pairs (keys) with similarity scores (values)
    :param counts: the counts of each word id; when omitted, the module-level
        o_counts is used
    :return: none
    '''
    # Local import so the function does not depend on a star import
    # happening to bring spearmanr into the notebook namespace.
    from scipy.stats import spearmanr

    if counts is None:
        counts = o_counts

    xs = [min(counts[pair[0]], counts[pair[1]]) for pair in sims]
    ys = list(sims.values())

    plt.clf() # clear previous plots (if any)
    plt.xscale('log') #set x axis to log scale. Must do *before* creating plot
    plt.plot(xs, ys, 'k.') # create the scatter plot
    plt.xlabel('Min Freq')
    plt.ylabel('Similarity')
    print("Freq vs Similarity Spearman correlation = {:.2f}".format(spearmanr(xs,ys)[0]))
    #  plt.show() #display the set of plots

def make_pairs(items):
    '''Builds every ordered-ascending pair that can be formed from items.

    A pair (a, b) is emitted only when a < b, so the reversed pair (b, a)
    and self-pairs (a, a) never appear.

    :type items: list
    :param items: the list to pair up
    :return: list of pairs

    '''
    pairs = []
    for smaller in items:
        for larger in items:
            if smaller < larger:
                pairs.append((smaller, larger))
    return pairs

### Function to obtain HPCA vectors from word id

In [32]:
file_name = 'U_N=15000_K=200.npy'
U = np.load(file_name)
print(U.shape)

file_name = 'S_N=15000_K=200.npy'
S = np.load(file_name)
print(S.shape)
print(S)


#sort the words in the twitter dataset by frequency
o_counts = get_o_counts("/afs/inf.ed.ac.uk/group/teaching/anlp/lab8/counts")
sorted_o_counts = dict(sorted(o_counts.items(), key=operator.itemgetter(1),reverse=True))

# Frequency rank (0 = most frequent) of every word id, built once so that each
# lookup below is a dict access rather than rebuilding and scanning the key list.
wid_to_rank = {wid: rank for rank, wid in enumerate(sorted_o_counts)}

#find ranking of words
positive_words_ranks = [wid_to_rank[ID] for ID in positive_word_ids if ID in wid_to_rank]
negative_words_ranks = [wid_to_rank[ID] for ID in negative_word_ids if ID in wid_to_rank]

def wid2hpca_vec(wid):
    '''Returns the HPCA embedding of a word id, or None if there is none.

    The word's frequency rank is used as the row index into U.
    NOTE(review): this assumes the rows of U are ordered by descending word
    frequency - confirm against the code that produced the .npy files.

    :type wid: int
    :param wid: the word id to look up
    :return: row ``rank`` of U, or None (after printing a message) when the
        id has no occurrence count or its rank is beyond the rows of U
    '''
    rank = wid_to_rank.get(wid)
    if rank is None:
        print('Invalid ID: ', wid)
        return None

    if rank < U.shape[0]: #checking that the word is in the U matrix
        return U[rank,:] #return the embedding vector

    print('Rare word with ID: ', wid)
    return None
    
def create_hpca_vectors(wids):
    '''Looks up the HPCA embedding of every word id in wids.

    :type wids: iterable
    :param wids: the word ids to embed
    :rtype: dict
    :return: maps each word id to whatever wid2hpca_vec returns for it
    '''
    return {wid: wid2hpca_vec(wid) for wid in wids}

(15000, 200)
(200,)
[59.23344383 18.02161332 12.40352844 10.53883405  9.23316225  8.59590141
  8.21714143  7.23592476  6.51846968  6.42240474  5.45478696  5.17993906
  5.0902209   4.87143112  4.62890412  4.3660962   4.21239223  4.07724987
  3.76047     3.57847436  3.48636012  3.43120839  3.38955165  3.09019564
  3.00403568  2.97747017  2.93763361  2.87980145  2.79411418  2.70629938
  2.61784133  2.59807637  2.54585305  2.46848535  2.40942928  2.39809242
  2.31908218  2.29499994  2.28342287  2.15622733  2.12661859  2.08923084
  2.07182863  2.05475217  2.0065745   1.96285993  1.9450299   1.92592624
  1.87101424  1.84219514  1.81460442  1.80599726  1.7886071   1.72904761
  1.70643462  1.69022358  1.67830548  1.65846842  1.61214321  1.59778202
  1.59591548  1.57495838  1.56345064  1.54069811  1.53185726  1.52619551
  1.51180912  1.49699631  1.48844418  1.47232016  1.46508851  1.44917173
  1.42280127  1.41700712  1.40155454  1.39244696  1.38824712  1.3823981
  1.36163994  1.35392525  1.3500

In [42]:
# Words to compare; each is stemmed and mapped to its word id.
test_words = ["cat", "dog", "horse", "technology", "politics", "history"]
stemmed_words = list(map(tw_stemmer, test_words))
all_wids = {word2wid[stem] for stem in stemmed_words}

# All pairs of distinct word ids, built automatically
# (pairs could also be picked by hand instead).
wid_pairs = make_pairs(all_wids)

# HPCA embedding of every word id.
vectors = create_hpca_vectors(all_wids)

# Similarity = 1 - weighted cosine distance, with S passed as the weights.
c_sims = {
    pair: 1-weighted_cosine_dist(vectors[pair[0]], vectors[pair[1]], S)
    for pair in wid_pairs
}

# Print the pairs from most to least similar, with each word's count.
print_sorted_pairs(c_sims, o_counts)

0.25	('cat', 'dog')                	169733	287114
0.18	('dog', 'hors')               	287114	57011
0.15	('polit', 'technolog')        	77492	71947
0.14	('histori', 'technolog')      	128279	71947
0.13	('cat', 'hors')               	169733	57011
0.11	('cat', 'histori')            	169733	128279
0.00	('dog', 'histori')            	287114	128279
-0.03	('histori', 'polit')          	128279	77492
-0.05	('hors', 'histori')           	57011	128279
-0.06	('cat', 'technolog')          	169733	71947
-0.15	('hors', 'polit')             	57011	77492
-0.15	('dog', 'polit')              	287114	77492
-0.16	('cat', 'polit')              	169733	77492
-0.21	('dog', 'technolog')          	287114	71947
-0.22	('hors', 'technolog')         	57011	71947


In [43]:
# Words to compare; each is stemmed and mapped to its word id.
test_words = ["cat", "dog", "horse", "technology", "politics", "history"]
stemmed_words = list(map(tw_stemmer, test_words))
all_wids = {word2wid[stem] for stem in stemmed_words}

# All pairs of distinct word ids, built automatically
# (pairs could also be picked by hand instead).
wid_pairs = make_pairs(all_wids)

# HPCA embedding of every word id.
vectors = create_hpca_vectors(all_wids)

def np_cosine_sim(vec1, vec2):
    '''Unweighted cosine similarity between two dense vectors.

    Thin wrapper around cos_sim_hpca, which computes
    dot(vec1, vec2) / sqrt(sum(vec1**2) * sum(vec2**2)), so the notebook
    keeps a single definition of the measure.

    :param vec1: first vector
    :param vec2: second vector
    :return: cosine between vec1 and vec2
    '''
    return cos_sim_hpca(vec1, vec2)

# Unweighted cosine similarity for every pair under consideration.
c_sims = {
    pair: np_cosine_sim(vectors[pair[0]], vectors[pair[1]])
    for pair in wid_pairs
}

# Print the pairs from most to least similar, with each word's count.
print_sorted_pairs(c_sims, o_counts)

0.25	('cat', 'dog')                	169733	287114
0.07	('polit', 'technolog')        	77492	71947
0.07	('dog', 'hors')               	287114	57011
0.06	('histori', 'polit')          	128279	77492
0.06	('histori', 'technolog')      	128279	71947
0.02	('cat', 'histori')            	169733	128279
0.01	('cat', 'hors')               	169733	57011
-0.01	('cat', 'technolog')          	169733	71947
-0.01	('hors', 'polit')             	57011	77492
-0.04	('dog', 'histori')            	287114	128279
-0.05	('hors', 'histori')           	57011	128279
-0.06	('cat', 'polit')              	169733	77492
-0.10	('dog', 'polit')              	287114	77492
-0.12	('dog', 'technolog')          	287114	71947
-0.13	('hors', 'technolog')         	57011	71947
