Skip to content

Commit

Permalink
adding stability code
Browse files Browse the repository at this point in the history
  • Loading branch information
Pia Sommerauer authored and Pia Sommerauer committed Jun 3, 2019
1 parent 3107792 commit 46e7f40
Show file tree
Hide file tree
Showing 85 changed files with 43,860 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -9,5 +9,6 @@ evaluation/category_graphs/*
evaluation/wordnet_distinction_level/*
data/*.txt
!data/toy*.txt
neighbor_stability/models/*


Binary file added neighbor_stability/.DS_Store
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
name = "dsmtoolkit"
320 changes: 320 additions & 0 deletions neighbor_stability/dsmtoolkit/build/lib/dsmtoolkit/load_models.py
@@ -0,0 +1,320 @@
from gensim.models import KeyedVectors
import numpy as np
from scipy.sparse import dok_matrix, csr_matrix
import heapq
import math



def load_word2vec_model(path):
    """Load a binary word2vec model from *path*.

    :param path: path to a binary word2vec file
    :return: a ('w2v', KeyedVectors) tuple used as the model handle
        throughout this module
    """
    vectors = KeyedVectors.load_word2vec_format(path, binary=True)
    return ('w2v', vectors)




def load_hyperwords_vocab(path):
    """Read a hyperwords vocab file (one word per line).

    Blank lines are skipped. The previous ``len(line) > 0`` check never
    filtered anything, because lines still carry their trailing newline,
    so empty lines ended up as '' entries in the vocabulary.

    :param path: path to the .vocab file
    :return: (word -> index dict, list of words)
    """
    with open(path) as infile:
        vocab = [line.strip() for line in infile if line.strip()]

    wi_dict = {w: i for i, w in enumerate(vocab)}

    return wi_dict, vocab



def load_hyperwords_model(path, mtype):
    """Load a hyperwords-style model from disk.

    :param path: path prefix of the model files (suffixes are appended)
    :param mtype: one of 'sgns', 'svd' or 'ppmi'
    :return: ('hyper', matrix, word->index dict, vocab list) tuple
    :raises ValueError: for an unknown *mtype* (previously this fell
        through and crashed with UnboundLocalError on `matrix`)
    """
    if mtype == 'sgns':
        matrix = np.load(path + '.words.npy')
        wi_dict, vocab = load_hyperwords_vocab(path + '.words.vocab')

    elif mtype == 'svd':
        matrix = np.load(path + '.ut.npy')
        wi_dict, vocab = load_hyperwords_vocab(path + '.words.vocab')
        # Hyperwords stores U transposed; undo that so rows are word vectors.
        matrix = matrix.T

    elif mtype == 'ppmi':
        # Code adapted from hyperwords/representations/matrix_serializer.py:
        # the sparse PPMI matrix is stored as raw CSR components.
        loader = np.load(path + '.npz')
        matrix = csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                            shape=loader['shape'])
        wi_dict, vocab = load_hyperwords_vocab(path + '.words.vocab')

    else:
        raise ValueError(f"Unknown model type: {mtype!r} (expected 'sgns', 'svd' or 'ppmi')")

    return ('hyper', matrix, wi_dict, vocab)


def load_model(path, mtype):
    """Dispatch to the appropriate loader.

    :param path: model path (or path prefix for hyperwords models)
    :param mtype: 'w2v' for word2vec; anything else is treated as a
        hyperwords model type ('sgns', 'svd', 'ppmi')
    :return: a model handle tuple
    """
    if mtype == 'w2v':
        return load_word2vec_model(path)
    return load_hyperwords_model(path, mtype)


def represent_word2vec(model, word):
    """Return the vector for *word* from a ('w2v', KeyedVectors) model."""
    _, vectors = model
    return vectors[word]



def represent_hyperwords(model, word):
    """Return the row vector for *word* from a hyperwords model tuple.

    :param model: ('hyper', matrix, word->index dict, vocab list)
    :param word: word to look up
    :return: the matrix row for *word*
    :raises KeyError: if *word* is not in the vocabulary. (The old code
        crashed with UnboundLocalError in that case; callers such as
        represent() are expected to check membership first.)
    """
    _, matrix, wi_dict, vocab = model

    if word not in wi_dict:
        raise KeyError(f'{word!r} not in model vocabulary')

    return matrix[wi_dict[word]]



def represent(model, word):
    """Look up *word* in *model* and return its vector, or 'OOV' if absent.

    Right now: no normalization. We might want to experiment with it though.

    :param model: a model handle tuple from load_model()
    :param word: word to look up
    :return: a vector, or the sentinel string 'OOV'
    """
    model_type = model[0]

    if model_type == 'w2v':
        if word in model[1].vocab:
            return represent_word2vec(model, word)
    elif model_type == 'hyper':
        # last element of the hyperwords tuple is the vocab list
        if word in model[-1]:
            return represent_hyperwords(model, word)

    # unknown model type or out-of-vocabulary word
    return 'OOV'

def normalize_matrix(matrix):
    """L2-normalize each row of *matrix* (necessary for nearest neighbors).

    Supports both dense numpy arrays (sgns/svd models) and scipy sparse
    matrices (ppmi models). Rows with zero norm produce inf/nan entries,
    matching the original formulation.

    :param matrix: 2-D np.ndarray or scipy sparse matrix
    :return: row-normalized matrix of the same kind
    """
    # isinstance instead of `type(...) ==` (idiomatic, handles subclasses)
    if isinstance(matrix, np.ndarray):
        # dense (sgns and svd): divide each row by its L2 norm
        norms = np.sqrt(np.sum(matrix * matrix, axis=1))
        return matrix / norms[:, np.newaxis]

    # sparse: scale rows via a diagonal matrix of reciprocal norms
    squared = matrix.copy()
    squared.data **= 2
    recip = np.reciprocal(np.sqrt(np.array(squared.sum(axis=1))[:, 0]))
    scaler = dok_matrix((len(recip), len(recip)))
    scaler.setdiag(recip)
    return scaler.tocsr().dot(matrix)
# Nearest neighbors

def normalize_vector(vec):
    """Return the unit (L2-normalized) version of *vec*.

    Input is a vector resulting from a calculation (e.g. a mean of
    vectors), so it is not assumed to be normalized already. Handles
    both dense numpy vectors and scipy sparse row vectors.

    :param vec: 1-D np.ndarray, or a sparse row vector
    :return: unit vector of the same kind
    """
    if isinstance(vec, np.ndarray):
        # Vectorized replacement for the old per-element Python loop.
        magnitude = math.sqrt(float(np.dot(vec, vec)))
        return vec / magnitude

    # sparse row vector: scale by reciprocal of its L2 norm
    squared = vec.copy().power(2)
    recip = np.reciprocal(np.sqrt(np.array(squared.sum(axis=1))[:, 0]))
    scaler = dok_matrix((len(recip), len(recip)))
    scaler.setdiag(recip)
    return scaler.tocsr().dot(vec)



def get_nearest_neighbors(model, vec, n):
    """Return the n nearest neighbors of *vec* as (cosine, word) pairs.

    :param model: model handle tuple from load_model()
    :param vec: query vector (dense for w2v/sgns/svd, sparse row for ppmi)
    :param n: number of neighbors to return
    :return: list of (cosine, word) tuples
    """

    model_type = model[0]
    matrix = model[1]

    ## Vector must always be normalized (results from a mean of
    # not normalized vecs)

    vec_norm = normalize_vector(vec)

    if model_type == 'w2v':

        # Word2vec code automatically normalized vectors in model

        most_similar = matrix.similar_by_vector(vec_norm, topn = n)
        # reorder gensim's (word, cos) into this module's (cos, word) convention
        most_similar = [(cos, word) for word, cos in most_similar]


    elif model_type == 'hyper':

        # wi (word->index) is unpacked for symmetry but unused here
        wi, iw = model[2:]

        # normalize vectors:
        matrix_norm = normalize_matrix(matrix)


        if type(matrix) == np.ndarray:

            # dense: cosine of normalized rows against the normalized query
            scores = matrix_norm.dot(vec_norm)
            most_similar = heapq.nlargest(n, zip(scores, iw))

        elif type(matrix) != np.ndarray:

            # sparse (ppmi): result is a sparse row; only words with a
            # nonzero score appear in scores.indices
            scores = matrix_norm.dot(vec_norm.T).T.tocsr()
            most_similar = heapq.nlargest(n, zip(scores.data, [iw[i] for i in scores.indices]))

    # NOTE(review): an unrecognized model_type reaches this return with
    # most_similar unbound and raises UnboundLocalError — confirm callers
    # only pass 'w2v' or 'hyper' models.
    return most_similar


def get_cosine(vec1, vec2):
    """Cosine similarity of *vec1* and *vec2* (dense vectors)."""
    unit1 = normalize_vector(vec1)
    unit2 = normalize_vector(vec2)
    return np.dot(unit1, unit2)


def find_words_distance(model, cos, n):
    """Find up to *n* word pairs whose cosine similarity matches *cos*.

    Matching compares similarities rounded to 2 decimals. Scans all
    alphabetic words in the vocabulary (O(|vocab|^2) cosine evaluations,
    so this is slow on large models). Prints matches as they are found.

    :param model: a hyperwords model handle (4-tuple)
    :param cos: target cosine similarity
    :param n: maximum number of pairs to collect
    :return: list of (word1, word2) pairs (previously returned None)
    """
    mtype, matrix, wi_dict, vocab = model

    vocab_clean = [word for word in vocab if word.isalpha()]
    target = round(cos, 2)

    pairs = []
    for word1 in vocab_clean:
        # hoisted out of the inner loop: vec1 is invariant per word1
        vec1 = represent(model, word1)
        for word2 in vocab_clean:
            vec2 = represent(model, word2)
            if round(get_cosine(vec1, vec2), 2) == target:
                pairs.append((word1, word2))
                print(word1, word2)
                if len(pairs) == n:
                    break
        if len(pairs) == n:
            break
    print(pairs)
    return pairs

def test_cosine_pairs(model=None):
    """Print pairwise similarities for a few probe words, then search for
    other word pairs with the same similarity as ('star', 'talent').

    :param model: a loaded model handle. Previously this function read an
        undefined global `model` and always crashed with NameError; the
        parameter keeps the original zero-argument call signature valid
        (it now raises a clear ValueError instead).
    :raises ValueError: if no model is supplied
    """
    if model is None:
        raise ValueError('test_cosine_pairs requires a loaded model')

    word1 = 'star'
    word2 = 'talent'

    word3 = 'shine'
    word4 = 'eminent'
    word5 = 'beam'
    n = 5

    vec1 = represent(model, word1)
    vec2 = represent(model, word2)

    vec3 = represent(model, word3)
    vec4 = represent(model, word4)
    vec5 = represent(model, word5)

    cos1 = get_cosine(vec1, vec2)
    cos2 = get_cosine(vec4, vec5)
    cos3 = get_cosine(vec3, vec4)
    cos4 = get_cosine(vec3, vec5)

    print(f'Similarity of {word1} and {word2}: {cos1}')
    print(f'Similarity of {word4} and {word5}: {cos2}')
    print(f'Similarity of {word3} and {word4}: {cos3}')
    print(f'Similarity of {word3} and {word5}: {cos4}')

    find_words_distance(model, cos1, n)

def test_model(path, name, mtype='sgns'):
    """Load the model at *path* and print a small similarity demo.

    :param path: model path prefix passed to load_model()
    :param name: label printed as a section header
    :param mtype: model type for load_model (default 'sgns', matching the
        previously hard-coded value, so existing callers are unaffected)
    """
    print(f'-----------{name}-------------')
    model = load_model(path, mtype)

    word1 = 'word'
    word2 = 'sentence'
    n = 10
    vec = represent(model, word1)
    vec2 = represent(model, word2)
    cos = get_cosine(vec, vec2)
    print(f'similarity between {word1} and {word2}: {cos}')
    print()
    most_similar = get_nearest_neighbors(model, vec, n)
    for cos, word in most_similar:
        print(word, cos)
    print('-----------------------------')
    print()






def main():
    """Run the similarity demo on the three COHA-1900 sgns models, each
    trained with a different random initialization (pinit1-3)."""
    base = '/Users/piasommerauer/Data/dsm/coha/coha_1900'

    for i in (1, 2, 3):
        path = f'{base}/sgns_rand_pinit{i}/sgns_rand_pinit{i}'
        test_model(path, f'pinit{i}')





# Standard script entry point: run the demo only when executed directly,
# not when imported as a module.
if __name__ == '__main__':

    main()
Binary file not shown.
10 changes: 10 additions & 0 deletions neighbor_stability/dsmtoolkit/dsmtoolkit.egg-info/PKG-INFO
@@ -0,0 +1,10 @@
Metadata-Version: 1.0
Name: dsmtoolkit
Version: 0.1
Summary: Functions to work with distributional semantic models
Home-page: UNKNOWN
Author: pia
Author-email: UNKNOWN
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
8 changes: 8 additions & 0 deletions neighbor_stability/dsmtoolkit/dsmtoolkit.egg-info/SOURCES.txt
@@ -0,0 +1,8 @@
setup.py
dsmtoolkit/__init__.py
dsmtoolkit/load_models.py
dsmtoolkit.egg-info/PKG-INFO
dsmtoolkit.egg-info/SOURCES.txt
dsmtoolkit.egg-info/dependency_links.txt
dsmtoolkit.egg-info/not-zip-safe
dsmtoolkit.egg-info/top_level.txt
@@ -0,0 +1 @@

@@ -0,0 +1 @@

@@ -0,0 +1 @@
dsmtoolkit
1 change: 1 addition & 0 deletions neighbor_stability/dsmtoolkit/dsmtoolkit/__init__.py
@@ -0,0 +1 @@
name = "dsmtoolkit"

0 comments on commit 46e7f40

Please sign in to comment.