Skip to content


add link to hw3 solutions
Browse files Browse the repository at this point in the history
  • Loading branch information
acocos committed Feb 16, 2018
1 parent 8f356df commit 39a2897
Show file tree
Hide file tree
Showing 2 changed files with 351 additions and 1 deletion.
2 changes: 1 addition & 1 deletion
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ Here are the deliverables that you will need to submit:
<div class="panel-body" markdown="1">

This assignment was worth 60 points total (30 code, 30 writeup). The rubric used for grading this homework is below. The code we used to test your `` scripts locally is available [here](downloads/hw3/).
This assignment was worth 60 points total (30 code, 30 writeup). The rubric used for grading this homework is below. The code we used to test your `` scripts locally is available [here](downloads/hw3/), and the solution code is [here](downloads/hw3/).

#### Code (30 points total)

Expand Down
350 changes: 350 additions & 0 deletions downloads/hw3/
Original file line number Diff line number Diff line change
@@ -0,0 +1,350 @@
import os
import csv
import subprocess
import re
import random
import numpy as np

def read_in_shakespeare():
    '''Reads in the Shakespeare dataset and processes it into a list of tuples.

    Also reads in the vocab and play name lists from files. The three input
    files ('will_play_text.csv', 'vocab.txt' and 'play_names.txt') are opened
    relative to the current working directory.

    Returns:
      tuples: A list of tuples, one per row of the CSV file, where
          tuple[0]: The name of the play (column 1 of the row).
          tuple[1]: A line from the play (column 5 of the row) as a list of
              lowercased, tokenized words.
      document_names: A list of the plays present in the corpus (the stripped
          lines of 'play_names.txt').
      vocab: A list of all tokens in the vocabulary (the stripped lines of
          'vocab.txt').
    '''
    tuples = []

    # newline='' is what the csv module asks for so that it can handle line
    # endings embedded in quoted fields itself.
    with open('will_play_text.csv', newline='') as f:
        csv_reader = csv.reader(f, delimiter=';')
        for row in csv_reader:
            play_name = row[1]
            line = row[5]
            # Everything that is not a letter, digit or whitespace becomes a
            # separator, so "who's" is tokenized as "who" and "s".
            line_tokens = re.sub(r'[^a-zA-Z0-9\s]', ' ', line).lower().split()
            tuples.append((play_name, line_tokens))

    with open('vocab.txt') as f:
        vocab = [line.strip() for line in f]

    with open('play_names.txt') as f:
        document_names = [line.strip() for line in f]

    return tuples, document_names, vocab

def get_row_vector(matrix, row_id):
    '''Returns row `row_id` of the two-dimensional numpy array `matrix`.'''
    return matrix[row_id, :]

def get_column_vector(matrix, col_id):
    '''Returns column `col_id` of the two-dimensional numpy array `matrix`.'''
    return matrix[:, col_id]

def create_term_document_matrix(line_tuples, document_names, vocab):
    '''Returns a numpy array containing the term document matrix for the input lines.

    Inputs:
      line_tuples: A list of tuples, containing the name of the document and
          a tokenized line from that document.
      document_names: A list of the document names
      vocab: A list of the tokens in the vocabulary

    Let m = len(vocab) and n = len(document_names).

    Returns:
      td_matrix: A mxn numpy array where each row corresponds to a token in
          the vocabulary and each column corresponds to a document. A_ij
          contains the frequency with which word i occurs in document j.
          Lines from documents that are not in document_names and tokens that
          are not in vocab are ignored.
    '''
    vocab_to_id = {word: idx for idx, word in enumerate(vocab)}
    docname_to_id = {name: idx for idx, name in enumerate(document_names)}

    matrix = np.zeros([len(vocab), len(document_names)])

    for document, tokens in line_tuples:
        column_id = docname_to_id.get(document)
        if column_id is None:
            # Unknown document: nothing to count for this line.
            continue
        for word in tokens:
            row_id = vocab_to_id.get(word)
            if row_id is None:
                # Out-of-vocabulary token.
                continue
            matrix[row_id, column_id] += 1

    return matrix

def create_term_context_matrix(line_tuples, vocab, context_window_size=1):
    '''Returns a numpy array containing the term context matrix for the input lines.

    Inputs:
      line_tuples: A list of tuples, containing the name of the document and
          a tokenized line from that document.
      vocab: A list of the tokens in the vocabulary
      context_window_size: How many positions to the left and to the right of
          a word count as its context.

    Let n = len(vocab).

    Returns:
      tc_matrix: A nxn numpy array where A_ij contains the frequency with which
          word j was found within context_window_size to the left or right of
          word i in any line in the tuples. Tokens that are not in vocab are
          skipped, both as target words and as context words; windows never
          extend across line boundaries.
    '''
    vocab_to_id = {word: idx for idx, word in enumerate(vocab)}

    matrix = np.zeros([len(vocab), len(vocab)])

    for _document, tokens in line_tuples:
        num_tokens = len(tokens)
        for idx, word in enumerate(tokens):
            # The target row does not depend on the offset, so look it up once.
            row_id = vocab_to_id.get(word)
            if row_id is None:
                continue
            for offset in range(1, context_window_size + 1):
                # Visit the left neighbour first, then the right one.
                for context_idx in (idx - offset, idx + offset):
                    if not 0 <= context_idx < num_tokens:
                        continue
                    column_id = vocab_to_id.get(tokens[context_idx])
                    if column_id is not None:
                        matrix[row_id, column_id] += 1

    return matrix

def create_PPMI_matrix(term_context_matrix):
    '''Given a term context matrix, output a PPMI matrix.

    See section 15.1 in the textbook.

    Inputs:
      term_context_matrix: A nxn numpy array, where n is
          the number of tokens in the vocab.

    Returns: A numpy array with the same shape as term_context_matrix, where
      A_ij is equal to the positive point-wise mutual information between the
      ith word and the jth word in the term_context_matrix.
    '''
    # A small constant is added to every count so that the logarithm below
    # never sees zero.
    smoothing = 1e-6
    smoothed = np.asarray(term_context_matrix, dtype=float) + smoothing

    # Joint probability of (word, context) and the two marginals.
    p_ab = smoothed / np.sum(smoothed)
    p_a = np.sum(p_ab, axis=1)[:, None]  # sum each row
    p_b = np.sum(p_ab, axis=0)[None, :]  # sum each col

    pmi_mat = np.log2(p_ab / (p_a * p_b))
    # Negative PMI values are clipped to zero.
    return np.maximum(pmi_mat, 0.)


def create_tf_idf_matrix(term_document_matrix):
    '''Given the term document matrix, output a tf-idf weighted version.

    See section 15.2.1 in the textbook.

    Inputs:
      term_document_matrix: Numpy array where each column represents a document
          and each row, the frequency of a word in that document.

    Returns:
      A numpy array with the same dimension as term_document_matrix, where
      A_ij is the frequency of word i in document j multiplied by the inverse
      document frequency of word i, log(N / df_i), using the natural
      logarithm. Words that occur in no document keep an all-zero row.
    '''
    counts = np.asarray(term_document_matrix, dtype=float)
    n = float(counts.shape[1])  # Number of documents

    # Number of documents in which each word occurs at least once.
    num_docs_with = np.sum(counts > 0, axis=1)

    # Only compute the idf where it is defined; a word that never occurs would
    # otherwise give log(n / 0) = inf and a row full of NaN (0 * inf).
    idf = np.zeros(counts.shape[0])
    seen = num_docs_with > 0
    idf[seen] = np.log(n / num_docs_with[seen])

    return counts * idf[:, np.newaxis]

def compute_cosine_similarity(vector1, vector2):
    '''Computes the cosine similarity of the two input vectors.

    Inputs:
      vector1: A numpy array with n entries (1-D or nx1)
      vector2: A numpy array with n entries (1-D or nx1)

    Returns:
      A scalar similarity value: the dot product of the two vectors divided by
      the product of their Euclidean norms, or 0.0 if either vector is all
      zeros (where the quotient is undefined).
    '''
    # Flatten so that both 1-D vectors and nx1 column vectors are accepted.
    v1 = np.asarray(vector1, dtype=float).ravel()
    v2 = np.asarray(vector2, dtype=float).ravel()

    den1 = np.sqrt((v1 ** 2).sum())
    den2 = np.sqrt((v2 ** 2).sum())
    den = den1 * den2
    if den == 0:
        return 0.0
    return np.dot(v1, v2) / den

def compute_jaccard_similarity(vector1, vector2):
    '''Computes the (weighted) Jaccard similarity of the two input vectors.

    Inputs:
      vector1: A nx1 numpy array
      vector2: A nx1 numpy array

    Returns:
      A scalar similarity value: the sum of the element-wise minima divided by
      the sum of the element-wise maxima, or 0.0 if that denominator is zero
      (e.g. both vectors are all zeros).
    '''
    num = np.minimum(vector1, vector2).sum()
    den = np.maximum(vector1, vector2).sum()
    if den == 0:
        return 0.0
    return num / den

def compute_dice_similarity(vector1, vector2):
    '''Computes the Dice similarity of the two input vectors.

    Inputs:
      vector1: A nx1 numpy array
      vector2: A nx1 numpy array

    Returns:
      A scalar similarity value: twice the sum of the element-wise minima
      divided by the sum of all entries of both vectors, or 0.0 if that
      denominator is zero (e.g. both vectors are all zeros).
    '''
    v1 = np.asarray(vector1)
    v2 = np.asarray(vector2)

    num = 2. * np.minimum(v1, v2).sum()
    den = (v1 + v2).sum()
    if den == 0:
        return 0.0
    return num / den

def rank_plays(target_play_index, term_document_matrix, similarity_fn):
    '''Ranks the similarity of all of the plays to the target play.

    Inputs:
      target_play_index: The integer index of the play we want to compare all
          others against (a column index of term_document_matrix).
      term_document_matrix: The term-document matrix as a mxn numpy array,
          one column per play.
      similarity_fn: Function that should be used to compare vectors for two
          documents. It is called as similarity_fn(target_vector, other_vector)
          and must return a scalar, e.g. compute_dice_similarity,
          compute_jaccard_similarity, or compute_cosine_similarity.

    Returns:
      A length-n numpy array of column indices (not play names), ordered by
      decreasing similarity to the play indexed by target_play_index. The
      target play itself is included in the ranking.
    '''
    _, n = term_document_matrix.shape
    v_tgt = get_column_vector(term_document_matrix, target_play_index)

    sims = np.zeros(n)
    for i in range(n):
        sims[i] = similarity_fn(v_tgt, get_column_vector(term_document_matrix, i))

    # argsort is ascending, so sort the negated scores to get best-first order.
    return np.argsort(-sims)

def rank_words(target_word_index, matrix, similarity_fn):
    '''Ranks the similarity of all of the words to the target word.

    Inputs:
      target_word_index: The index of the word we want to compare all others
          against (a row index of matrix).
      matrix: Numpy matrix where the ith row represents a vector embedding of
          the ith word.
      similarity_fn: Function that should be used to compare vectors for two
          word embeddings. It is called as
          similarity_fn(target_vector, other_vector) and must return a scalar,
          e.g. compute_dice_similarity, compute_jaccard_similarity, or
          compute_cosine_similarity.

    Returns:
      A length-n numpy array of row indices (not words), ordered by decreasing
      similarity to the word indexed by target_word_index. The target word
      itself is included in the ranking.
    '''
    n, _ = matrix.shape
    v_tgt = get_row_vector(matrix, target_word_index)

    sims = np.zeros(n)
    for i in range(n):
        sims[i] = similarity_fn(v_tgt, get_row_vector(matrix, i))

    # argsort is ascending, so sort the negated scores to get best-first order.
    return np.argsort(-sims)

if __name__ == '__main__':
    # Demo driver: builds all four matrices from the Shakespeare corpus and
    # prints the nearest plays to a randomly chosen play and the nearest words
    # to a fixed query word under each similarity function.
    tuples, document_names, vocab = read_in_shakespeare()

    print('Computing term document matrix...')
    td_matrix = create_term_document_matrix(tuples, document_names, vocab)

    print('Computing tf-idf matrix...')
    tf_idf_matrix = create_tf_idf_matrix(td_matrix)

    print('Computing term context matrix...')
    tc_matrix = create_term_context_matrix(tuples, vocab, context_window_size=2)

    print('Computing PPMI matrix...')
    PPMI_matrix = create_PPMI_matrix(tc_matrix)

    random_idx = random.randint(0, len(document_names)-1)
    similarity_fns = [compute_cosine_similarity, compute_jaccard_similarity, compute_dice_similarity]
    for sim_fn in similarity_fns:
        print('\nThe 10 most similar plays to "%s" using %s are:' % (document_names[random_idx], sim_fn.__qualname__))
        ranks = rank_plays(random_idx, td_matrix, sim_fn)
        # Slicing (rather than range(0, 10)) stays in bounds when the corpus
        # has fewer than 10 plays.
        for idx, doc_id in enumerate(ranks[:10]):
            print('%d: %s' % (idx+1, document_names[doc_id]))

    word = 'juliet'
    vocab_to_index = dict(zip(vocab, range(0, len(vocab))))
    # Same report for both word-embedding matrices; only the label differs.
    word_matrices = [
        ('term-context frequency matrix', tc_matrix),
        ('PPMI matrix', PPMI_matrix),
    ]
    for matrix_label, word_matrix in word_matrices:
        for sim_fn in similarity_fns:
            print('\nThe 10 most similar words to "%s" using %s on %s are:' % (word, sim_fn.__qualname__, matrix_label))
            ranks = rank_words(vocab_to_index[word], word_matrix, sim_fn)
            for idx, word_id in enumerate(ranks[:10]):
                print('%d: %s' % (idx+1, vocab[word_id]))

0 comments on commit 39a2897

Please sign in to comment.