### Individual Contributions

Divinity: Generated GPT solutions 1 and 2, reviewed answers for ps4.\
Carlos: Generated answers for ps4, reviewed answers for GPT solution 1 and 2 errors.

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy


###### DATA PROCESSING STARTS #########


def load_corpus():
	"""Read ``sample_corpus.txt`` and return its contents as one long line.

	The file is opened as UTF-8 text relative to the current working
	directory, and every newline in it is replaced by a single space.
	"""
	with open('sample_corpus.txt', 'r', encoding="utf8") as corpus_file:
		lines = corpus_file.read().split('\n')
	# Joining the newline-separated pieces with a space is the same as
	# replacing each newline by a space.
	return ' '.join(lines)

def remove_infrequent_words(sents, threshold=2, unknown_token='<UNKNOWN>'):
	"""Replace rare words with a placeholder token.

	Args:
		sents: a list of sentences, each a list of word strings. It is
			walked twice (once to count, once to filter), so it must be
			re-iterable (a list, not a generator).
		threshold: a word is kept only if it occurs at least this many
			times across all of ``sents``. Defaults to 2.
		unknown_token: the string substituted for every word that occurs
			fewer than ``threshold`` times. Defaults to ``'<UNKNOWN>'``.

	Returns:
		A new list of new word lists with the same shape as ``sents``;
		the input is not modified.
	"""
	# First pass: count every word over the whole corpus.
	word_counts = {}
	for sent in sents:
		for word in sent:
			word_counts[word] = word_counts.get(word, 0) + 1

	# Second pass: keep frequent words, mask the rest.
	return [
		[word if word_counts[word] >= threshold else unknown_token for word in sent]
		for sent in sents
	]

def segment_and_tokenize(corpus):
	"""Turn a raw corpus string into a list of tokenized sentences.

	The corpus (one string containing the entire text) is run through the
	spaCy ``en_core_web_sm`` pipeline. Sentences with fewer than two tokens
	are dropped, infrequent words are masked by ``remove_infrequent_words``,
	and each remaining sentence is wrapped in ``'<START>'`` / ``'<END>'``.

	Returns:
		A list of sentences, each a list of token strings.
	"""
	# Setup: run these on the command line once before using this function.
	#   pip install -U pip setuptools wheel
	#   pip install -U spacy
	#   python -m spacy download en_core_web_sm
	pipeline = spacy.load('en_core_web_sm')
	doc = pipeline(corpus)

	tokenized = []
	for span in doc.sents:
		words = [token.text for token in span]
		# Single-token "sentences" carry no next-word information.
		if len(words) > 1:
			tokenized.append(words)

	filtered = remove_infrequent_words(tokenized)
	return [['<START>', *words, '<END>'] for words in filtered]

def make_word_to_ix(sents):
	"""Build a word -> integer index mapping from tokenized sentences.

	Indices start at 0 and are handed out in order of first appearance
	while scanning the sentences from first to last.

	Args:
		sents: iterable of sentences, each an iterable of hashable words.

	Returns:
		A dict mapping each distinct word to its index.
	"""
	word_to_ix = {}
	for sent in sents:
		for word in sent:
			# A new word receives the next free index, which is the current
			# size of the mapping; already-seen words are left untouched.
			word_to_ix.setdefault(word, len(word_to_ix))
	return word_to_ix

def sent_to_onehot_vecs(sent,word_to_ix):
	"""Convert one tokenized sentence into a list of one-hot vectors.

	Args:
		sent: sequence of words; every word must be a key of ``word_to_ix``
			(a missing word raises ``KeyError``).
		word_to_ix: dict mapping each word to its index.

	Returns:
		A list with one float32 tensor of length ``len(word_to_ix)`` per
		word, holding 1 at the word's index and 0 everywhere else.
	"""
	# note: this is not how you would do this in practice!
	vocab_size = len(word_to_ix)

	vecs = []
	for word in sent:
		one_hot = torch.zeros(vocab_size, dtype=torch.float32, requires_grad=False)
		one_hot[word_to_ix[word]] = 1
		vecs.append(one_hot)
	return vecs

def vectorize_sents(sents,word_to_ix):
	"""One-hot encode every sentence in ``sents``.

	Returns a list with one entry per sentence; each entry is the list of
	vectors produced by ``sent_to_onehot_vecs`` for that sentence.
	"""
	return [sent_to_onehot_vecs(sent, word_to_ix) for sent in sents]

def get_data():
	"""Load the corpus and prepare it for training.

	Chains the data-processing steps: load the corpus text, segment and
	tokenize it, build the word index, and one-hot encode every sentence.

	Returns:
		A ``(vectorized_sents, vocab_size)`` tuple, where ``vocab_size`` is
		the number of entries in the word index.
	"""
	sents = segment_and_tokenize(load_corpus())
	word_to_ix = make_word_to_ix(sents)
	return vectorize_sents(sents, word_to_ix), len(word_to_ix)




###### DATA PROCESSING ENDS #########




###### RNN DEFINITION STARTS #########

class ElmanNetwork(nn.Module):
	"""Simple recurrent (Elman) network that emits a word distribution per step.

	Each input word (a one-hot vector) is embedded with ``W_e``, combined
	with the previous hidden state through ``W_x``, ``W_h`` and ``b`` under
	a sigmoid, and the new hidden state is mapped by ``W_p`` plus a softmax
	to a probability vector over the vocabulary.
	"""

	def __init__(self, embedding_dim, vocab_size, hidden_state_dim):
		super().__init__()

		# Every parameter starts with values drawn uniformly from [0, 1).
		# The creation order below is kept fixed so that seeded runs and
		# the order of ``parameters()`` stay the same.
		self.W_e = nn.Parameter(torch.rand(embedding_dim, vocab_size))
		self.W_x = nn.Parameter(torch.rand(hidden_state_dim, embedding_dim))
		self.W_h = nn.Parameter(torch.rand(hidden_state_dim, hidden_state_dim))
		self.W_p = nn.Parameter(torch.rand(vocab_size, hidden_state_dim))
		self.b = nn.Parameter(torch.rand(hidden_state_dim))

	def initialize_hidden_state(self,shape):
		"""Return an all-zero float32 tensor of the given shape (no gradient)."""
		return torch.zeros(shape, dtype=torch.float32)

	def elman_unit(self,word_embedding,h_previous):
		"""Compute the next hidden state: sigmoid(W_x e + W_h h_prev + b)."""
		input_term = self.W_x @ word_embedding
		recurrent_term = self.W_h @ h_previous
		return torch.sigmoid(input_term + recurrent_term + self.b)

	def embed_word(self,word):
		"""Map a one-hot word vector to its embedding, ``W_e @ word``."""
		return self.W_e @ word

	def single_layer_perceptron(self,h):
		"""Turn a hidden state into a probability vector over the vocabulary."""
		scores = self.W_p @ h
		return torch.softmax(scores, dim=0)

	def forward(self,sent):
		"""Run the network over a sentence.

		Args:
			sent: sequence of one-hot word vectors.

		Returns:
			A list with one probability vector for every word except the
			last; the last word is never fed in, so there are
			``len(sent) - 1`` predictions (none for shorter sentences).
		"""
		hidden = self.initialize_hidden_state(self.W_h.size(1))

		predictions = []
		for current_word in sent[:-1]:
			# The hidden state is the only thing carried from step to step.
			hidden = self.elman_unit(self.embed_word(current_word), hidden)
			predictions.append(self.single_layer_perceptron(hidden))
		return predictions


###### RNN DEFINITION ENDS #########


#### LOSS FUNCTION BEGINS #######

def word_loss(word_probs, word):
	"""Return the negative log of the probability given to ``word``.

	Args:
		word_probs: 1-D tensor of probabilities.
		word: one-hot vector of the same length marking the observed word.

	Returns:
		A 0-dim tensor holding ``-log(word_probs . word)``.
	"""
	# word is a one-hot vector, so the dot product picks out the single
	# probability at the observed word's index.
	return torch.dot(word_probs, word).log().neg()

def sent_loss(predictions, sent):
	"""Average the per-word loss over one sentence.

	Prediction ``i`` is scored against ``sent[i + 1]``, i.e. against the
	word that follows the one the prediction was made from.

	Args:
		predictions: sequence of probability vectors.
		sent: sequence of one-hot word vectors; it must contain at least
			``len(predictions) + 1`` entries.

	Returns:
		A 0-dim float32 tensor: the sum of ``word_loss`` values divided by
		the number of predictions.
	"""
	total = torch.tensor(0, dtype=torch.float32)

	# Start counting at 1 so the index points at the next word in sent.
	for next_ix, word_probs in enumerate(predictions, start=1):
		total = total + word_loss(word_probs, sent[next_ix])

	return total / len(predictions)


##### LOSS FUNCTION ENDS #######


def train(num_epochs=100, hidden_state_dim=20, embedding_dim=40, learning_rate=0.001):
	"""Train an ElmanNetwork on the corpus and print the mean loss per epoch.

	The data comes from ``get_data()``. The network is optimized with plain
	SGD, taking one optimizer step per sentence. After every epoch the
	average of the per-sentence losses is printed.

	Args:
		num_epochs: number of full passes over the sentences.
		hidden_state_dim: size of the recurrent hidden state.
		embedding_dim: size of the word embeddings.
		learning_rate: SGD step size.

	The defaults reproduce the values that were previously hard-coded.
	"""
	vectorized_sents, vocab_size = get_data()

	elman_network = ElmanNetwork(embedding_dim, vocab_size, hidden_state_dim)
	optimizer = optim.SGD(elman_network.parameters(), lr=learning_rate)

	for _ in range(num_epochs):

		total_loss = 0.0
		for s in vectorized_sents:
			optimizer.zero_grad()
			predictions = elman_network(s)
			loss = sent_loss(predictions, s)
			# .item() yields a plain Python float, so the running total is
			# always accumulated in double precision instead of depending on
			# NumPy's scalar-promotion rules; it also works for tensors that
			# do not live on the CPU.
			total_loss += loss.item()

			loss.backward()
			optimizer.step()
		print(total_loss / len(vectorized_sents))


# Start training when this code is executed directly (as a script or in a
# notebook cell), but not when it is imported as a module.
if __name__=='__main__':
	train()




    

7.8167482836162625
6.924606736341313
6.453371144059915
6.183020658109656
5.962648102027088
5.791767258620142
5.673519146262701
5.598330078652157
5.547210658375342
5.508623754558851
5.477973329841192
5.452972191662045
5.431961083052745
5.413609770194969
5.396978827337524
5.381506712832044
5.36690770034215
5.353051423547256
5.339884473810244
5.327380604719996
5.315531013958418
5.304326732673837
5.293754135543977
5.283789726717388
5.274404403552338
5.265561520753794
5.257216554191245
5.249321425979461
5.24183012109306
5.234703594955367
5.227897869761865
5.221380889595453
5.215125482885083
5.2091073409995845
5.203305375755732
5.197705162230449
5.192293787002564
5.187056861331115
5.181983846635675
5.177067373745405
5.1723003912211665
5.167679982688559
5.163198519711519
5.158849785675356
5.154625919476226
5.150519337965616
5.146529164386155
5.142653010478571
5.1388837560337395
5.135217027568338
5.131649933867718
5.12817925927627
5.124801960183149
5.121513893616259
5.11831133521382
5.11519174

### Problem 1

In [4]:
corpus = load_corpus()

In [3]:
segment_and_tokenize(corpus)

[['<START>',
  'Also',
  ',',
  'baby',
  'animals',
  'who',
  'are',
  '<UNKNOWN>',
  'by',
  'their',
  'parents',
  'are',
  'no',
  'longer',
  'kept',
  'in',
  'the',
  '<UNKNOWN>',
  "'s",
  '<UNKNOWN>',
  'because',
  'the',
  '<UNKNOWN>',
  'has',
  'found',
  'that',
  'it',
  'is',
  '<UNKNOWN>',
  'to',
  '<UNKNOWN>',
  'them',
  'into',
  'their',
  '<UNKNOWN>',
  'once',
  'they',
  '<UNKNOWN>',
  'up',
  'if',
  'they',
  'are',
  'kept',
  'away',
  'from',
  'their',
  'own',
  'species',
  '.',
  '<END>'],
 ['<START>',
  'Some',
  'people',
  'think',
  'that',
  'before',
  'becoming',
  'a',
  'god',
  ',',
  'he',
  'was',
  'a',
  'Chinese',
  '<UNKNOWN>',
  'and',
  'a',
  '<UNKNOWN>',
  'of',
  'a',
  '<UNKNOWN>',
  'god',
  '.',
  '<END>'],
 ['<START>',
  'The',
  'Grand',
  'Canyon',
  'is',
  'a',
  'very',
  'large',
  '<UNKNOWN>',
  'in',
  'Arizona',
  ',',
  'United',
  'States',
  '.',
  '<END>'],
 ['<START>',
  'It',
  'is',
  'a',
  'national',
  'par

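This function segments the corpus into sentences and tokenizes each one, filters out sentences that contain only one token, replaces every infrequent word (one that occurs fewer than two times in the corpus) with \<UNKNOWN\>, and finally wraps each sentence in \<START\> and \<END\> markers.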

### Problem 2

In [8]:
sents = [['The', 'dog', 'barked'],['cat', 'The', 'barked']]
word_to_ix = make_word_to_ix(sents)

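make_word_to_ix returns a dictionary in which each key is a unique word (a string) and each value is an int that serves as that word's index. The index is assigned according to when the word is first seen, so for the example above the result is {'The': 0, 'dog': 1, 'barked': 2, 'cat': 3}.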

### Problem 3

In [9]:
vectorize_sents(sents,word_to_ix)

[[tensor([1., 0., 0., 0.]),
  tensor([0., 1., 0., 0.]),
  tensor([0., 0., 1., 0.])],
 [tensor([0., 0., 0., 1.]),
  tensor([1., 0., 0., 0.]),
  tensor([0., 0., 1., 0.])]]

This code vectorizes our words. It takes in a list of tokenized sentences such as sents and, for each word, uses word_to_ix to identify the position at which it must place a 1; at all other positions the values remain 0. The result is one one-hot vector per word, each as long as the vocabulary.

### Problem 4

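The word embedding is generated by multiplying the embedding weight matrix W_e by the word's one-hot vector. Because the vector is 1 at a single position and 0 elsewhere, the product is simply the column of W_e that corresponds to that word.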

### Problem 5

Mathematically, elman_unit computes $h_t = \sigma(W_x e_t + W_h h_{t-1} + b)$: it multiplies the weight matrix $W_x$ by the current word embedding, multiplies the weight matrix $W_h$ by the previous hidden state, and adds the two products together with the bias $b$. Each matrix multiplication works as follows. We will consider the first matrix to be M1 and the second matrix to be M2. To obtain entry (i, j) of the result, we iterate over the shared index c, multiplying M1[i, c] by M2[c, j] and summing the products; that is, we move along row i of M1 and down column j of M2. We do this for all rows of M1 and all columns of M2, which returns a new matrix as a result. When all the matrix multiplication and addition is done, we apply the sigmoid function element-wise, which in turn returns a tensor.

### Problem 6

The single_layer_perceptron function takes in the hidden state and multiplies it by the weight matrix W_p, using matrix multiplication as described above (multiplying rows by columns). We then apply the softmax function provided by PyTorch. The softmax function takes the exponential of each entry $z_i$ of the input vector and normalizes these values by dividing by the sum of all the exponentials: $\mathrm{softmax}(z)_i = \frac{e^{z_i}}{\sum_{j=1}^{N} e^{z_j}}$ for $i = 1, \dots, N$, where $z = (z_1, \dots, z_N) \in \mathbb{R}^N$.

### Problem 7

We believe the Elman network enforces causality by making sure that it moves forward through the sentence in chronological order. In addition to this, it uses hidden states for context. So when it computes current_word_embedding, calls elman_unit, and makes the prediction, it maintains the correct order, which means that only past context influences the predictions for future words.

### Problem 8

We know from the softmax function that the probability vector is generated over the whole set of words: it normalizes by the sum of the exponentials of the scores, and those scores are computed from the current hidden state. Each hidden state is in turn computed from the previous hidden state, so it carries information about all of the words we have vectorized and fed in so far. Therefore the first word in the sentence influences the prediction of the second word in the sentence, and even the third word in the sentence.

### Problem 9

The loss decreases over time, which means the predicted probability distribution is becoming better at predicting the next word that appears. The loss started at 6.09 and ended at 5.11.

### Problem 10

We changed the learning rate from 0.01 to 0.001; when we did so, the loss started at 6.35 and ended at 4.89, with all other variables kept the same. If we return the value of the learning rate back to 0.001 and change the hidden dimension from 20 to 40, the loss starts at 8.52 and ends at 5.00. Finally, we changed the embedding dimension from 20 to 40 and kept all the original variables the same; the loss started at 7.81 and ended at 5.02. It makes sense that the learning rate was the most important hyperparameter: we optimize our model with SGD, so the size of the steps it takes matters. It is of note that if the step is too large, the loss can increase rather than decrease.

### Problem 11

We don't believe the model will be able to model the dependency, as the two words are too far removed from each other, and depending on how random the words in between are, it could lead to an incorrect solution. Another issue that could arise is that there are specific words that appear more often and skew our probability distribution. Examples of this are words like "the", "a", "or", and "and"; we do not believe our model handles these dependencies correctly. Especially with vanishing gradients, a long span of text will result in loss of the gradient because it becomes so small; therefore the model will have trouble remembering context from the 3rd to the 950th word.