In [1]:
# Presumably the notebook author's name; no other visible cell reads this variable.
Name ="Chakrya Ros"

In [6]:
from collections import Counter
import random
import numpy as np
import math
import matplotlib.pyplot as plt


class LanguageModel:
    """Unigram / bigram language model with optional add-one (Laplace) smoothing.

    Training text is split on whitespace. Words that occur exactly once are
    replaced by '<UNK>'; words unknown at scoring time are mapped to '<UNK>'
    as well. Sentence generation relies on the literal boundary tokens
    '<s>' and '</s>' being present in the training text.
    """

    def __init__(self, n_gram, is_laplace_smoothing, backoff=None):
        """Create an untrained model.

        Args:
            n_gram: 1 selects the unigram model; any other value selects the
                bigram model.
            is_laplace_smoothing: truthy to use add-one smoothing when scoring.
            backoff: accepted for interface compatibility; not used.
        """
        self.Ngram = n_gram
        self.freq = None            # Counter: word -> count after <UNK> mapping
        self.word = None            # training tokens after <UNK> mapping
        self.Numtokens = None       # total number of training tokens
        self.smoothing = is_laplace_smoothing
        self.content = None         # raw training text
        self.bigram = {}            # (w1, w2) -> count
        self.unigramProb = {}       # word -> count / Numtokens (used for generation)
        self.bigramProb = {}        # (w1, w2) -> count / total bigram count (used for generation)

    def train(self, training_file_path):
        """Read the training file and (re)build all count and probability tables.

        Calling ``train`` again replaces the previous statistics instead of
        accumulating bigram counts on top of them.

        Args:
            training_file_path: path of a whitespace-tokenised text file.
        """
        with open(training_file_path, "r") as f:
            self.content = f.read()
        tokens = self.content.split()

        # Words seen exactly once are folded into <UNK>.
        raw_counts = Counter(tokens)
        self.word = [w if raw_counts[w] > 1 else '<UNK>' for w in tokens]
        self.freq = Counter(self.word)
        self.Numtokens = sum(self.freq.values())

        # Bigram counts over the <UNK>-mapped token stream.
        self.bigram = {}
        for pair in zip(self.word, self.word[1:]):
            self.bigram[pair] = self.bigram.get(pair, 0) + 1

        # Normalise once, after counting, so every entry uses the final totals.
        total_bigrams = sum(self.bigram.values())
        self.bigramProb = {pair: count / total_bigrams
                           for pair, count in self.bigram.items()}
        # Kept from the original implementation so recorded notebook output still matches.
        print("self.bigram ", self.bigram)

        self.unigramProb = {w: count / self.Numtokens
                            for w, count in self.freq.items()}

    #helper function for calculate the bigram probability for each bigram
    def bigram_probability(self, bigram):
        """Return P(w2 | w1) for ``bigram = (w1, w2)``.

        With smoothing: ``(count(w1, w2) + 1) / (count(w1) + V)`` where V is the
        vocabulary size. Without smoothing: ``count(w1, w2) / count(w1)``, or
        0.0 when the pair was never seen.
        """
        first, second = bigram[0], bigram[1]
        pair_count = self.bigram.get((first, second), 0)
        first_count = self.freq[first]      # Counter yields 0 for unseen words
        if self.smoothing:
            return (pair_count + 1) / (first_count + len(self.freq))
        if pair_count == 0:
            return 0.0
        return pair_count / first_count

    #helper function for calculate the unigram probability for each word
    def unigram_probability(self, word):
        """Return P(word), add-one smoothed when smoothing is enabled."""
        if self.smoothing:
            return (self.freq[word] + 1) / (self.Numtokens + len(self.freq))
        return self.freq[word] / self.Numtokens

    #helper function for generate unigram random
    def unigram_generate(self):
        """Sample one sentence word by word from the unigram distribution.

        '<s>' is never sampled. A '</s>' drawn as the very first word is
        re-drawn so the sentence is not empty (unless '</s>' is the only
        candidate).

        Returns:
            The sentence as a space-joined string from '<s>' to '</s>'.

        Raises:
            ValueError: if '</s>' is not in the vocabulary, because sampling
                could then never terminate.
        """
        vocab = [w for w in self.freq if w != '<s>']
        if '</s>' not in vocab:
            raise ValueError("cannot generate: '</s>' never occurs in the training data")
        weights = [self.unigramProb[w] for w in vocab]
        can_avoid_empty = len(vocab) > 1

        sentence = ['<s>']
        while sentence[-1] != '</s>':
            word = random.choices(vocab, weights)[0]
            if word == '</s>' and len(sentence) == 1 and can_avoid_empty:
                continue    # re-draw instead of emitting an empty sentence
            sentence.append(word)
        return ' '.join(sentence)

    def bigram_genterate(self):
        """Sample one sentence from the bigram model, starting at '<s>'.

        Each next word is drawn from the successors of the current word,
        weighted by ``bigramProb``. Generation stops at '</s>' or when the
        current word has no recorded successor.

        Returns:
            The sentence as a space-joined string.
        """
        # Index successors once per sentence instead of rescanning every bigram per word.
        successors = {}
        for (first, second), prob in self.bigramProb.items():
            words, weights = successors.setdefault(first, ([], []))
            words.append(second)
            weights.append(prob)

        sentence = ['<s>']
        current = '<s>'
        while current != '</s>' and current in successors:
            words, weights = successors[current]
            current = random.choices(words, weights)[0]
            sentence.append(current)
        return " ".join(sentence)

    # Correctly spelled alias; the original (misspelled) name stays available.
    bigram_generate = bigram_genterate

    # generate the random sentence
    def generate(self, num_sentences):
        """Return a list of ``num_sentences`` generated sentences."""
        make_sentence = self.unigram_generate if self.Ngram == 1 else self.bigram_genterate
        return [make_sentence() for _ in range(num_sentences)]

    #calcuate the probability of each sentence
    def score(self, sentence):
        """Return the probability of a whitespace-tokenised sentence string.

        Unknown words are mapped to '<UNK>'. The unigram model multiplies the
        word probabilities; the bigram model multiplies P(w_i | w_{i-1}) over
        adjacent pairs. A sentence with no words (or, for bigrams, fewer than
        two words) scores 1.0.
        """
        tokens = [w if w in self.freq else '<UNK>' for w in sentence.split()]
        prob = 1.0
        if self.Ngram == 1:
            for w in tokens:
                prob = prob * self.unigram_probability(w)
        else:
            # Only real (w1, w2) pairs are scored; no trailing lone token.
            for pair in zip(tokens, tokens[1:]):
                prob = prob * self.bigram_probability(pair)
        return prob

    def _score_file(self, path):
        """Score every non-blank line of ``path`` and return the probabilities."""
        with open(path) as f:
            return [self.score(line) for line in f if line.strip()]

    #helper function to write probabiliy to file
    def prob_to_file(self, test_file, outfile):
        """Write the probability of each non-blank line of ``test_file`` to ``outfile``, one per line."""
        probs = self._score_file(test_file)
        with open(outfile, 'w') as out:
            for prob in probs:
                out.write(str(prob) + "\n")

    def plot_histogram(self, test_file, my_test_set, savefile):
        """Plot a stacked histogram of sentence probabilities for two test files.

        Bins are log-spaced from the decade of the smallest non-zero
        probability up to 1.0. Zero probabilities cannot be placed on a log
        scale and therefore fall below the lowest bin.

        Args:
            test_file: path of the first test set (labelled "hw2-test").
            my_test_set: path of the second test set (labelled "My test set").
            savefile: path the figure is saved to.

        Raises:
            ValueError: if no sentence in either file has a non-zero probability.
        """
        prob_hw_test = self._score_file(test_file)
        prob_my_test = self._score_file(my_test_set)

        positive = [p for p in prob_hw_test + prob_my_test if p > 0]
        if not positive:
            raise ValueError("no sentence has a non-zero probability; cannot build log-spaced bins")
        min_exponent = np.floor(np.log10(min(positive)))
        bins = np.logspace(min_exponent, 0.0)

        #plot the histogram
        plt.hist([prob_hw_test, prob_my_test], bins=bins,
                 label=["hw2-test", "My test set"], stacked=True)
        plt.title("The relative frequency of the probabilities of the test set")
        plt.xlabel("Probability of test set")
        plt.ylabel("Frequecy")
        plt.legend()
        plt.savefig(savefile, bbox_inches='tight')
        plt.show()

    #extra credit to calculate the perplexity
    def perplexity(self, test_sequence):
        """Return the perplexity of the model on the file ``test_sequence``.

        Computed as ``2 ** (-(sum of log2 sentence probabilities) / N)`` where
        N is the number of whitespace tokens in the file.

        Returns:
            The perplexity, or ``float('inf')`` if any sentence has probability 0.

        Raises:
            ValueError: if the file contains no tokens.
        """
        with open(test_sequence) as f:
            sentences = [line for line in f if line.strip()]
        num_words = sum(len(s.split()) for s in sentences)
        if num_words == 0:
            raise ValueError("cannot compute perplexity of an empty file")

        neg_log_sum = 0.0
        for s in sentences:
            prob = self.score(s)
            if prob == 0:
                return float('inf')
            neg_log_sum -= math.log(prob, 2)
        return math.pow(2, neg_log_sum / num_words)
            
           
        
def main():
    """Train a Laplace-smoothed unigram model and save two generated sentences.

    Reads 'hw2-minitest.txt' and writes one generated sentence per line to
    'final_unigram-generated.txt'. The bigram experiments below are disabled.
    """
    #Unigram model
    LM = LanguageModel(1, True, backoff=None)
#     LM.train('berp-training.txt')
    LM.train('hw2-minitest.txt')
    random_sentences = LM.generate(2)
    # `with` guarantees the output file is closed even if a write fails.
    with open("final_unigram-generated.txt", 'w') as generate_file:
        for sentence in random_sentences:
            generate_file.write(str(sentence) + '\n')
#     LM.prob_to_file("hw2-test.txt","hw2-unigram-out.txt")
#     LM.plot_histogram("hw2-test.txt","hw2-my-test.txt", "hw2-unigram-histogram.pdf")

    
    #bigram model
#     LM_bigram = LanguageModel(2, False, backoff=None)
#     LM_bigram.train('berp-training.txt')
#     LM_bigram.train('hw2-minitest.txt')
#     random_bigram = LM_bigram.generate(2)
#     print(LM_bigram.score('<s> sam i ham </s>'))
#     generate_bigram = open("hw2-bigram-generated.txt", 'w')
#     for sentence in random_bigram:
#         generate_bigram.write(str(sentence) + '\n')
#     generate_bigram.close()
#     LM_bigram.prob_to_file("hw2-test.txt","hw2-bigram-out.txt")
#     LM_bigram.plot_histogram("hw2-test.txt","hw2-my-test.txt", "hw2-bigram-histogram.pdf")
#     test_perplex = open("hw2-minitest.txt", 'r')
#     print(LM.perplexity("hw2-minitest.txt"))

        
        

                             
                
# Run the demo only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    
    main()
    


self.bigram  {('<s>', '<UNK>'): 2, ('<UNK>', 'is'): 1, ('is', 'close'): 2, ('close', '</s>'): 2, ('</s>', '<s>'): 5, ('<s>', 'I'): 4, ('I', 'am'): 4, ('am', 'excited'): 2, ('excited', 'for'): 2, ('for', '<UNK>'): 2, ('<UNK>', '</s>'): 2, ('am', 'sad'): 2, ('sad', '<UNK>'): 2, ('<UNK>', 'season'): 2, ('season', 'ended'): 1, ('ended', '</s>'): 2, ('<UNK>', 'ended'): 1, ('season', 'is'): 1, ('<UNK>', '<UNK>'): 1}


In [5]:
# Logit of a six-feature linear model: p = sum_j(weight_j * feature_j) + bias.
# Terms are accumulated left to right, in the same order as the written-out sum.
weights = [2.5, -5, -1.2, 0.5, 2, .7]
features = [3, 2, 1, 3, 0, 4.15]
bias = 0.1
p = 0
for w_j, x_j in zip(weights, features):
    p = p + w_j * x_j
p = p + bias
print(p)


0.805


In [5]:
import numpy as np
# Logistic (sigmoid) of the logit `p`; `p` must already be defined by an earlier cell.
print(1/(1+np.exp(-p)))

0.6910430124157229


In [7]:
# Scale every feature value by the constant factor -0.31.
x = [3,2,1,3,0,4.15]
w = [(-0.31)*value for value in x]
print(w)

[-0.9299999999999999, -0.62, -0.31, -0.9299999999999999, -0.0, -1.2865000000000002]


In [14]:
# Apply one update step: w_{t+1}[j] = w_t[j] - gre[j].
# `gre` holds the per-feature step values (they match the rounded output of the previous cell).
w_t = [2.5, -5, -1.2, 0.5,2,0.7]
gre = [-0.93, -0.62,-.31,-0.93, 0,-1.29]
# Built without a temporary named `p`, so the logit `p` from an earlier cell is not overwritten.
w_t_1 = [weight - step for weight, step in zip(w_t, gre)]
print(w_t_1)
        

[3.43, -4.38, -0.8899999999999999, 1.4300000000000002, 2, 1.99]


In [18]:

# Re-evaluate the model with the updated weights `w_t_1` (defined by the previous cell).
feature = [3,2,1,3,0,4.15]
# Logit = feature . w_t_1 + 0.1; the 0.1 is the same constant added to the first logit `p`.
z = np.dot(feature,w_t_1) + 0.1
print(z)
# Sigmoid of the new logit.
print(1/(1+np.exp(-z)))

13.288500000000004
0.9999983061418364


In [2]:
#quiz 5
import numpy as np
import math
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a scalar ``x``.

    For negative ``x`` the algebraically equal form e^x / (1 + e^x) is used,
    so large negative inputs return a value near 0 instead of raising
    ``OverflowError`` from ``math.exp``.
    """
    if x >= 0:
        return 1/(1 + math.exp(-x))
    exp_x = math.exp(x)
    return exp_x/(1 + exp_x)

# Forward pass of a single logistic unit; x, w, b, z and p_y_1 are reused by later cells.
x = [1,1,3]
w=[0,-2,0.75]
b = 0.5
# Logit z = x . w + b
z = (np.dot(x,w)) + b
print(z)
# Sigmoid written out with numpy (so p_y_1 is a numpy scalar) ...
p_y_1 = 1/(1+np.exp(-z))
print("P(y=1|x) = {:.04f}".format(p_y_1))

# ... and the same value via the sigmoid() helper, as a cross-check.
print(sigmoid(z))

0.75
P(y=1|x) = 0.6792
0.679178699175393


In [3]:
# Update rule used below:
#   w_{t+1} = w_t - learning_rate * gradient
#   gradient_j = (sigmoid(w . x + b) - y) * x_j
def gradient(x,z,y=1):
    """Return the list ``[(z - y) * x_j for each x_j in x]``.

    Args:
        x: sequence of feature values.
        z: value the label is subtracted from (callers pass the predicted probability).
        y: label, 1 by default.
    """
    error = z - y
    return [error * x_j for x_j in x]
# Gradient of the weights for label y = 1, using the prediction p_y_1 from the earlier cell.
grad = gradient(x,p_y_1,1)
print("gradient = {}".format(grad))
# One update step with a multiplier of 1: w_t_1[i] = w[i] - 1 * grad[i]
w_t_1= []
for i in range(len(grad)):
    w_t_1.append(w[i]-(1*grad[i]))
print("w_t_1 = {}".format(w_t_1))

# Recompute the prediction with the updated weights; the bias b is still the original one here.
z_new = np.dot(x,w_t_1)+b
print("p_y_1 from updat weight = {}".format(sigmoid(z_new)))

gradient = [-0.320821300824607, -0.320821300824607, -0.9624639024738211]
w_t_1 = [0.320821300824607, -1.679178699175393, 1.712463902473821]
p_y_1 from updat weight = 0.9863333296044556


In [4]:
# NOTE(review): `p` is not read again in this cell, and z_new is reassigned below,
# so re-running this cell alone yields a different `p`.
p = sigmoid(z_new)
# Error term for the bias: prediction with the ORIGINAL weights minus the label 1.
dz = p_y_1 - 1
# Bias update with a multiplier of 1.
b_1 = b - (1*dz)
print(b_1)
# print(((1/2)*dz))
# Prediction using both the updated weights and the updated bias.
z_new = (np.dot(x,w_t_1)) + b_1
print(sigmoid(z_new))

0.820821300824607
0.9900467753908733


In [11]:
# Reset the example to its initial parameters before repeating the update with a step of 0.1.
x = [1,1,3]
w = [0,-2,0.75]
b = 0.5
# Logit and prediction with the initial parameters; sigmoid() and np come from earlier cells.
z = (np.dot(x,w)) + b
p_y_1 = sigmoid(z)
def gradient(x,z,y=1):
    """Return ``[(z - y) * x_j for each x_j in x]``.

    Args:
        x: sequence of feature values.
        z: value the label is subtracted from (callers pass the predicted probability).
        y: label, 1 by default.
    """
    error = z - y
    return [error * x_j for x_j in x]

# Gradient of the weights for label y = 1.
grad = gradient(x,p_y_1,1)

# Weight update with a multiplier of 0.1: w_t_1_new[i] = w[i] - 0.1 * grad[i]
w_t_1_new = []
for i in range(len(grad)):
    w_t_1_new.append(w[i]-(0.1*grad[i]))
print("w_t_1_new = {}".format(w_t_1_new))

# Prediction with updated weights and the original bias (0.5 is the same value as b).
z_new0 = np.dot(x,w_t_1_new)+0.5
print("p_y_1 from updat weight = {}".format(sigmoid(z_new0)))
# NOTE(review): p0 is not read anywhere else in this cell.
p0 = sigmoid(z_new0)
# Bias update with the same 0.1 multiplier, using the error of the original prediction.
dz = p_y_1 - 1
b_1_new = 0.5 - (0.1*dz)
print("b_1_new ", b_1_new)
# print(((1/2)*dz))
# Prediction with both updated weights and updated bias; printed as 1 - sigmoid, then sigmoid.
z_new_0 = np.dot(x,w_t_1_new) + b_1_new
print(1-sigmoid(z_new_0))
print(sigmoid(z_new_0))

w_t_1_new = [0.032082130082460705, -1.9679178699175393, 0.8462463902473821]
p_y_1 from updat weight = 0.7508037257111576
b_1_new  0.5320821300824607
0.2432422072374817
0.7567577927625183


In [3]:
import numpy as np
import math
# Two inputs feeding three units: row i of `w` holds the weights of unit i, b[i] its bias.
x = [3, 0.75]
w = [[1, 0],
     [2, -3],
     [0, 0.5]]
b =[1, -0.5, -1]

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a scalar ``x``.

    For negative ``x`` the algebraically equal form e^x / (1 + e^x) is used,
    so large negative inputs do not raise ``OverflowError`` from ``math.exp``.
    """
    if x >= 0:
        return 1/(1 + math.exp(-x))
    exp_x = math.exp(x)
    return exp_x/(1 + exp_x)

# Pre-activation of each unit: z[i] = x . w[i] + b[i]
z = [np.dot(x, unit_weights) + unit_bias for unit_weights, unit_bias in zip(w, b)]
print(z)


# p_y_1 = sigmoid(z)
# def gradient(x,z,y=1):
#     grad = []
#     temp  = z-y
#     for i in x:
#         grad.append(temp*i)
#     return grad

# grad = gradient(x,0.373,1)

# w_t_1_new = []
# for i in range(len(grad)):
#     w_t_1_new.append(w[i]-(0.1*grad[i]))
# print("w_t_1_new = {}".format(w_t_1_new))

[4.0, 3.25, -0.625]


TypeError: unsupported operand type(s) for -: 'list' and 'float'