In [5]:
!mkdir -p Rawdata
!google-drive-ocamlfuse Rawdata

fuse: mountpoint is not empty
fuse: if you are sure this is safe, use the 'nonempty' mount option


In [6]:
!wget https://raw.githubusercontent.com/livingbio/DeepLearningTutorial/master/raw_sentences.txt -P 'Rawdata/gliacloud text'

--2018-04-26 17:29:37--  https://raw.githubusercontent.com/livingbio/DeepLearningTutorial/master/raw_sentences.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2955731 (2.8M) [text/plain]
Saving to: ‘Rawdata/gliacloud text/raw_sentences.txt’


2018-04-26 17:30:00 (143 KB/s) - ‘Rawdata/gliacloud text/raw_sentences.txt’ saved [2955731/2955731]



In [2]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import numpy as np

# The probabilities of each bigram and trigram.
# A function ngram_probs to calculate these counts.
def ngram_probs(filename='raw_sentences.txt'):
    with open(filename,'r') as file:
        content_text = file.read()
    
    tokenizer = Tokenizer(lower=True, split=" ")
    tokenizer.fit_on_texts([content_text])
    vocab = tokenizer.word_index
    vocab_size = len(vocab)
    
    vocab_reverse = {}
    vocab_keys = list(vocab.keys())
    for vocab_word in vocab_keys:
        vocab_index = vocab[vocab_word]
        vocab_reverse[vocab_index] = vocab_keys[vocab_index-1]
            
    content_word_ids = tokenizer.texts_to_sequences([content_text])
    content_word_ids = content_word_ids[0]
    content_len = len(content_word_ids)
    
    unigram_probs = np.zeros([vocab_size])
    bigram_probs = np.zeros([vocab_size, vocab_size])
    trigram_probs = np.zeros([vocab_size, vocab_size, vocab_size])
    
    for i in range(content_len):
        # P(W_i)
        unigram_probs[content_word_ids[i]-1] += 1
        
        # P(W_i|W_i-1)
        if i>0:
            bigram_probs[content_word_ids[i]-1, content_word_ids[i-1]-1] += 1
        
        # P(W_i|W_i-2, W_i-1)
        if i-1>0:
            trigram_probs[content_word_ids[i]-1, content_word_ids[i-1]-1, content_word_ids[i-2]-1] += 1
    return unigram_probs, bigram_probs, trigram_probs, vocab, vocab_reverse

cnt1, cnt2, cnt3, vocab ,vocab_reverse = ngram_probs('E:/Raw data/test/raw_sentences.txt')
vocab_size = len(vocab)
print('Counting of bigram with "we", "are":', cnt2[(vocab['we']-1, vocab['are']-1)])

Counting of bigram with "we", "are": 283.0


In [3]:
# Return the distribute of the number of N-grams counting
def ngram_distr(cnts):
    distr = {}
    cnts = cnts.reshape(-1)
    for cnt in cnts:
        if distr.__contains__(cnt):
            distr[cnt] += 1
        else:
            distr[cnt] = 1
    return distr

unigram_distr = ngram_distr(cnt1)
bigram_distr = ngram_distr(cnt2)
trigram_distr = ngram_distr(cnt3)
cnt_sum = cnt1.sum()

def vocab_idx(word):
    return vocab[word]-1

# To calculate the probabilities of a word.
def prob1(target, cnt1=cnt1):
    return cnt1[target] / cnt_sum

p = prob1(vocab_idx('we'))
print('P(we) =', p)

# To calculate the probabilities of next word of a unigram.
def prob2(unigram, target, cnt1=cnt1, cnt2=cnt2):
    return cnt2[target, unigram] / cnt1[unigram]

p = prob2(vocab_idx('we'), vocab_idx('are'))
print('P(are|we) =', p)

# To calculate the probabilities of next word of a bigram.
def prob3(bigram, target, cnt2=cnt2, cnt3=cnt3):
    return cnt3[target, bigram[1], bigram[0]] / (cnt2[bigram[1], bigram[0]] + 1e-16)

p = prob3([vocab_idx('we'), vocab_idx('are')], vocab_idx('family'))
print('P(family|we,are) =', p)

P(we) = 0.019591140053225306
P(are|we) = 0.19977453901280295
P(family|we,are) = 0.004433696090286175


In [4]:
# Discount factor of the probability of the N-gram.
def prob_discount(r, distr, K_limit=5):
    dr = 1.0
    if r < K_limit:
        dr = (r + 1) / r * distr[r + 1] / distr[r]
    return dr

# Bigram back-off factor of Katz Smoothing
def backoff2(unigram):
    prob2_sum = 0.0
    prob1_sum = 0.0
    for word in range(len(vocab)):
        r = cnt2[word, unigram]
        if r>0:
            dr = prob_discount(r, bigram_distr)
            prob2_sum += dr * prob2(unigram, word)
            prob1_sum += prob1(word)
    return (1.0 - prob2_sum) / (1.0 - prob1_sum)

# Bigram Katz's back-off probobility
def prob2_Katz(unigram, target, cnt2=cnt2):
    r = cnt2[target, unigram]
    if r>0:
        return prob_discount(r, bigram_distr) * prob2(unigram, target)
    else:
        return backoff2(unigram) * prob1(target)

# Trigram back-off factor of Katz Smoothing
def backoff3(bigram):
    prob3_sum = 0.0
    prob2_sum = 0.0
    for word in range(len(vocab)):
        r = cnt3[word, bigram[1], bigram[0]]
        if r>0:
            dr = prob_discount(r, trigram_distr)
            prob3_sum += dr * prob3(bigram, word)
            prob2_sum += prob2_Katz(bigram[1], word)
    return (1.0 - prob3_sum) / (1.0 - prob2_sum)

# Trigram Katz's back-off probobility
def prob3_Katz(bigram, target, cnt3=cnt3):
    r = cnt3[target, bigram[1], bigram[0]]
    if r>0:
        return prob_discount(r, trigram_distr) * prob3(bigram, target)
    else:
        return backoff3(bigram) * prob2_Katz(bigram[1], target)

In [5]:
def predict_step(pre_words):
    prob3_max = 0.0
    word_max = ''
    for word in range(len(vocab)):
        this_prob3 = prob3_Katz(pre_words, word)
        if prob3_max < this_prob3 or prob3_max == 0.0:
            prob3_max = this_prob3
            word_max = word
    return word_max

# Predict the sentence by finding the max likelihood word.
def predict_max(starting, cnt2=cnt2, cnt3=cnt3):
    list_of_words = starting
    while list_of_words[-1]!='.' and len(list_of_words)<15:
        word_max = predict_step([list_of_words[-1], list_of_words[-2]])
        list_of_words.append(word_max)
    return [vocab_reverse[idx+1] for idx in list_of_words]

sent = predict_max([vocab_idx('we'), vocab_idx('are')])
assert sent[-1] == '.' or len(sent) <= 15
print(' '.join(sent))

we are going nt to know do this nt is it not 's the not


In [6]:
def beam_search_step(pre_words, beam_size):
    prob3_beam = [0.0]*beam_size
    words_beam = ['']*beam_size
    for word in range(len(vocab)):
        this_prob3 = prob3_Katz(pre_words, word)
        for idx in range(beam_size):
            if prob3_beam[idx] < this_prob3:
                prob3_temp = prob3_beam[idx]
                prob3_beam[idx] = this_prob3
                this_prob3 = prob3_temp
                word_temp = words_beam[idx]
                words_beam[idx] = word
                word = word_temp
            elif prob3_beam[idx] == 0.0:
                prob3_beam[idx] = this_prob3
                words_beam[idx] = word
            else:
                continue
    return words_beam, prob3_beam

# Predict the sentence by finding the beam search word.
def predict_beam(bigram, beam_size=4, sent_length=15, cnt2=cnt2, cnt3=cnt3):
    list_of_sentence = [bigram]
    list_of_prob_n = [0.0]
    while max([len(sentence) for sentence in list_of_sentence])<sent_length:
        total_prob_beam = []
        total_words_beam = []
        for sentence, prob_n in zip(list_of_sentence, list_of_prob_n):
            if sentence[-1]!='.':
                words_beam, prob_beam = beam_search_step([sentence[-1], sentence[-2]], beam_size)
                total_prob_beam += [prob_n + np.log(prob + 1e-16) for prob in prob_beam]
                total_words_beam += [sentence + [word] for word in words_beam]
            else:
                total_prob_beam += [prob_n]
                total_words_beam += [sentence]
        
        total_beam = zip(total_words_beam, total_prob_beam)
        total_beam=[(words, prob) for words, prob in zip(total_words_beam, total_prob_beam)]
        total_beam.sort(key = lambda t: t[1], reverse=True)
        
        #print('Sentence:', max([len(sentence) for sentence in list_of_sentence]))
        if len(list_of_sentence)<beam_size:
            list_of_sentence = [bigram]*beam_size
            list_of_prob_n = [0.0]*beam_size
        for idx in range(beam_size):
            sentence, prob_n = total_beam[idx]
            list_of_sentence[idx] = sentence
            list_of_prob_n[idx] = prob_n
            #print(prob_n, ' '.join(vocab_reverse[idx+1] for idx in sentence))
    return [[vocab_reverse[idx+1] for idx in sentence] for sentence in list_of_sentence]

for sent in predict_beam([vocab_idx('we'), vocab_idx('are')], beam_size=16):
    assert sent[-1] == '.' or len(sent) <= 15
    print(' '.join(sent))

we are going nt to work do i nt do want you to i do
we are going nt to work do i nt do want you to he do
we are going nt to work do i nt do want you to but do
we are going nt to work do i nt do want you to but me
we are going nt to work do i nt do want you to it be
we are going nt to work do i nt do know you what do you
we are going nt to know do this you is are it you 's and
we are going nt to know do this you is know that us 's against
we are going nt to work do i nt do know you what want 's
we are going nt to work do i nt do know you what want they
we are going nt to work do i nt do know you what do i
we are going nt to work do i nt do know you what are we
we are going nt to work do i nt do know you what do they
we are going nt to work do i nt do know you what want is
we are going nt to work do i nt do know you what want do
we are going nt to work do i nt do want you to but what
