In [1]:
from collections import Counter
import math
def corpus_reader(path, limit_bytes=2 * 1024 * 1024 * 1024):
    """Lazily yield the stripped, non-empty lines of a UTF-8 text file.

    Reading stops at the first line whose UTF-8 encoded size would push the
    running total above ``limit_bytes``; that line is not yielded. Blank
    lines still count toward the byte budget but are skipped in the output.
    Undecodable bytes are dropped by the file decoder (``errors='ignore'``).

    Args:
        path: Path of the text file to read.
        limit_bytes: Maximum number of (re-encoded) bytes to consume.

    Yields:
        Each line with surrounding whitespace removed, if anything remains.
    """
    consumed = 0
    with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw in handle:
            # Size is measured on the decoded line re-encoded as UTF-8.
            consumed += len(raw.encode('utf-8'))
            if consumed > limit_bytes:
                break
            text = raw.strip()
            if not text:
                continue
            yield text
def tokenize(s):
    """Split ``s`` on runs of whitespace and return the list of tokens."""
    tokens = s.split()
    return tokens
def prepare_tokens(s):
    """Tokenize ``s`` and wrap the tokens in ``<s>`` / ``</s>`` markers.

    Returns:
        A new list: ``'<s>'``, the tokens produced by ``tokenize(s)``, then
        ``'</s>'``.
    """
    return ['<s>', *tokenize(s), '</s>']
def build_ngrams(tokens, n):
    """Return every contiguous window of ``n`` tokens as a tuple, in order.

    Args:
        tokens: Sliceable sequence of tokens.
        n: Window length.

    Returns:
        A list of ``n``-tuples; empty when ``tokens`` has fewer than ``n``
        items.
    """
    # Walk the exclusive end index of each window rather than its start.
    return [
        tuple(tokens[stop - n:stop])
        for stop in range(n, len(tokens) + 1)
    ]
def build_counts(corpus, n):
    """Count the ``n``-grams of every sentence in ``corpus``.

    Each sentence is passed through ``prepare_tokens`` (which adds the
    boundary markers) and then ``build_ngrams``.

    Args:
        corpus: Iterable of sentence strings; it is consumed once.
        n: N-gram order.

    Returns:
        A ``Counter`` mapping each n-gram tuple to its frequency.
    """
    return Counter(
        gram
        for sentence in corpus
        for gram in build_ngrams(prepare_tokens(sentence), n)
    )
def vocab_from_counts(unigram_counts):
    """Return the set of first elements taken from each key of ``unigram_counts``.

    With unigram counts keyed by 1-tuples this is the set of distinct words.
    """
    return {key[0] for key in unigram_counts}
def prob_bigram_addone(bg, bi, uni, V):
    """Add-one smoothed probability of the n-gram ``bg``.

    Computes ``(count(bg) + 1) / (count(history) + V)``, where the history
    is ``bg`` without its last element.

    Args:
        bg: N-gram tuple to score.
        bi: Counts indexed by the full n-gram tuple (e.g. a ``Counter``, so
            unseen keys count as 0).
        uni: Counts indexed by the history tuple.
        V: Value added to the denominator (the caller passes vocabulary size).
    """
    history = bg[:-1]
    numerator = bi[bg] + 1
    denominator = uni[history] + V
    return numerator / denominator
def prob_bigram_addk(bg, bi, uni, V, k):
    """Add-k smoothed probability of the n-gram ``bg``.

    Computes ``(count(bg) + k) / (count(history) + k * V)``, where the
    history is ``bg`` without its last element.

    Args:
        bg: N-gram tuple to score.
        bi: Counts indexed by the full n-gram tuple (e.g. a ``Counter``, so
            unseen keys count as 0).
        uni: Counts indexed by the history tuple.
        V: Multiplier for ``k`` in the denominator (the caller passes
            vocabulary size).
        k: Pseudo-count added to the n-gram count.
    """
    history = bg[:-1]
    numerator = bi[bg] + k
    denominator = uni[history] + k * V
    return numerator / denominator
def prob_bigram_tokentype(bg, bi, uni, token_types):
    """Smoothed probability of ``bg`` with ``token_types`` in the denominator.

    Computes ``(count(bg) + 1) / (count(history) + token_types)``, where the
    history is ``bg`` without its last element.

    Args:
        bg: N-gram tuple to score.
        bi: Counts indexed by the full n-gram tuple (e.g. a ``Counter``, so
            unseen keys count as 0).
        uni: Counts indexed by the history tuple.
        token_types: Value added to the history count in the denominator.
    """
    history = bg[:-1]
    numerator = bi[bg] + 1
    denominator = uni[history] + token_types
    return numerator / denominator
def sentence_logprob_bigram(sentence, bi, uni, V, method='addone', k=0.5, token_types=None):
    """Return the natural-log probability of ``sentence`` under a smoothed bigram model.

    The sentence is wrapped with boundary markers by ``prepare_tokens``,
    split into bigrams, and the log of each bigram's smoothed probability is
    accumulated.

    Args:
        sentence: Raw sentence string.
        bi: Bigram counts indexed by 2-tuples.
        uni: Unigram counts indexed by 1-tuples.
        V: Vocabulary size passed to the estimators.
        method: ``'addone'`` or ``'addk'``; any other value selects the
            token-type estimator.
        k: Pseudo-count, used only when ``method == 'addk'``.
        token_types: Denominator term for the token-type estimator; ``V`` is
            used when this is ``None``.

    Returns:
        Sum of ``math.log`` over the bigram probabilities, as a float.
    """
    # Pick the estimator once so the scoring loop stays branch-free.
    if method == 'addone':
        def estimate(gram):
            return prob_bigram_addone(gram, bi, uni, V)
    elif method == 'addk':
        def estimate(gram):
            return prob_bigram_addk(gram, bi, uni, V, k)
    else:
        type_count = V if token_types is None else token_types

        def estimate(gram):
            return prob_bigram_tokentype(gram, bi, uni, type_count)

    total = 0.0
    for gram in build_ngrams(prepare_tokens(sentence), 2):
        total += math.log(estimate(gram))
    return total
def evaluate(sentences_path, bi, uni, V, out_path, k=0.5):
    """Score each non-blank line of ``sentences_path`` and write a TSV report.

    The output starts with a header row, followed by one row per sentence
    holding the sentence (tabs replaced by spaces) and its log-probability
    under the add-one, add-k and token-type estimators.

    Args:
        sentences_path: Input text file with one sentence per line.
        bi: Bigram counts.
        uni: Unigram counts.
        V: Vocabulary size.
        out_path: Destination TSV path; it is opened for writing.
        k: Pseudo-count for the add-k column.
    """
    header = 'sentence\tlogprob_addone\tlogprob_addk\tlogprob_tokentype\n'
    with open(sentences_path, 'r', encoding='utf-8', errors='ignore') as src, \
            open(out_path, 'w', encoding='utf-8') as dst:
        dst.write(header)
        for raw in src:
            sentence = raw.strip()
            if sentence:
                # NOTE(review): token_types=V makes the token-type column
                # use the same formula as add-one -- confirm this is intended.
                scores = (
                    sentence_logprob_bigram(sentence, bi, uni, V, 'addone'),
                    sentence_logprob_bigram(sentence, bi, uni, V, 'addk', k=k),
                    sentence_logprob_bigram(sentence, bi, uni, V, 'tokentype', token_types=V),
                )
                # Tabs inside the sentence would break the column layout.
                fields = [sentence.replace('\t', ' ')]
                fields.extend(str(score) for score in scores)
                dst.write('\t'.join(fields) + '\n')
# Input locations and hyper-parameters for this run.
corpus_path='/home/deepakchalla/Desktop/NLP/Lab1/tokenized_output.txt'
sentences_path='sentences.txt'
limit_bytes = 512 * 1024 * 1024  # 0.5 GiB, expressed as an integer byte count
k = 0.5

# Collect unigram and bigram counts in a single pass over the corpus, so the
# file is read and each sentence tokenized once instead of once per n-gram
# order. The resulting counters match two separate build_counts calls.
uni = Counter()
bi = Counter()
for _sentence in corpus_reader(corpus_path, limit_bytes):
    _tokens = prepare_tokens(_sentence)
    uni.update(build_ngrams(_tokens, 1))
    bi.update(build_ngrams(_tokens, 2))

# Vocabulary size = number of distinct unigram types (boundary markers included).
V = len(vocab_from_counts(uni))
evaluate(sentences_path, bi, uni, V, 'results_bigram.tsv', k=k)
print('done')


done
