In [2]:
from collections import Counter
import math
def corpus_reader(path, limit_bytes=2*1024*1024*1024):
    """Yield stripped, non-empty lines from a UTF-8 text file up to a byte budget.

    Each line's size is measured as the length of its UTF-8 re-encoding.
    Reading stops before the first line that would push the running total
    past ``limit_bytes``; that line is not yielded. Blank lines count
    towards the budget but are not yielded.

    Args:
        path: Path of the text file to read.
        limit_bytes: Maximum number of (re-encoded) bytes to consume.

    Yields:
        Each non-blank line with surrounding whitespace removed.
    """
    consumed = 0
    with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            size = len(raw_line.encode('utf-8'))
            if consumed + size > limit_bytes:
                # Budget exhausted: end the generator (the file is closed by `with`).
                return
            consumed += size
            text = raw_line.strip()
            if not text:
                continue
            yield text
def tokenize(s):
    """Split ``s`` on runs of whitespace and return the resulting list of tokens."""
    tokens = s.split()
    return tokens
def prepare_tokens(s):
    """Tokenize ``s`` and wrap the tokens in ``<s>`` / ``</s>`` boundary markers.

    Returns:
        A new list: ``'<s>'``, then the tokens produced by ``tokenize(s)``,
        then ``'</s>'``.
    """
    return ['<s>', *tokenize(s), '</s>']
def build_ngrams(tokens, n):
    """Return every contiguous n-gram of ``tokens`` as a list of tuples.

    Args:
        tokens: A sequence supporting ``len()`` and slicing (e.g. a list).
        n: Size of each n-gram; must be at least 1.

    Returns:
        A list of ``len(tokens) - n + 1`` tuples in left-to-right order, or an
        empty list when ``tokens`` has fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this check, ``n == 0``
            would produce a run of empty tuples and a negative ``n`` would
            produce truncated slices, both of which would silently pollute
            any counts built from the result.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))
    last_start = len(tokens) - n
    return [tuple(tokens[start:start + n]) for start in range(last_start + 1)]
def build_counts(corpus, n):
    """Count the n-grams over every sentence in ``corpus``.

    Each sentence is passed through ``prepare_tokens`` (which adds the
    boundary markers) before its n-grams are extracted with ``build_ngrams``.

    Args:
        corpus: Iterable of sentence strings; it is consumed once.
        n: N-gram size forwarded to ``build_ngrams``.

    Returns:
        A ``Counter`` mapping n-gram tuples to their number of occurrences.
    """
    counts = Counter()
    for sentence in corpus:
        counts.update(build_ngrams(prepare_tokens(sentence), n))
    return counts
def vocab_from_counts(unigram_counts):
    """Return the set of first elements of the keys of ``unigram_counts``.

    For a mapping keyed by 1-tuples ``(word,)`` this is the set of distinct
    words.
    """
    return {key[0] for key in unigram_counts}
def prob_unigram_addone(w, uni, V, total):
    """Return the add-one smoothed unigram probability of ``w``.

    Computes ``(count(w) + 1) / (total + V)``.

    Args:
        w: The word to score.
        uni: Unigram counts, indexed with the 1-tuple ``(w,)``. It is indexed
            directly, so it should yield 0 for unseen words (as a ``Counter``
            does).
        V: Value added to the denominator (the vocabulary size).
        total: Total token count used in the denominator.
    """
    numerator = uni[(w,)] + 1
    denominator = total + V
    return numerator / denominator
def prob_unigram_addk(w, uni, V, total, k):
    """Return the add-k smoothed unigram probability of ``w``.

    Computes ``(count(w) + k) / (total + k * V)``.

    Args:
        w: The word to score.
        uni: Unigram counts, indexed with the 1-tuple ``(w,)``. It is indexed
            directly, so it should yield 0 for unseen words (as a ``Counter``
            does).
        V: Vocabulary size; scaled by ``k`` in the denominator.
        total: Total token count used in the denominator.
        k: Smoothing constant added to every count.
    """
    numerator = uni[(w,)] + k
    denominator = total + k * V
    return numerator / denominator
def prob_unigram_tokentype(w, uni, token_types, total):
    """Return the unigram probability of ``w`` smoothed by a token-type count.

    Computes ``(count(w) + 1) / (total + token_types)``.

    Args:
        w: The word to score.
        uni: Unigram counts, indexed with the 1-tuple ``(w,)``. It is indexed
            directly, so it should yield 0 for unseen words (as a ``Counter``
            does).
        token_types: Value added to the denominator.
        total: Total token count used in the denominator.
    """
    numerator = uni[(w,)] + 1
    denominator = total + token_types
    return numerator / denominator
def sentence_logprob_unigram(sentence, uni, V, total, method='addone', k=0.5, token_types=None):
    """Return the natural-log probability of ``sentence`` under a unigram model.

    The sentence is expanded with ``prepare_tokens`` (so the boundary markers
    are scored too) and the log-probabilities of its tokens are summed.

    Args:
        sentence: Raw sentence string.
        uni: Unigram counts keyed by 1-tuples.
        V: Vocabulary size.
        total: Total token count.
        method: ``'addone'`` or ``'addk'``; any other value selects the
            token-type estimate.
        k: Smoothing constant, used only when ``method == 'addk'``.
        token_types: Denominator term for the token-type estimate; ``V`` is
            used when this is ``None``.

    Returns:
        The summed log-probability as a float, or ``float('-inf')`` if any
        token has probability 0 (possible with ``method='addk'`` and ``k=0``
        for an unseen word), instead of letting ``math.log`` raise.
    """
    # The smoothing method cannot change between tokens, so resolve it once.
    if method == 'addone':
        def token_prob(w):
            return prob_unigram_addone(w, uni, V, total)
    elif method == 'addk':
        def token_prob(w):
            return prob_unigram_addk(w, uni, V, total, k)
    else:
        types = V if token_types is None else token_types

        def token_prob(w):
            return prob_unigram_tokentype(w, uni, types, total)

    logp = 0.0
    for w in prepare_tokens(sentence):
        p = token_prob(w)
        if p == 0:
            # log(0) is undefined for math.log; the sentence is impossible
            # under this model, so its log-probability is -infinity.
            return float('-inf')
        logp += math.log(p)
    return logp
def evaluate(sentences_path, uni, V, total, out_path, k=0.5):
    """Score each non-blank line of a file and write the results as TSV.

    Writes a header row, then one row per sentence containing the sentence
    (with tabs replaced by spaces so the columns stay aligned) and its
    log-probability under the 'addone', 'addk' and 'tokentype' methods of
    ``sentence_logprob_unigram``.

    Args:
        sentences_path: Input text file, one sentence per line.
        uni: Unigram counts keyed by 1-tuples.
        V: Vocabulary size.
        total: Total token count.
        out_path: Destination TSV file; it is overwritten.
        k: Smoothing constant for the 'addk' column.
    """
    header = 'sentence\tlogprob_addone\tlogprob_addk\tlogprob_tokentype\n'
    with open(sentences_path, 'r', encoding='utf-8', errors='ignore') as src, \
            open(out_path, 'w', encoding='utf-8') as dst:
        dst.write(header)
        for raw_line in src:
            sentence = raw_line.strip()
            if sentence:
                # NOTE(review): token_types=V gives prob_unigram_tokentype the
                # same formula inputs as prob_unigram_addone, so the last
                # column equals the first -- confirm this is intended.
                scores = (
                    sentence_logprob_unigram(sentence, uni, V, total, 'addone'),
                    sentence_logprob_unigram(sentence, uni, V, total, 'addk', k=k),
                    sentence_logprob_unigram(sentence, uni, V, total, 'tokentype', token_types=V),
                )
                fields = [sentence.replace('\t', ' ')]
                fields.extend(str(score) for score in scores)
                dst.write('\t'.join(fields) + '\n')
# Run configuration.
corpus_path = '/home/deepakchalla/Desktop/NLP/Lab1/tokenized_output.txt'
sentences_path = 'sentences.txt'
limit_bytes = 0.5 * 1024**3  # read at most 0.5 GiB of the corpus
k = 0.5  # constant for the add-k column

# Build unigram counts from the (byte-limited) corpus stream, then derive the
# vocabulary size and total token count that the smoothing formulas need.
uni = build_counts(corpus_reader(corpus_path, limit_bytes), 1)
V = len(vocab_from_counts(uni))
total = sum(uni.values())

# Score the evaluation sentences and write one TSV row per sentence.
evaluate(sentences_path, uni, V, total, 'results_unigram.tsv', k=k)
print('done')


done
