In [1]:
import torch
torch.__version__

'1.8.1'

In [2]:
xlmr = torch.hub.load('pytorch/fairseq:main', 'xlmr.large')

Using cache found in /home/ke.na/.cache/torch/hub/pytorch_fairseq_main
2021-12-11 07:50:29 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2021-12-11 07:50:29 | INFO | fairseq.file_utils | loading archive file http://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz from cache at /home/ke.na/.cache/torch/pytorch_fairseq/3f864e15bb396f062dd37494309dbc4238416edd1f8ef293df18b1424813f2fe.cf46c7deb6b9eaa3e47c17b9fc181669c52bc639c165fbc69166a61487662ac9
2021-12-11 07:50:35 | INFO | fairseq.tasks.multilingual_masked_lm | dictionary: 250001 types


In [3]:
xlmr.eval()

RobertaHubInterface(
  (model): RobertaModel(
    (encoder): RobertaEncoder(
      (sentence_encoder): TransformerEncoder(
        (dropout_module): FairseqDropout()
        (embed_tokens): Embedding(250002, 1024, padding_idx=1)
        (embed_positions): LearnedPositionalEmbedding(514, 1024, padding_idx=1)
        (layernorm_embedding): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0): TransformerEncoderLayerBase(
            (self_attn): MultiheadAttention(
              (dropout_module): FairseqDropout()
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (d

In [8]:
def get_perplexity(sentence):
    indices = xlmr.encode(sentence).view(1, -1)    # [B=1 x T]
    feats = xlmr.extract_features(indices)         # [B=1 x T x D]
    scores = xlmr.model.encoder.lm_head(feats)        # [B=1 x T x V]
    log_probs = torch.log_softmax(scores, dim=-1)  # [B=1 x T x V]
    lop_probs = log_probs.gather(dim=2, index=indices.unsqueeze(2)) # [B x T x 1]
    log_prob = lop_probs.squeeze(2).sum(dim=1)    # [B=1 x T] -> [B=1]
    return torch.exp(log_prob).item()

In [9]:
# test
sents = ["this is a good sentence",
         "*%&^( some *^(*&)(^&*(^% gibberish &^$^%$^&*&"]
for sent in sents:
    print('%.6f' % get_perplexity(sent), sent)

0.996101 this is a good sentence
0.000000 *%&^( some *^(*&)(^&*(^% gibberish &^$^%$^&*&


##  Load in our dataset

In [10]:
# read in data
article = []
# titles
titles = []

import os
for root, dirs, files in os.walk("./data/news_text"):
    for file in files:
        if file.endswith(".txt"):
            print(os.path.join(root, file))
            with open(os.path.join(root, file), "r") as input:
                title = input.readline()
                sentences = input.read()
            article.append(sentences)
            titles.append(title)
            
print(len(article))
print(len(titles))

./data/news_text/010.txt
./data/news_text/002.txt
./data/news_text/006.txt
./data/news_text/005.txt
./data/news_text/001.txt
./data/news_text/008.txt
./data/news_text/004.txt
./data/news_text/003.txt
./data/news_text/007.txt
./data/news_text/009.txt
10
10


In [11]:
# concatenate text
text = ''.join(article)

len(text)

24533

In [12]:
# remove reference mark, e.g. '[1]'

import re

text = re.sub(r'\[\d{1,3}\]' ,'', text)
len(text)

24533

In [13]:
# using nltk, split text into sentences

from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(text)

len(sentences)

201

In [14]:
p_score =[]
for s in sentences:
    p_score.append(get_perplexity(s))
    
len(p_score)

201

In [21]:
import numpy as np

# get top 5
idx_sort = np.argsort(p_score)
top_5 = np.flip(idx_sort[-5:])
# min 5
min_5 = idx_sort[:5]

print('TOP5:{} -> Value:{}\n'.format(top_5,[p_score[i] for i in top_5]))
for i in top_5:
    print(sentences[i])
print('')
print('MIN5:{} -> Value:{}\n'.format(min_5, [p_score[i] for i in min_5]))
for i in min_5:
    print(sentences[i])

TOP5:[108  27  32  54  67] -> Value:[0.9996752142906189, 0.9994009733200073, 0.9993964433670044, 0.9993668794631958, 0.9993476271629333]

"What I think is shocking to the general public is that these things go on in the House of Commons."
It provides the public with a right of access to information held by about 100,000 public bodies, subject to various exemptions.
Finally, a deal to turn it into a sport and entertainment venue was struck.
"This is a big change in our political culture."
Prime Minister Tony Blair has yet to name the date of the election, but most pundits are betting on 5 May.

MIN5:[ 17 158  53 113 179] -> Value:[1.0729582555768502e-07, 5.489170575856406e-07, 5.9536956541705877e-05, 9.817061072681099e-05, 0.00028531308635137975]

But a Cabinet Office spokeswoman said the move was not about the new laws or "the destruction of important records".
Mr Blair said that whether the public chose Michael Howard or Mr Kennedy, it would result in "a Tory government not a Labour g