In [9]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import spacy

In [10]:
# Index of the GPU this notebook should run on.
device_id = 4

# Only touch the CUDA API when a GPU is actually present: torch.cuda.set_device()
# rejects a CPU device, and the memory/name queries below are CUDA-only, so the
# unconditional calls failed on CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.set_device(torch.device(f'cuda:{device_id}'))
    print ('Cuda device %s | %s | %s/%sGB' % (
        torch.cuda.current_device(),
        torch.cuda.get_device_name(device_id),
        round(torch.cuda.memory_allocated(device_id)/1024**3,1),
        round(torch.cuda.memory_reserved(device_id)/1024**3,1),
    ))
else:
    print('CUDA is not available; running on CPU')

Cuda device 4 | NVIDIA RTX A6000 | 0.0/0.0GB


## Sentence and keyword (for baselines) tokenization

In [11]:
# Load the spaCy pipeline that is used below to split each article's text into sentences.
nlp = spacy.load("en_core_web_lg")

In [48]:
# File-name prefix of the dataset: input is read from "<DATASET_NAME>_raw.json" and
# every output file written by this notebook starts with the same prefix.
DATASET_NAME = 'CaseStudy' #raw file name 

In [13]:
# Read the raw articles; the file name is derived from the dataset prefix.
article_df = pd.read_json(f"{DATASET_NAME}_raw.json")

In [14]:
# Discard articles that lack a body or a title (both are needed to build the sentence list).
# Note: the index is not reset here, so row labels may have gaps after this step.
article_df.dropna(subset=['text','title'],inplace=True)

In [17]:
# Positional rename: the list must name every column of the frame, in the frame's column order.
article_df.columns = ['id', 'date', 'title', 'text', 'story', 'query'] # set corresponding column names. Drop 'story' or 'query' (used to collect stories) column if not available

In [18]:
# Each article's sentence list starts with its title as the single first entry.
title_only_lists = [[title] for title in article_df['title']]
article_df['sentences'] = title_only_lists
# Empty-string placeholder; the real counts are filled in once the text has been split.
article_df['sentence_counts'] = ""

In [19]:
# Split every article body into sentences, one list of sentence strings per article,
# in the same order as the rows of article_df.
# nlp.pipe() streams the texts through spaCy in batches instead of invoking the
# pipeline once per document.
# Spans with a single token (len(span) counts tokens) are dropped.
all_sentences = [
    [span.text for span in doc.sents if len(span) > 1]
    for doc in nlp.pipe(article_df['text'].values)
]

In [20]:
# Combine each title with its body sentences and record the resulting sentence count.
#
# The lists are matched to rows by position rather than by index label: after
# dropna() the index can contain gaps, so looking rows up with 0..n-1 labels would
# fail or touch the wrong rows. Rebuilding from 'title' (instead of appending to the
# existing 'sentences' value) also keeps this cell safe to re-run.
if len(all_sentences) != len(article_df):
    raise ValueError(
        "all_sentences has %d entries but article_df has %d rows"
        % (len(all_sentences), len(article_df))
    )

article_df['sentences'] = [
    [title] + body_sentences
    for title, body_sentences in zip(article_df['title'], all_sentences)
]
article_df['sentence_counts'] = [len(sents) for sents in article_df['sentences']]

In [40]:
# Sentence encoder used to embed every sentence of every article.
# The target device is passed explicitly so the cell also works on machines without
# a GPU; 'cuda' (no index) resolves to the currently selected CUDA device.
st_model = SentenceTransformer(
    'sentence-transformers/all-roberta-large-v1',
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
# SBERT: sentence-transformers/all-roberta-large-v1
# ST5: sentence-t5-large
#https://www.sbert.net/docs/pretrained_models.html

In [41]:
# Encode each article's sentence list into one embedding array per article.
# `embeddings` must stay positionally aligned with article_df, so a failed article
# gets a None placeholder (its position is also recorded in `errors`) instead of
# being skipped, which would shift every later embedding onto the wrong row.
embeddings = []
errors = []
for k, sentences in enumerate(article_df['sentences']):
    try:
        embeddings.append(st_model.encode(sentences))
    except Exception as e:
        errors.append(k)
        embeddings.append(None)
        print("error at", k, e)

    # Progress report after every 100 processed articles.
    if (k + 1) % 100 == 0:
        print(k + 1)

100
200
300
400
500
600
700
800
900
1000


In [42]:
# Store one embedding entry per article; the list is matched to the rows by position,
# so it must contain exactly one item per row of article_df.
article_df['sentence_embds'] = embeddings

In [22]:
# Keep only the first 10 characters of each date's string form.
article_df['date'] = [str(raw_date)[:10] for raw_date in article_df['date']]

In [23]:
# Sort the articles by the truncated date string (lexicographic order on that string).
article_df.sort_values(by=['date'],inplace=True)

In [24]:
# Renumber the rows 0..n-1 in the sorted order and discard the previous index.
article_df.reset_index(inplace= True, drop=True)

In [25]:
# Overwrite 'id' with the row position in the date-sorted frame.
article_df['id'] = article_df.index

## Masking

In [43]:
def masking(df, idx, num_sens = 50):
    """Pad or truncate one article's sentence embeddings to a fixed length.

    Args:
        df: frame whose 'sentence_embds' column holds a 2-D array of sentence
            embeddings (one row per sentence) for each article.
        idx: index label of the article to process.
        num_sens: number of sentence slots in the output.

    Returns:
        A tuple ``(padded, mask)``. ``padded`` has shape (num_sens, dim) and holds
        the first ``num_sens`` embeddings followed by zero rows. ``mask`` has shape
        (num_sens,) with 0 for slots holding a real sentence and 1 for padding.
    """
    # Take at most num_sens sentences of the selected article.
    sentence_vectors = torch.tensor(df.loc[idx, 'sentence_embds'][:num_sens])
    n_real = sentence_vectors.shape[0]
    embd_dim = sentence_vectors.shape[1]

    # Start from all-padding and overwrite the leading slots with the real data.
    padded = torch.zeros(num_sens, embd_dim)
    padded[:n_real, :] = sentence_vectors

    mask = torch.ones(num_sens)
    mask[:n_real] = 0

    return padded, mask

In [45]:
# Build the fixed-length (embedding, mask) pair for every article, in row order,
# then stack the pairs into two batch tensors whose first axis indexes articles.
masked = list(map(lambda row_label: masking(article_df, row_label), article_df.index))
masked_tensors = torch.stack([padded for padded, _ in masked])
masks = torch.stack([pad_mask for _, pad_mask in masked])

## Save to file

In [28]:
# Write the text-side columns to JSON (the embeddings are saved separately as tensors).
# Remove 'story' or 'query' from the list if not available.
output_columns = ['id','date','title','sentences','sentence_counts','story','query']
article_df[output_columns].to_json(f"{DATASET_NAME}_preprocessed.json")

In [46]:
# Persist the stacked, padded sentence embeddings.
torch.save(masked_tensors, f"{DATASET_NAME}_masked_embds.pt")

In [47]:
# Persist the padding masks that accompany the embeddings.
torch.save(masks, f"{DATASET_NAME}_masks.pt")