In [None]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import spacy

In [None]:
# Index of the GPU to use when CUDA is available.
device_id = 4

if torch.cuda.is_available():
    # Make the chosen GPU the current CUDA device so that later un-indexed
    # "cuda" placements land on it.
    device = torch.device(f'cuda:{device_id}')
    torch.cuda.set_device(device)
    print ('Cuda device %s | %s | %s/%sGB' % (torch.cuda.current_device(), torch.cuda.get_device_name(device_id),round(torch.cuda.memory_allocated(device_id)/1024**3,1),round(torch.cuda.memory_reserved(device_id)/1024**3,1)))
else:
    # torch.cuda.set_device() only accepts CUDA devices and the memory/name
    # queries need a GPU, so none of them may be called on a CPU-only machine.
    device = torch.device('cpu')
    print('CUDA is not available; running on CPU')

In [None]:
# Load the spaCy pipeline once; the same `nlp` object is reused below both to
# tokenise titles and to split article bodies into sentences.
nlp = spacy.load("en_core_web_lg")

def spacy_tokenizer(doc):
    """Return the lower-cased lemmas of the informative tokens in ``doc``.

    ``doc`` is passed straight to the module-level ``nlp`` pipeline. A token is
    kept only if its text is purely alphanumeric and spaCy does not flag it as
    a stop word, punctuation, or number-like.

    Returns:
        list[str]: one lower-cased lemma per kept token, in document order.
    """
    lemmas = []
    for tok in nlp(doc):
        # Reject anything containing non-alphanumeric characters first.
        if not tok.text.isalnum():
            continue
        if tok.is_stop or tok.is_punct or tok.like_num:
            continue
        lemmas.append(tok.lemma_.lower())
    return lemmas

In [None]:
# Read the raw articles into a DataFrame.
# NOTE(review): INPUT_FILE_NAME is not defined in the visible cells — it must be
# set before this cell runs; confirm where it comes from.
article_df = pd.read_json(INPUT_FILE_NAME) 

In [None]:
# Discard, in place, every article that is missing its body or its headline.
# The surviving rows keep their original index labels, so the index may have
# gaps after this cell.
required_columns = ['text', 'title']
article_df.dropna(subset=required_columns, inplace=True)

In [None]:
# Canonical column names, applied positionally to the loaded frame.
# 'story' is optional: when the frame has only four columns the trailing
# 'story' name is left out automatically instead of requiring a manual edit.
# Any other column count still raises pandas' length-mismatch error.
expected_columns = ['id', 'date', 'title', 'text', 'story']
if len(article_df.columns) == len(expected_columns) - 1:
    expected_columns = expected_columns[:-1]
article_df.columns = expected_columns

In [None]:
# Seed the per-article sentence data with the headline: it becomes the first
# "sentence" of each article, and its lemmas become the first token list.
headlines = list(article_df['title'])
article_df['sentences'] = [[headline] for headline in headlines]
# Placeholder value; the real sentence count is filled in once the body
# sentences have been merged in a later cell.
article_df['sentence_counts'] = ""
article_df['sentence_tokens'] = [[spacy_tokenizer(headline)] for headline in headlines]

In [None]:
def _content_lemmas(span):
    """Lower-cased lemmas of the alphanumeric, non-stop, non-punctuation,
    non-number-like tokens in ``span`` (same filter as ``spacy_tokenizer``)."""
    return [
        tok.lemma_.lower()
        for tok in span
        if (tok.text.isalnum() and not tok.is_stop and not tok.is_punct and not tok.like_num)
    ]


# For every article body, collect the sentence strings and, in parallel, the
# filtered lemma list of each sentence. One-token sentences are skipped.
all_sentences = []
all_sentence_tokens = []
for body in article_df['text'].values:
    kept_spans = [sent for sent in nlp(body).sents if len(sent) > 1]
    all_sentences.append([sent.text for sent in kept_spans])
    all_sentence_tokens.append([_content_lemmas(sent) for sent in kept_spans])

In [None]:
# Append each article's body sentences/tokens to the title entry already stored
# in the frame, then record the total sentence count.
#
# The merge is done positionally (row order), not by index label: the earlier
# dropna() leaves gaps in the index, so using 0..n-1 as labels would raise
# KeyError or touch the wrong row.
# NOTE: re-running this cell appends the body sentences a second time.
if not (len(all_sentences) == len(all_sentence_tokens) == len(article_df)):
    raise ValueError(
        "all_sentences / all_sentence_tokens are out of sync with article_df; "
        "re-run the sentence-splitting cell"
    )

merged_sentences = [
    title_part + body_part
    for title_part, body_part in zip(article_df['sentences'], all_sentences)
]
merged_tokens = [
    title_part + body_part
    for title_part, body_part in zip(article_df['sentence_tokens'], all_sentence_tokens)
]

article_df['sentences'] = merged_sentences
article_df['sentence_tokens'] = merged_tokens
article_df['sentence_counts'] = [len(sents) for sents in merged_sentences]

In [None]:
# Pretrained model list: https://www.sbert.net/docs/pretrained_models.html
# Place the model on the current CUDA device when a GPU is present and fall
# back to the CPU otherwise (an unconditional .cuda() fails without CUDA).
st_device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
st_model = SentenceTransformer('sentence-transformers/all-roberta-large-v1', device=st_device)

In [None]:
# Encode every article's sentence list with the sentence-transformer model.
# `embeddings` always gets exactly one entry per row so it stays aligned with
# article_df: a failed encode stores None as a placeholder instead of being
# skipped. `errors` holds the 0-based positions of the failed rows.
embeddings = []
errors = []
k = 0
for sentences in article_df['sentences']:
    try:
        embedding = st_model.encode(sentences)
    except Exception as e:
        # Best effort: report and keep going, but keep the slot for this row.
        embedding = None
        errors.append(k)
        print("error at", k, e)
    embeddings.append(embedding)

    k = k + 1
    # Lightweight progress indicator.
    if k % 100 == 0:
        print(k)

In [None]:
# Attach the per-article embeddings as a new column. The list is assigned
# positionally, so it must contain exactly one entry per row of article_df.
article_df['sentence_embds'] = embeddings

In [None]:
# Remove every article whose 'sentences' value contains at least one entry of
# noise_list. The matching index labels are collected first and dropped in a
# single call, rather than mutating the frame once per match while iterating.
# NOTE(review): noise_list is not defined in the visible cells — confirm where
# it is set before this cell runs.
noisy_labels = [
    idx
    for idx, row_sentences in article_df['sentences'].items()
    if any(n in row_sentences for n in noise_list)
]
article_df.drop(index=noisy_labels, inplace=True)

In [None]:
def _date_prefix(value):
    """First ten characters of ``str(value)``."""
    return str(value)[:10]


# Replace each date with the first ten characters of its string form.
article_df['date'] = list(map(_date_prefix, article_df['date']))

In [None]:
# Order the articles by their 'date' value, modifying the frame in place.
article_df.sort_values('date', inplace=True)

In [None]:
# Renumber the rows 0..n-1 in their current order and discard the old index.
article_df.reset_index(drop=True, inplace=True)

In [None]:
# Overwrite the 'id' column with the current index values.
article_df['id'] = article_df.index

In [None]:
# Write the processed frame out as JSON.
# NOTE(review): OUTPUT_FILE_NAME is not defined in the visible cells — confirm
# where it is set.
article_df.to_json(OUTPUT_FILE_NAME)