# TextRank Testing

Here, we explore the TextRank algorithm as implemented in pytextrank.

In [1]:
import spacy
import pytextrank
from datasets import load_dataset, Dataset

def convertToDictFormat(data):
    """Convert ``src [TAB] tgt [NEWLINE]`` lines into a Dataset suitable for model training.

    Args:
        data: iterable of strings, each holding a source sentence and a target
            sentence separated by a tab. Surrounding whitespace (including the
            trailing newline) is stripped before splitting.

    Returns:
        A ``Dataset`` with two equally long columns: ``"en"`` (the text before
        the first tab) and ``"fr"`` (the text between the first and second tab).

    Raises:
        IndexError: if a non-blank line contains no tab once stripped.
    """
    source = []
    target = []
    for example in data:
        example = example.strip()
        if not example:
            # A blank line (e.g. a trailing newline in the corpus file) carries
            # no sentence pair; skip it rather than fail on the missing field.
            continue
        sentences = example.split("\t")
        source.append(sentences[0])
        target.append(sentences[1])
    return Dataset.from_dict({"en": source, "fr": target})

In [2]:
#Load the "train" split, then turn its tab-separated 'text' lines into en/fr columns
train_data = load_dataset(
    "ethansimrm/wmt_16_19_22_biomed_train_processed",
    split="train",
)
train_text_lines = train_data["text"]
train_data_ready = convertToDictFormat(train_text_lines)

Found cached dataset text (C:/Users/ethan/.cache/huggingface/datasets/ethansimrm___text/ethansimrm--wmt_16_19_22_biomed_train_processed-8662b34233d7661e/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


In [116]:
#Keep a seeded (hence repeatable) 10% random subset of the sentence pairs
sampled_splits = train_data_ready.train_test_split(train_size=0.1, seed=42)
train_sampled = sampled_splits["train"]

In [122]:
#The corpus was sampled down to 10% above (a random sample a la Ailem) due to space/efficiency issues.
#Building a graph is very expensive, so only the first N_SENTENCES French sentences of that sample are joined here.
from tqdm import tqdm
N_SENTENCES = 50 #Number of sampled sentences concatenated into the text given to TextRank
#Slice the rows first and pick the column second, so only N_SENTENCES rows are read rather than the whole column.
#Each sentence is followed by a single space, exactly as before.
corpus = "".join(sent + " " for sent in tqdm(train_sampled[:N_SENTENCES]['fr']))

100%|██████████| 50/50 [00:00<00:00, 25016.72it/s]


In [70]:
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop #We will feed this into the stop words recogniser and the scrubber function
#spaCy ships STOP_WORDS as a set, whose iteration order can differ between kernel sessions.
#Sorting gives a stable order so that anything derived from this list is reproducible.
fr_stop_words = sorted(fr_stop)

In [63]:
#Pipeline instance used below to POS-tag the individual stop words (the textrank pipeline is loaded separately as `nlp`)
sw = spacy.load("fr_dep_news_trf")

In [88]:
#Build the {token text: [POS tags]} mapping that is passed to the textrank "stopwords" config.
#A token that turns up in several stop words (or with several tags) keeps every tag seen,
#instead of only the tag from whichever stop word happened to be processed last.
#NOTE(review): pytextrank describes the stopwords keys as lemmas; this uses the surface text - confirm that is intended.
stop_words = {}
for word_doc in sw.pipe(fr_stop_words): #Batched processing; docs are yielded in input order
    for tok in word_doc:
        pos_tags = stop_words.setdefault(tok.text, [])
        if tok.pos_ not in pos_tags:
            pos_tags.append(tok.pos_)

In [128]:
from spacy.tokens import Span
@spacy.registry.misc("prefix_scrubber")
def prefix_scrubber():
    """Factory registered under "prefix_scrubber"; returns the phrase scrubber function."""
    leading_pos = ("DET", "PRON")

    def scrubber_func(span: Span) -> str:
        """Return the span's text with its leading run of DET/PRON tokens removed."""
        n_dropped = 0
        for token in span:
            if token.pos_ not in leading_pos:
                break
            n_dropped += 1
        # Dropping the first token n_dropped times is the same as one slice.
        return span[n_dropped:].text

    return scrubber_func

In [129]:
#Pipeline that will run textrank over the sampled corpus, using our stop words and prefix scrubber
textrank_config = {
    "stopwords": stop_words,
    "scrubber": {"@misc": "prefix_scrubber"},
}
nlp = spacy.load("fr_dep_news_trf")
nlp.add_pipe("textrank", config=textrank_config)
#Author's note: the full 10% sample is ~7m chars.
#NOTE(review): that is more than this limit, so raise it before processing the whole sample.
nlp.max_length = 1_000_000

In [130]:
#Run the pipeline (including the textrank component added above) over the concatenated sample text.
#NOTE(review): confirm this model actually ships an "ner" component; if it does not, `disable` has no effect here.
doc = nlp(corpus, disable = ["ner"]) #Kick out NER to make it run faster

In [131]:
#Write one "<phrase text>\t<rank>" line per extracted phrase.
#The with-block guarantees the file is closed even if iterating the phrases raises part-way through.
with open("textrank_phrases_and_scores.txt", "w", encoding="utf-8") as output:
    for phrase in doc._.phrases:
        output.write(f"{phrase.text}\t{phrase.rank}\n")