In [1]:
import html
import json
import re
import requests
import os

from tqdm import tqdm

In [2]:
def get_play_names(timeout=30):
    """Fetch the names (used as ids) of all plays in the RusDraCor corpus.

    Sends a GET request to the DraCor API corpus endpoint and collects the
    "name" field of every entry under the "dramas" key of the JSON response.

    :arg timeout - (float or tuple) seconds to wait for the server; without
    it a stalled connection would block the notebook indefinitely
    :returns play_names - (list of str) ids for all the plays currently present in the
    corpus; an empty list if the server answers with an error status
    :raises requests.exceptions.Timeout - if the server does not answer in time
    """
    request_url = "https://dracor.org/api/corpora/rus"
    response = requests.get(request_url, timeout=timeout)
    # A Response object is falsy for 4xx/5xx status codes; keep the original
    # best-effort behaviour of reporting "no plays" in that case.
    if not response:
        return []
    return [play["name"] for play in response.json()["dramas"]]

In [3]:
# Fetch the play ids once; the download loop further down iterates over this list.
play_names = get_play_names()

In [12]:
def get_play_text_by_type(play_name, text_type, timeout=30):
    """Uses an API request to get spoken/stage direction text.

    :arg play_name (str) - name of a play to work with
    :arg text_type (str) - "spoken" or "stage"
    :arg timeout (float or tuple) - seconds to wait for the server; without
    it a stalled connection would block the notebook indefinitely

    :returns text_type_lines (list of str) - strings of texts of a given type
    to work with later; an empty list if the server answers with an error status
    :raises ValueError - if text_type is neither "spoken" nor "stage"
    :raises requests.exceptions.Timeout - if the server does not answer in time
    """
    if text_type == "spoken":
        request_url = "https://dracor.org/api/corpora/rus/play/{}/spoken-text".format(play_name)
    elif text_type == "stage":
        request_url = "https://dracor.org/api/corpora/rus/play/{}/stage-directions".format(play_name)
    else:
        # Previously an unknown type fell through and crashed on an unbound
        # request_url; fail early with a message that names the bad value.
        raise ValueError(
            'text_type must be "spoken" or "stage", got {!r}'.format(text_type)
        )
    response = requests.get(request_url, timeout=timeout)
    # A Response object is falsy for 4xx/5xx status codes.
    if not response:
        return []
    return response.text.split("\n")

In [18]:
# Output directories: one for stage directions, one for spoken text.
stage_folder, spoken_folder = "./data/stage", "./data/spoken"

In [22]:
# Download both text layers of every play and store each one as a UTF-8
# file named after the play id inside the folder for its text type.
text_type_folders = (("stage", stage_folder), ("spoken", spoken_folder))

# open(..., "w") does not create missing directories, so on a fresh checkout
# the very first write would fail; make sure the target folders exist.
for _, folder in text_type_folders:
    os.makedirs(folder, exist_ok=True)

for play in tqdm(play_names):
    for text_type, folder in text_type_folders:
        play_lines = get_play_text_by_type(play, text_type)
        play_path = folder + "/{}.txt".format(play)
        with open(play_path, "w", encoding="utf-8") as f:
            f.write("\n".join(play_lines))

100%|██████████| 210/210 [03:45<00:00,  1.07s/it]
