# Getting initial data

## 1 &emsp; API Interaction

In [1]:
import os

from dracor_api import get_play_names, play_metadata_api, download_directions
from file_work import save_directions

We first query the DraCor API (see the [API documentation](https://dracor.org/documentation/api/)); the helper functions that call it live in `dracor_api.py`.

If you run this notebook for the first time, run the following cell to create all necessary folders:

In [2]:
# Output folders for the stage directions and for the CSV datasets.
directions_folder = os.path.join("..", "api_data", "directions")
datasets_folder = os.path.join("..", "api_data", "csv")

# os.makedirs (unlike os.mkdir) also creates the missing "../api_data" parent,
# and exist_ok=True turns the call into a no-op when the cell is re-run.
os.makedirs(directions_folder, exist_ok=True)
os.makedirs(datasets_folder, exist_ok=True)

Directions extracted from the corpus will be stored in the `directions/raw` subfolder.

In [3]:
# Raw (unprocessed) stage directions get their own subfolder.
directions_raw_folder = os.path.join("..", "api_data", "directions", "raw")
# makedirs creates any missing parent folder and, with exist_ok=True,
# does nothing when the folder is already there.
os.makedirs(directions_raw_folder, exist_ok=True)

In [4]:
play_names = get_play_names()

# Column-oriented store: every list has to grow by exactly one entry per
# play, otherwise the columns no longer line up.
all_data = {
    "play_id": [],
    "title": [],
    "author": [],
    "characters": [],
    "segments": [],
}

for play_id in play_names:
    directions = download_directions(play_id)
    data = play_metadata_api(play_id)
    print(data)
    if data:
        # Record the id once per play. Appending it inside the field loop
        # below would add one copy per metadata field.
        all_data["play_id"].append(play_id)
        for key, value in data.items():
            all_data[key].append(value)
    if directions:
        # Plays without directions are skipped, so no file is written for them.
        save_directions(directions_raw_folder, play_id, directions)

{'title': 'Не убий', 'author': 'Андреев, Леонид Николаевич', 'characters': 23, 'segments': 6}
{'title': 'Мысль', 'author': 'Андреев, Леонид Николаевич', 'characters': 15, 'segments': 5}
{'title': 'Закат', 'author': 'Бабель, Исаак Эммануилович', 'characters': 36, 'segments': 8}
{'title': 'Мария', 'author': 'Бабель, Исаак Эммануилович', 'characters': 22, 'segments': 8}
{'title': 'Король на площади', 'author': 'Блок, Александр Александрович', 'characters': 29, 'segments': 21}
{'title': 'Незнакомка', 'author': 'Блок, Александр Александрович', 'characters': 34, 'segments': 3}
{'title': 'Балаганчик', 'author': 'Блок, Александр Александрович', 'characters': 16, 'segments': 2}
{'title': 'Последние дни (Пушкин)', 'author': 'Булгаков, Михаил Афанасьевич', 'characters': 36, 'segments': 10}
{'title': 'Иван Васильевич', 'author': 'Булгаков, Михаил Афанасьевич', 'characters': 16, 'segments': 3}
{'title': 'Кабала святош (Мольер)', 'author': 'Булгаков, Михаил Афанасьевич', 'characters': 24, 'segments'

{'title': 'Борис Годунов', 'author': 'Пушкин, Александр Сергеевич', 'characters': 79, 'segments': 22}
{'title': 'Русалка', 'author': 'Пушкин, Александр Сергеевич', 'characters': 18, 'segments': 5}
{'title': 'Моцарт и Сальери', 'author': 'Пушкин, Александр Сергеевич', 'characters': 2, 'segments': 1}
{'title': 'Каменный гость', 'author': 'Пушкин, Александр Сергеевич', 'characters': 11, 'segments': 10}
{'title': 'Скупой рыцарь', 'author': 'Пушкин, Александр Сергеевич', 'characters': 5, 'segments': 3}
{'title': 'Сцены из рыцарских времен', 'author': 'Пушкин, Александр Сергеевич', 'characters': 23, 'segments': 14}
{'title': 'Пир во время чумы', 'author': 'Пушкин, Александр Сергеевич', 'characters': 7, 'segments': 2}
{'title': 'Смерть Пазухина', 'author': 'Салтыков-Щедрин, Михаил Евграфович', 'characters': 15, 'segments': 32}
{'title': 'Димитрий Самозванец', 'author': 'Сумароков, Александр Петрович', 'characters': 6, 'segments': 7}
{'title': 'Семира', 'author': 'Сумароков, Александр Петрови

## 2 &emsp; Metadata retrieval

Some of the data is fetched from the RusDraCor API; the rest (e.g. the directions/lemma/wordform counts and their averages) is computed in this notebook.

In [5]:
from preprocessing import extract_lemmas, extract_pos, prepare_for_classification

The `metadata` request of the DraCor API returns a CSV file; we first copy it from the Downloads folder to `datasets_folder`.

In [6]:
from shutil import copyfile

In [7]:
# Location of the DraCor metadata export inside the datasets folder.
datasets_path = os.path.join(datasets_folder, "metadata.csv")

# When re-running this on your own machine, point `downloads_path` at your
# Downloads folder and uncomment both lines to copy the file into place:
# downloads_path = "/Users/dariamaximova/Downloads/metadata.csv"
# copyfile(downloads_path, datasets_path)

Now we are able to open the `metadata.csv` file as a Pandas dataframe to take a more detailed look at plays and information on them.

In [8]:
import pandas as pd

In [9]:
# Load the metadata export (comma-separated, UTF-8) and preview the first rows.
df = pd.read_csv(
    datasets_path,
    encoding="utf-8",
    sep=",",
)
df.head()

Unnamed: 0,name,genre,year,numOfSegments,numOfActs,numOfSpeakers,yearWritten,yearPremiered,yearPrinted,size,averageClustering,density,averagePathLength,averageDegree,diameter,maxDegree,maxDegreeIds,wikipediaLinkCount
0,andreyev-mysl,tragedy,1914,6,3,15,1913.0,1914.0,1914.0,15,0.810899,0.371429,1.914286,5.2,3,9,kerzhentsev,0
1,andreyev-ne-ubiy,,1913,6,5,23,1913.0,1913.0,1913.0,23,0.912104,0.588933,1.41502,12.956522,3,21,yakov|vasilisa_petrovna,0
2,babel-marija,,1935,8,0,22,,1964.0,1935.0,22,0.876271,0.320346,1.909091,6.727273,3,14,katya,0
3,babel-zakat,,1927,8,0,36,,1927.0,1928.0,36,0.896057,0.396825,1.666667,13.888889,3,31,mendel,0
4,blok-balaganchik,,1906,3,0,16,1906.0,1906.0,1906.0,16,0.942857,0.6,1.4,9.0,2,15,pero|arlekin,0


### 2.1 &emsp; Removing columns from other research

NaN values in `genre` column are replaced with their non-NaN analogue: `unknown`. The column is also converted to `category` type.

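The `year` column already gives us a single year per play instead of three different ones (`yearWritten`, `yearPremiered`, `yearPrinted`), so we can drop those. The graph-related columns (`size`, `averageClustering`, `density`, `averagePathLength`, `averageDegree`, `diameter`, `maxDegree`, `maxDegreeIds`) are dropped as well.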

In [10]:
# Missing genres become an explicit "unknown" label, and the column is
# stored as a categorical.
df["genre"] = df["genre"].fillna("unknown").astype("category")

# The three separate date columns are superseded by the single `year` column.
year_cols = ["yearWritten", "yearPremiered", "yearPrinted"]
# Graph-related columns that are not used in this notebook.
graph_cols = ["size", "averageClustering", "density", "averagePathLength",
              "averageDegree", "diameter", "maxDegree", "maxDegreeIds"]

# Reassign instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
df = df.drop(columns=year_cols + graph_cols, errors="ignore")
df.head()

Unnamed: 0,name,genre,year,numOfSegments,numOfActs,numOfSpeakers,wikipediaLinkCount
0,andreyev-mysl,tragedy,1914,6,3,15,0
1,andreyev-ne-ubiy,unknown,1913,6,5,23,0
2,babel-marija,unknown,1935,8,0,22,0
3,babel-zakat,unknown,1927,8,0,36,0
4,blok-balaganchik,unknown,1906,3,0,16,0


### 2.2 &emsp; Adding columns regarding directions

Now we have to add information regarding directions data we indicated earlier:

* directions/wordform/lemma/POS-tag count,
* wordform/lemma/POS-tag average per direction in each play

That's already seven new features!

In [11]:
from file_work import load_directions

Each type of directions processing (lemmas, POS tags, lemmas + POS) will be saved in a separate folder.

In [12]:
# One output folder per kind of processed directions.
directions_pos_folder = os.path.join("..", "api_data", "directions", "pos")
directions_lemmas_folder = os.path.join("..", "api_data", "directions", "lemmas")
directions_lemmaspos_folder = os.path.join("..", "api_data", "directions", "lemmaspos")

# os.makedirs creates any missing parent folder as well, and exist_ok=True
# makes the cell safe to re-run.
os.makedirs(directions_pos_folder, exist_ok=True)
os.makedirs(directions_lemmas_folder, exist_ok=True)
os.makedirs(directions_lemmaspos_folder, exist_ok=True)

For lemmatisation, we use Mystem, a rule-based morphological analyzer developed by Ilya Segalovich and maintained by Yandex. Its Python wrapper is `pymystem3`.

In [13]:
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from pymystem3 import Mystem
from statistics import mean

In [17]:
# Names of the per-play feature columns derived from the stage directions.
feature_names = (
    "directionsCount",
    "directionsTokens",
    "directionsTokensAverage",
    "directionsLemmas",
    "directionsLemmasAverage",
    "directionsPOS",
    "directionsPOSAverage",
)
# Every feature starts with its own empty list of values.
additional_features = {feature_name: [] for feature_name in feature_names}

# Shared Mystem analyser instance and the NLTK Russian stop-word list.
mystem = Mystem()
stops = stopwords.words("russian")

In [18]:
# Start from empty columns so that re-running this cell does not append a
# second copy of every value to the feature lists.
for values in additional_features.values():
    values.clear()

total_plays = len(df["name"])
for i, play_id in enumerate(df["name"]):
    directions_raw = load_directions(directions_raw_folder, play_id)
    directions_count = len(directions_raw)
    additional_features["directionsCount"].append(directions_count)

    # Tokens are counted over all directions of the play joined into one string.
    tokens = wordpunct_tokenize(" ".join(directions_raw))
    additional_features["directionsTokens"].append(len(tokens))
    # A play without directions has no defined average: record NaN instead of
    # dividing by zero.
    if directions_count:
        tokens_average = len(tokens) / directions_count
    else:
        tokens_average = float("nan")
    additional_features["directionsTokensAverage"].append(tokens_average)

    directions_lemmas = extract_lemmas(directions_raw, mystem)
    # Lengths are computed once and reused for both the total and the mean.
    lemma_lengths = [len(line) for line in directions_lemmas]
    additional_features["directionsLemmas"].append(sum(lemma_lengths))
    # statistics.mean raises StatisticsError on empty data, hence the guard.
    lemmas_mean = mean(lemma_lengths) if lemma_lengths else float("nan")
    additional_features["directionsLemmasAverage"].append(lemmas_mean)
    directions_lemmas_str = [" ".join(lemmas) for lemmas in directions_lemmas]
    save_directions(directions_lemmas_folder, play_id, directions_lemmas_str)

    directions_pos = extract_pos(directions_raw, mystem)
    pos_lengths = [len(pos_line) for pos_line in directions_pos]
    additional_features["directionsPOS"].append(sum(pos_lengths))
    pos_mean = mean(pos_lengths) if pos_lengths else float("nan")
    additional_features["directionsPOSAverage"].append(pos_mean)
    directions_pos_str = [" ".join(pos) for pos in directions_pos]
    save_directions(directions_pos_folder, play_id, directions_pos_str)

    print("Play {}, id: {}, ready: {:.2f}%".format(i+1, play_id, ((i+1)/total_plays)*100))

Play 1, id: andreyev-mysl, ready: 0.71%
Play 2, id: andreyev-ne-ubiy, ready: 1.42%
Play 3, id: babel-marija, ready: 2.13%
Play 4, id: babel-zakat, ready: 2.84%
Play 5, id: blok-balaganchik, ready: 3.55%
Play 6, id: blok-korol-na-ploschadi, ready: 4.26%
Play 7, id: blok-neznakomka, ready: 4.96%
Play 8, id: bulgakov-beg, ready: 5.67%
Play 9, id: bulgakov-dni-turbinyh, ready: 6.38%
Play 10, id: bulgakov-ivan-vasilevich, ready: 7.09%
Play 11, id: bulgakov-kabala-svjatosh, ready: 7.80%
Play 12, id: bulgakov-poloumnyj-zhurden, ready: 8.51%
Play 13, id: bulgakov-poslednie-dni, ready: 9.22%
Play 14, id: chekhov-chaika, ready: 9.93%
Play 15, id: chekhov-djadja-vanja, ready: 10.64%
Play 16, id: chekhov-ivanov, ready: 11.35%
Play 17, id: chekhov-jubilej, ready: 12.06%
Play 18, id: chekhov-leshii, ready: 12.77%
Play 19, id: chekhov-medved, ready: 13.48%
Play 20, id: chekhov-na-bolshoi-doroge, ready: 14.18%
Play 21, id: chekhov-noch-pered-sudom, ready: 14.89%
Play 22, id: chekhov-predlozhenie, read

In [19]:
# Attach every computed feature list to the dataframe as a column of the
# same name, then preview the result.
for column_name in additional_features:
    df[column_name] = additional_features[column_name]
df.head()

Unnamed: 0,name,genre,year,numOfSegments,numOfActs,numOfSpeakers,wikipediaLinkCount,directionsCount,directionsTokens,directionsTokensAverage,directionsLemmas,directionsLemmasAverage,directionsPOS,directionsPOSAverage
0,andreyev-mysl,tragedy,1914,6,3,15,0,233,2748,11.793991,1976,8.480687,1976,8.480687
1,andreyev-ne-ubiy,unknown,1913,6,5,23,0,341,4334,12.709677,3131,9.181818,3131,9.181818
2,babel-marija,unknown,1935,8,0,22,0,137,1632,11.912409,1141,8.328467,1141,8.328467
3,babel-zakat,unknown,1927,8,0,36,0,293,3412,11.645051,2474,8.443686,2474,8.443686
4,blok-balaganchik,unknown,1906,3,0,16,0,38,1091,28.710526,864,22.736842,864,22.736842


Now, the CSV contains the following data:

* _genre_: tragedy / comedy / unknown

* _name_: `play_id` to address the API

* _year_: a single year for each play, chosen from the years it was written, printed and premiered,

* _numOfSegments_: number of the smallest segments (scenes) in the play,

* _numOfActs_,

* _numOfSpeakers_,

* _wikipediaLinkCount_: how many Wikipedia articles have a link to this Wikidata item

* _directionsCount_, _directionsTokens_, _directionsTokensAverage_, _directionsLemmas_, _directionsLemmasAverage_, _directionsPOS_, _directionsPOSAverage_ — from `additional_features`

In [20]:
# Destination of the enriched metadata table inside the datasets folder.
df_path = os.path.join(
    datasets_folder,
    "metadata_with_directions.csv",
)

In [21]:
# Write the enriched table. Note that this file is semicolon-separated,
# unlike the comma-separated metadata.csv it was built from.
df.to_csv(
    df_path,
    encoding="utf-8",
    sep=";",
)