# Структурные особенности ремарок // Structural peculiarities of stage directions
По мотивам [Сперантов 1998](https://rvb.ru/philologica/05/05sperantov.htm): попытаемся интерпретировать пьесы из RusDraCor с помощью метрик "классичности", предложенных в статье. 

***

In English: see below in each section.

This notebook interprets the article written by Sperantov in 1998 (see it [here](https://rvb.ru/philologica/05/05sperantov.htm)) and applies the same metrics to RusDraCor.

## Коэффициент плотности // Density
Отвечает за частоту появления ремарок.

$$P = \frac{n}{N}\cdot100,\ \ \ \ \ \ \ (1)$$
где $n$ — число ремарок, $N$ — число стихотворных строк.

_Адаптация:_ число стихотворных строк => число реплик (`<sp>`)

***

Shows how frequent the directions are.

$(1)$: $n$ — overall stage directions count, $N$ — overall count of verse lines.

_Adaptation:_ the number of verse lines is replaced by the number of speeches (`<sp>` elements).

In [None]:
import os
import re

from collections import Counter
from lxml import etree
from pymystem3 import Mystem
from statistics import mean

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Namespace prefix map passed to the TEI XPath lookups in this notebook.
tei_ns = dict(tei="http://www.tei-c.org/ns/1.0")
# Input folders, both one level above the working directory: the plays are
# read from corpus_path as ".xml" files, the extracted stage directions from
# directions_path as ".txt" files.
corpus_path = os.sep.join(["..", "RusDraCor"])
directions_path = os.sep.join(["..", "directions"])

In [None]:
# Column name -> list of per-play values; turned into a DataFrame further down.
data_total = {}

In [None]:
# A play's identifier is its corpus file name without the ".xml" extension.
# Only ".xml" entries are kept, so stray files or folders (".DS_Store",
# ".ipynb_checkpoints", ...) are not turned into bogus identifiers by a blind
# four-character cut. Sorting makes the row order reproducible, because
# os.listdir() returns entries in arbitrary order.
plays = sorted(
    os.path.splitext(file)[0]
    for file in os.listdir(corpus_path)
    if file.endswith(".xml")
)
data_total = {"file": plays}

Извлечём дату, чтобы можно было посмотреть на распределение типов по годам.

In [None]:
def single_date(date_print, date_premiere, date_written):
    """Collapse the dates known for a play into one reference year.

    The earlier of the print and premiere years is the first candidate. The
    year of writing replaces it when the play became public more than ten
    years after it was written, or when neither a print nor a premiere year
    is known.

    Args:
        date_print: Year of publication, or ``None`` when unknown.
        date_premiere: Year of the premiere, or ``None`` when unknown.
        date_written: Year of writing, or ``None`` when unknown.

    Returns:
        The chosen year, or ``None`` when all three inputs are missing.
    """
    if date_print and date_premiere:
        date_definite = min(date_print, date_premiere)
    elif date_premiere:
        date_definite = date_premiere
    else:
        date_definite = date_print

    if date_written:
        if not date_definite:
            # Only the year of writing is known.
            date_definite = date_written
        elif date_definite - date_written > 10:
            # Published/staged long after it was written: date by writing.
            date_definite = date_written
    return date_definite

def _bibl_year(root, date_type):
    """Return the ``when`` year of one ``<date type=...>`` element as int, or ``None``."""
    date_node = root.find(
        ".//tei:sourceDesc/tei:bibl/tei:bibl/tei:date[@type=\"{}\"]".format(date_type),
        tei_ns,
    )
    try:
        return int(date_node.attrib["when"])
    except (AttributeError, KeyError, ValueError):
        # No such element (find() returned None), no "when" attribute, or a
        # value that is not a plain integer.
        return None


def get_play_definite_year(play_path):
    """Parse one TEI play file and return its single reference year.

    Reads the "written", "print" and "premiere" dates from the nested
    ``<bibl>`` of ``<sourceDesc>`` and combines them with ``single_date``.

    Args:
        play_path: Path of the TEI XML file.

    Returns:
        The year picked by ``single_date``, or ``None`` when the file carries
        none of the three dates.
    """
    root = etree.parse(play_path)
    year_written = _bibl_year(root, "written")
    year_published = _bibl_year(root, "print")
    year_premiere = _bibl_year(root, "premiere")
    return single_date(year_published, year_premiere, year_written)

In [None]:
# One reference year per play, in the same order as `plays`.
data_year = [
    get_play_definite_year(corpus_path + os.sep + play + ".xml")
    for play in plays
]

data_total["year"] = data_year

In [None]:
# TODO: regexp => xpath
# xpath_sp = "//sp"

# Opening <sp> tags only. The tag name has to end right after "sp" (\b);
# without that, <speaker> (and <spGrp>) opening tags are counted as speeches
# too and inflate N.
reg_sp = re.compile(r"<sp\b[^>]*>")

# Number of speeches (<sp>) per play: the N of the density formula.
data_sp = []
for play in plays:
    play_file = corpus_path + os.sep + play + ".xml"
    with open(play_file, "r", encoding="utf-8") as f:
        play_text = f.read()
    sp_count = len(reg_sp.findall(play_text))
    print("Play: {}, <sp>: {}".format(play, sp_count))
    data_sp.append(sp_count)

data_total["sp"] = data_sp

In [None]:
# Number of stage directions per play: the n of the density formula.
# Every line of the play's directions file is counted as one direction.
data_stage = []
for play in plays:
    with open(directions_path + os.sep + play + ".txt", "r", encoding="utf-8") as f:
        total_dirs = len(f.readlines())
    data_stage.append(total_dirs)
    print("Play: {}, <stage>: {}".format(play, total_dirs))
data_total["stage"] = data_stage

In [None]:
# Density P = n / N * 100 per play; plays without any <sp> get 0.
data_p = []
for play, sp, stage in zip(plays, data_sp, data_stage):
    if sp == 0:
        p = 0
        print("Unable to count p for play: {}".format(play))
    else:
        p = stage/sp*100
        print("Play: {}, p={:.4f}".format(play, p))
    data_p.append(p)
data_total["p"] = data_p

In [None]:
# Assemble the per-play columns collected so far into one table.
df_total = pd.DataFrame(data_total)
df_total.head()

In [None]:
def normalize_metric(metric_list):
    """Min-max scale a sequence of metric values to the range [0, 1].

    NaN entries are ignored when the minimum and maximum are determined and
    stay NaN in the result. When all non-NaN values are equal there is no
    spread to scale by, so those values map to 0.0 instead of dividing by
    zero.

    Args:
        metric_list: List or array of numbers; may contain NaN.

    Returns:
        A list of floats of the same length as the input.
    """
    values = np.asarray(metric_list, dtype=float)
    missing = np.isnan(values)
    if values.size == 0 or missing.all():
        # Nothing to scale (also avoids the all-NaN warning of nanmin).
        return values.tolist()
    metric_min = np.nanmin(values)
    span = np.nanmax(values) - metric_min
    if span == 0:
        return np.where(missing, np.nan, 0.0).tolist()
    return ((values - metric_min) / span).tolist()

In [None]:
# Density rescaled by normalize_metric() so it can be combined with the other metrics.
df_total["p_norm"] = normalize_metric(df_total["p"].values)
df_total.head()

## Средняя длина ремарки
$$S = \frac{L}{n},$$
где $L$ — суммарное количество словоформ (= слов) во всех ремарках пьесы, $n$ — количество ремарок.

In [None]:
# Single shared Mystem instance; the functions below call its analyze() method.
mystem = Mystem()

In [None]:
def average_direction(directions):
    """Return the mean number of words per stage direction.

    A token counts as a word when its Mystem result carries an "analysis"
    key. ``mean`` raises ``statistics.StatisticsError`` when ``directions``
    is empty.

    Args:
        directions: Iterable of stage-direction strings.

    Returns:
        The arithmetic mean of the per-direction word counts.
    """
    word_counts = [
        sum(1 for token in mystem.analyze(text) if "analysis" in token)
        for text in directions
    ]
    return mean(word_counts)

In [None]:
# Average direction length S per play; NaN marks plays it could not be computed for.
data_s = []
for play in plays:
    play_path = directions_path + os.sep + play + ".txt"
    with open(play_path, "r", encoding="utf-8") as f:
        directions_play = f.readlines()
    try:
        s = average_direction(directions_play)
    except Exception as err:
        # Best effort: one bad play (e.g. an empty directions file, for which
        # mean() raises) must not abort the whole run. Name the play and the
        # cause so the gap in the data is traceable.
        s = np.nan
        print("Play: {} failed ({})".format(play, err))
    else:
        print("Play: {}, S={:.4f}".format(play, s))
    data_s.append(s)

In [None]:
# Raw and rescaled average direction length.
# NOTE(review): data_s holds NaN for plays that failed above — confirm that
# normalize_metric() copes with NaN input.
df_total["s"] = data_s
df_total["s_norm"] = normalize_metric(data_s)
df_total.head()

## Интенсивность взаимодействия стиха и ремарочной прозы
Как часто ремарки появляются внутри стиха?

_Адаптация:_ как часто ремарки появляются внутри речи персонажа (`<sp>`)?

In [None]:
# Matches "<sp", then the rest of that line up to a ">" sitting at the line
# end, then a following line that contains "<stage" ("." does not cross line
# breaks here, so "<stage" must be on the very next line). Because the pattern
# only asks for the prefix "<sp", a line starting at "<speaker" qualifies too.
# re.MULTILINE only changes "^" and "$", neither of which is used.
reg_interruption = re.compile("<sp.*?>(\n.*?)<stage", flags=re.MULTILINE)

In [None]:
# Share of speeches that a stage direction cuts into: matches of
# reg_interruption divided by the number of <sp> in the play.
data_i = []
for play, sp_count in zip(plays, data_sp):
    play_file = corpus_path + os.sep + play + ".xml"
    with open(play_file, "r", encoding="utf-8") as f:
        play_text = f.read()
    interruptions = len(reg_interruption.findall(play_text))
    if sp_count:
        play_inter = interruptions / sp_count
        print("Play: {}, interruptions: {}, share={:.4f}".format(play, interruptions, play_inter))
    else:
        # Same convention as the density cell: a play without <sp> gets 0
        # rather than raising ZeroDivisionError.
        play_inter = 0
        print("Unable to count share for play: {}".format(play))
    data_i.append(play_inter)

In [None]:
# Sanity check: number of computed values.
len(data_i)

In [None]:
# Raw and rescaled interruption share.
df_total["i"] = data_i
df_total["i_norm"] = normalize_metric(data_i)
df_total.head()

## Коэффициент лексического разнообразия

$$V = \frac{l}{L},$$
где $l$ — число лексем, $L$ — число словоформ.

In [None]:
def lemmas_wforms_dirs(direction):
    """Collect the distinct lemmas and word forms of one stage direction.

    A token contributes only when Mystem returned at least one analysis for
    it; its first analysis supplies the lemma ("lex") and the token "text"
    supplies the word form. Tokens without an "analysis" key or with an empty
    analysis list are skipped.

    Args:
        direction: Text of a single stage direction.

    Returns:
        A ``(lemmas, wforms)`` tuple of two sets of strings.
    """
    lemmas = set()
    wforms = set()
    for token_analysis in mystem.analyze(direction):
        try:
            lemm = token_analysis["analysis"][0]["lex"]
            wform = token_analysis["text"]
        except (KeyError, IndexError):
            # No "analysis" key, an empty analysis list, or a missing field:
            # skip the token, and never add a lemma without its word form.
            continue
        lemmas.add(lemm)
        wforms.add(wform)
    return lemmas, wforms

In [None]:
# Lexical variety V = l / L per play: distinct lemmas over word forms, as in
# the formula above.
# NOTE(review): L is taken here as the number of *distinct* word forms, since
# the forms are collected in a set — confirm this is the intended reading.
data_lexvar = []
for play in plays:
    lemmas = set()
    wordforms = set()
    play_dirs_path = directions_path + os.sep + play + ".txt"
    with open(play_dirs_path, "r", encoding="utf-8") as f:
        play_dirs = [d.strip() for d in f]
    for p_dir in play_dirs:
        dir_lemmas, dir_wordforms = lemmas_wforms_dirs(p_dir)
        lemmas.update(dir_lemmas)
        wordforms.update(dir_wordforms)
    if wordforms:
        lexvar = len(lemmas) / len(wordforms)
        print("Play: {}, lexvar={:.4f}".format(play, lexvar))
    else:
        # Same convention as the density cell: no analysable words -> 0
        # rather than raising ZeroDivisionError.
        lexvar = 0
        print("Unable to count lexvar for play: {}".format(play))
    data_lexvar.append(lexvar)

In [None]:
# Sanity check: number of computed values.
len(data_lexvar)

In [None]:
# Raw and rescaled lexical variety.
df_total["l"] = data_lexvar
df_total["l_norm"] = normalize_metric(data_lexvar)
df_total.head()

In [None]:
# Inspect the smallest normalized density value.
min(df_total["p_norm"].values)

In [None]:
# TODO: how are we going to measure emotionality? sentiment analysis?? vectors??
# TODO: why multiply rather than divide?
def average_classic(p, s, i):
    """Return the arithmetic mean of the three metric values ``p``, ``s`` and ``i``."""
    total = p + s + i
    return total / 3

In [None]:
# Combined index per play: mean of the three normalized metrics.
data_classic = []
metric_rows = zip(
    plays,
    df_total["p_norm"].values,
    df_total["s_norm"].values,
    df_total["i_norm"].values,
)
for play, play_p, play_s, play_i in metric_rows:
    print("\nPlay: {}".format(play))
    classic = average_classic(play_p, play_s, play_i)
    print("p={:.5f}, s={:.5f}, i={:.5f}".format(play_p, play_s, play_i))
    print("classic={:.4f}".format(classic))
    data_classic.append(classic)

In [None]:
# Store the combined index next to the individual metrics.
df_total["classic_index"] = data_classic
df_total.head()

## Types (according to Sperantov)

|**Процент "классичности"**|**Тип пьесы**|
|:----------------------:|:----------:|
|2–10%|достаточно строго следует канону|
|10–20%|канон несколько расшатан|
|20–45%|значительные отступления от канона|
|50–75%|решительно ниспровергают правила классической драматургии|

In [None]:
def get_type(classical):
    """Map a combined index value to a play-type label.

    Bands: (0.02, 0.1) "classic", [0.1, 0.2) "minor_retreat",
    [0.2, 0.45) "significant_retreat", [0.5, +inf) "non_classic".
    Values outside every band (<= 0.02, [0.45, 0.5), NaN) yield ``np.nan``.

    Args:
        classical: Combined index as a fraction (0.1 stands for 10%).

    Returns:
        The label string, or ``np.nan`` when no band applies.
    """
    if 0.02 < classical < 0.1:
        return "classic"
    if 0.1 <= classical < 0.2:
        return "minor_retreat"
    if 0.2 <= classical < 0.45:
        return "significant_retreat"
    if classical >= 0.5:
        return "non_classic"
    return np.nan

In [None]:
# Label every play and store the labels as a categorical column.
# The category names must be spelled exactly as get_type() returns them;
# a value that is not among the categories becomes NaN. The categorical is
# built and assigned in one step, because astype() returns a new object
# rather than converting the column in place.
type_categories = ["classic", "minor_retreat", "significant_retreat", "non_classic"]
df_total["type"] = pd.Categorical(
    df_total["classic_index"].apply(get_type),
    categories=type_categories,
)
df_total.head()

### Распределение по типам // Type distribution

Посмотрим на распределение:
***
Let us take a look at the general distribution (i.e. which types are more common than the others).

In [None]:
# Bar chart: number of plays per type.
fig, ax = plt.subplots(figsize=(10, 10))
type_counts = df_total["type"].value_counts()
type_counts.plot.bar(ax=ax)
plt.xticks(rotation=360)

Большинство — классические или почти классические.
***
Looks like Russian drama tends to be classic.

### Распределение по годам // Distribution by year

Теперь посмотрим на распределение по типам в зависимости от года. Я разбила все данные на 10 временных отрезков по 20 лет, чтобы уместить всё на одном графике.

Столбцы окрашены в цвет самого "популярного" (частотного) типа.
***
Now we'll look at the distribution by year. To make the visualization clear, I split the whole corpus into 10 groups, each comprising 20 years.

The bars you'll see on the graph below are colored according to the "popular" (frequent) type.

In [None]:
# Year ranges of the ten periods; each covers start .. stop - 1.
group1 = range(1747, 1767)
group2 = range(1767, 1788)
group3 = range(1788, 1809)
group4 = range(1809, 1830)
group5 = range(1830, 1851)
group6 = range(1851, 1872)
group7 = range(1872, 1893)
group8 = range(1893, 1914)
group9 = range(1914, 1935)
group10 = range(1935, 1956)
# Period labels in chronological order, aligned with group1..group10.
# NOTE(review): "1851–1872", "1872–1893" and "1893–1914" name the exclusive
# upper bound of their range (the last covered years are 1871, 1892, 1913).
# The strings are left untouched because other cells filter on them.
order = ["1747–1766", "1767–1787", "1788–1808", "1809–1829", "1830–1850",
         "1851–1872", "1872–1893", "1893–1914", "1914–1934", "1935–1955"]

_YEAR_GROUPS = list(zip(
    (group1, group2, group3, group4, group5,
     group6, group7, group8, group9, group10),
    order,
))


def group_year(year):
    """Return the label of the period that ``year`` falls into.

    Args:
        year: Year as int, float or numeric string; may be ``None`` or NaN
            when the play has no known date.

    Returns:
        One of the labels in ``order``, or ``None`` when the year is missing,
        not a number, or outside 1747-1955.
    """
    try:
        y = int(year)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, inf or a non-numeric value: no period can be assigned.
        return None
    for years, label in _YEAR_GROUPS:
        if y in years:
            return label
    return None

In [None]:
# Assign every play to its period and store the labels as a categorical
# column whose categories follow the chronological `order`. The categorical
# is built and assigned in one step, because astype() returns a new object
# rather than converting the column in place.
df_total["group"] = pd.Categorical(
    df_total["year"].apply(group_year),
    categories=order,
)

df_total.head()

In [None]:
# cmap = {
#     "classic": rgb(133, 164, 172),
#     "small_retreat": rgb(116, 138, 195),
#     "significant_retreat": rgb(116, 118, 195),
#     "non_classic": rgb(131, 116, 195)
# }

In [None]:
# TODO: label each bar with its count
# TODO: colour the bars by the most frequent type (see cmap above)
# TODISCUSS: should the bars be single-coloured instead? Different colours may be misread
plt.figure(figsize=(16,5))
# reindex() puts the bars in chronological order and, unlike .loc, does not
# raise for a period that has no plays: such a period gets a zero-height bar.
df_total["group"].value_counts().reindex(order, fill_value=0).plot.bar()
plt.title("Plays: amount and most common, per period", fontsize=16)
plt.xlabel("Group")
plt.ylabel("Number of plays")
plt.xticks(rotation=360)
plt.show()

Ниже представлены подробные распределения по каждому временному отрезку.
***
More specific data on each group is presented below.

In [None]:
# Plays of the first period that were labelled "classic". Both conditions are
# combined into one mask; chaining two [] filters would apply a full-length
# mask to an already filtered frame.
df_total[(df_total["group"] == "1747–1766") & (df_total["type"] == "classic")]

In [None]:
# Number of plays per period (rows) and type (columns).
pd.crosstab(df_total["group"], df_total["type"])

Все пьесы, написанные между 1747 и 1766 годами, классические! На графике однозначно видно, что пьес всего 5, и в таблице выше тоже 5 пьес.

In [None]:
# Type counts for the plays of the period labelled "1767–1787".
df_total[df_total["group"] == "1767–1787"]["type"].value_counts()

In [None]:
# Type counts for the plays of the period labelled "1788–1808". The label
# must match the string returned by group_year(), otherwise nothing is selected.
df_total[df_total["group"] == "1788–1808"]["type"].value_counts()

In [None]:
# Type counts for the plays of the period labelled "1809–1829".
df_total[df_total["group"] == "1809–1829"]["type"].value_counts()

In [None]:
# Type counts for the plays of the period labelled "1830–1850".
df_total[df_total["group"] == "1830–1850"]["type"].value_counts()

In [None]:
# Type counts for the plays of the period labelled "1851–1872".
df_total[df_total["group"] == "1851–1872"]["type"].value_counts()

In [None]:
# Type counts for the plays of the period labelled "1872–1893".
df_total[df_total["group"] == "1872–1893"]["type"].value_counts()

In [None]:
# Type counts for the plays of the period labelled "1893–1914".
df_total[df_total["group"] == "1893–1914"]["type"].value_counts()

In [None]:
# Type counts for the plays of the period labelled "1914–1934". The label
# must match the string returned by group_year(), otherwise nothing is selected.
df_total[df_total["group"] == "1914–1934"]["type"].value_counts()

In [None]:
# Type counts for the plays of the period labelled "1935–1955".
df_total[df_total["group"] == "1935–1955"]["type"].value_counts()

In [None]:
# The individual type labels (not the counts) of the plays in the last period.
df_total[df_total["group"] == "1935–1955"]["type"]

In [None]:
# Sanity check: the index and the two columns should all have the same length.
df_total.index.shape, df_total["group"].shape, df_total["file"].shape

|**Group**|*classic*|*minor_retreat*|*significant_retreat*|*non_classic*|
|:-------:|:-------:|:-------------:|:-------------------:|:-----------:|
|1767–1787|2        |1              |                     |             |
|1788–1808|5        |3              |                     |             |
|1809–1829|4        |1              |                     |             |
|1830–1850|11       |8              |1                    |             |
|1851–1872|3        |12             |5                    |2            |
|1872–1893|2        |11             |1                    |1            |
|1893–1914|         |6              |6                    |             |
|1914–1934|no data  |no data        |no data              |no data      |
|1935–1955|         |2              |1                    |             |

In [None]:
# TODO: pivot table
# pd.pivot(df_total.index, columns="file", values="group")

# Save the data

In [None]:
# df_total.to_csv("./sperantov_WIP.csv", sep=";", encoding="utf-8")
# df_total = pd.read_csv("./sperantov_WIP.csv", sep=";", encoding="utf-8")