# Annotated directions analysis

In [1]:
import os
import tqdm

## 1 &emsp; Preparation

Gathering the paths to the plays' `.csv` files and the play IDs (each ID is the author's name followed by the play title, i.e. the file name without its extension):

In [2]:
# Directory that holds one annotated, ";"-separated .csv file per play.
path_to_annot = "./data/stage/annotated/"

# Keep only .csv files: the IDs below are derived by stripping the extension,
# so stray entries (.DS_Store, notebook checkpoints, ...) must not slip in.
# Sorting makes the processing order reproducible, since os.listdir() returns
# entries in arbitrary order.
plays_csv = sorted(
    file_name
    for file_name in os.listdir(path_to_annot)
    if file_name.lower().endswith(".csv")
)
play_paths = [os.path.join(path_to_annot, file_name) for file_name in plays_csv]
# play ID = file name without its extension
play_ids = [os.path.splitext(file_name)[0] for file_name in plays_csv]

### 1.1 &emsp; Data not present in .csv

Additional functions: adding year of the play and `play_id`:

In [3]:
import requests
import json

Extracting **year of the play**, fetched from corpus metadata:

In [None]:
# Download the metadata of every play in the corpus in a single request.
request_link = "https://dracor.org/api/corpora/rus/metadata"
# A timeout keeps an unresponsive server from hanging the notebook forever.
response = requests.get(request_link, timeout=30)
# Fail loudly right here: silently skipping the assignment on a bad status
# would leave `play_metadata` undefined and surface later as a NameError.
response.raise_for_status()
play_metadata = response.json()

In [None]:
def append_play_year(play_id, play_metadata):
    """Using metadata from DraCor API, extracts yearNormalized
    for the play.
    
    Calculation of yearNormalized:
    > if date_print and date_premiere:
    >     yearNormalized = min(date_print, date_premiere)
    > elif date_premiere:
    >     yearNormalized = date_premiere
    > else:
    >     yearNormalized = date_print
    > if date_written and yearNormalized:
    >     if yearNormalized - date_written > 10:
    >         yearNormalized = date_written
    > elif date_written and not yearNormalized:
    >     yearNormalized = date_written
    
    The first metadata entry whose "playName" equals play_id wins.
    If no entry matches (including an empty play_metadata), a warning
    is printed and 0 is returned.
    
    :args play_id (str) — author's name and title of the play
    play_metadata (json) — metadata for all the plays currently
    present in the corpus (iterable of dicts with "playName" and
    "yearNormalized" keys)
    
    :returns year (int) — normalized year of the play, 0 if the
    play is not present in the metadata
    """
    
    for play in play_metadata:
        if play["playName"] == play_id:
            # Return immediately: continuing the loop would let later,
            # non-matching entries overwrite the year that was found.
            return int(play["yearNormalized"])
    
    # Reached only when no entry matched. The HTTP status of the metadata
    # request is irrelevant here, so the message no longer reads it.
    print("Couldn't retrieve year.\n\t- play: {}\n=> Returning 0".format(play_id))
    return 0

## 2 &emsp; Assembling everything together

In [None]:
import pandas as pd

In [None]:
def csv_to_full_df(play_id):
    """Converts .csv files for /data/stage/annotated into a 
    pd.DataFrame, then adds 
        (1) directions' origin (column "play"), 
        (2) normalized year of the play (column "year",
            see append_play_year()).
    
    The file is read with ";" as separator, missing cells are replaced
    by 0, and every column except the first one is cast to "category".
    The year is looked up in the notebook-level `play_metadata`, so the
    metadata cell has to be run before this function is called.
            
    :args play_id (str) — author's name and title of the play
    
    :returns play_df (pd.DataFrame) — converted data of a single play
    """
    
    play_csv_path = "./data/stage/annotated/" + play_id + ".csv"
    play_df = pd.read_csv(play_csv_path, sep=";", encoding="utf-8").fillna(0)
    
    # convert direction marks to categories
    for dir_type in play_df.columns[1:]:
        play_df[dir_type] = play_df[dir_type].astype("category")
    
    # add play_id so that we know directions' origin
    # (a scalar is broadcast to every row by pandas)
    play_df["play"] = play_id
    
    # add yearNormalized; append_play_year() needs the metadata as its
    # second argument — calling it with play_id alone raises TypeError
    play_df["year"] = append_play_year(play_id, play_metadata)
    
    return play_df

In [None]:
# Convert every play, then merge all per-play frames in one go.
play_frames = []
# `tqdm` is imported as a module above, so the progress bar is `tqdm.tqdm`.
for play_id in tqdm.tqdm(play_ids):
    try:
        play_frames.append(csv_to_full_df(play_id))
    except Exception as error:
        # Best effort: one broken play must not abort the whole run, but the
        # reason is reported so that failures are not silently hidden.
        print("{}: {!r}".format(play_id, error))

# A single concat over a list avoids re-copying the growing frame on every
# iteration; pd.concat() raises on an empty list, hence the guard.
df_total = pd.concat(play_frames, ignore_index=True) if play_frames else pd.DataFrame()

Checking that everything is OK:

In [None]:
# Show the first rows of the merged frame as a quick visual sanity check.
df_total.head()

In [None]:
# Number of rows in the merged frame.
len(df_total)

In [None]:
# Values of the "play" column, one entry per row of the merged frame.
df_total["play"].values

## 3 &emsp; Saving

This data is raw — we haven't done any analysis yet. Nevertheless, it might be useful to save it for later research.

In [None]:
# to_csv() does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("./data/csv", exist_ok=True)
# Same ";" separator and UTF-8 encoding as the input files; the row index
# is not written.
df_total.to_csv("./data/csv/annot_merged_raw.csv", sep=";", encoding="utf-8", index=False)