# Directions and their features

## Imports and globals
Run this before everything else!

In [1]:
import os
from pymystem3 import Mystem
from statistics import mean
from lxml import etree
import pandas as pd

In [2]:
mystem = Mystem()

In [3]:
directions_path = ".." + os.sep + "data"
corpus_path = ".." + os.sep + "RusDraCor"

## Preparations

In [4]:
# this is required because I'm using a Mac, which sometimes creates system folders like .DS_Store;
# in any other case -- never mind
directions_files = [item for item in os.listdir(directions_path) if item.endswith(".txt")]
play_files = [item for item in os.listdir(corpus_path) if item.endswith(".xml")]

## Parts-of-speech
We will count the following parts-of-speech:

* nouns,
* adjectives,
* verbs,
* adverbs,
* interjections.

The values will be respresented as a _fraction_, a result of division of the POS to the total amount of words in the direction.

$$ \text{POS count} = \frac{\text{amount of POS in a direction}}{\text{total amount of words}} $$

Tokenization and word count will be performed via ```pymystem3.Mystem()```, a Python wrapper for [Mystem](https://tech.yandex.ru/mystem/).

In [5]:
def count_pos(direction):
    pos_dict = {"S": 0, "A": 0, "V": 0, "ADV": 0, "INTJ": 0, "words": 0}
    analyses = mystem.analyze(direction)
    for analysis in analyses:
        if "analysis" in analysis:
            try:
                pos = analysis["analysis"][0]["gr"].split(",")[0]
                if "=" in pos:
                        pos = pos[:pos.index("=")]
                        if pos in pos_dict.keys():
                            pos_dict[pos] += 1
                        elif pos == "ADVPRO":
                            pos_dict["ADV"] += 1
                        pos_dict["words"] += 1
            except:
                pass
    return pos_dict

In [6]:
def get_pos_amounts_words(directions_file):
    pos_list = {"S": [], "A": [], "V": [], "ADV": [], "INTJ": [], "words": []}
    full_path = str(directions_path) + os.sep + directions_file
    with open(full_path, "r", encoding="utf-8") as directions_f:
        directions = [line.strip("\n") for line in directions_f.readlines() if line != "\n"]
    for st_dir in directions:
        pos_this = count_pos(st_dir)
        for item in pos_this:
            pos_list[item].append(pos_this[item])
    return pos_list

Now let's crawl the files and create a dataset.

In [7]:
plays_info = []
dummy = {"path": "", "S": 0, "A": 0, "V": 0, "ADV": 0, "INTJ": 0, "words": 0}
for directions_file in directions_files:
    stats = get_pos_amounts_words(directions_file)
    for key in stats:
        stats[key] = mean(stats[key])
    stats["path"] = directions_file
    plays_info.append(stats)

In [8]:
df = pd.DataFrame(plays_info)

In [9]:
df.head()

Unnamed: 0,A,ADV,INTJ,S,V,path,words
0,0.151515,0.30303,0.0,0,0.030303,sumarokov-horev.txt,1.787879
1,0.371859,0.296482,0.0,0,0.135678,gorky-egor-bulychov-i-drugie.txt,1.79397
2,0.180556,0.277778,0.020833,0,0.125,turgenev-vecher-v-sorrente.txt,2.006944
3,0.145161,0.209677,0.112903,0,0.096774,gumilyov-gondla.txt,1.645161
4,0.126506,0.108434,0.012048,0,0.060241,chekhov-leshii.txt,1.080321


## Saving the information
Now, let's save the data.

In [10]:
df.to_csv(".." + os.sep + "data" + os.sep + "mean_values.csv", sep=";", encoding="utf-8")