# Creating datasets for each direction type

In [1]:
import numpy as np
import os
import pandas as pd

from collections import Counter
from sklearn.model_selection import train_test_split

## 1 &emsp; Parsing input dataset

First, we have to read the dataset and convert the strings in the NER columns (`ner_lemmas` and `ner_pos`) into lists of strings to work with later.

In [2]:
# Loading the annotated directions; the file is semicolon-separated.
df = pd.read_csv(
    "./data/csv/directions_morphology.csv",
    sep=";",
    encoding="utf-8",
)
df.head()

Unnamed: 0,text,setting,entrance,exit,business,delivery,modifier,location,unknown,play,year,lemmas,clean_lemmas,pos,ner_lemmas,ner_pos
0,"Игроки, князь Звездич, Казарин и Шприх. За сто...",1,0,0,1,0,0,0,0,lermontov-maskarad,1842,"['игрок', 'князь', 'звездич', 'казарин', 'и', ...","['игрок', 'князь', 'звездич', 'казарин', 'шпри...","['S', 'S', 'S', 'S', 'S', 'S', 'V', 'S', 'V', ...","['игрок', 'князь', 'Имя', 'Имя', 'шприх', 'сто...","['S', 'S', 'PERSN', 'PERSN', 'S', 'S', 'V', 'S..."
1,(тихо первому),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,"['тихо', 'первый']","['тихо', 'первый']","['ADV', 'ANUM']","['тихо', 'первый']","['ADV', 'ANUM']"
2,(насмешливо),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,['насмешливо'],['насмешливо'],['ADV'],['насмешливо'],['ADV']
3,"(сквозь зубы, уходя)",0,0,1,0,1,0,0,0,lermontov-maskarad,1842,"['сквозь', 'зуб', 'уходить']","['сквозь', 'зуб', 'уходить']","['PR', 'S', 'V']","['сквозь', 'зуб', 'уходить']","['PR', 'S', 'V']"
4,"Князь, выпив стакан лимонаду, садится к сторон...",0,0,0,1,0,0,0,0,lermontov-maskarad,1842,"['князь', 'выпивать', 'стакан', 'лимонад', 'са...","['князь', 'выпивать', 'стакан', 'лимонад', 'са...","['S', 'V', 'S', 'S', 'V', 'S', 'V']","['князь', 'выпивать', 'стакан', 'лимонад', 'са...","['S', 'V', 'S', 'S', 'V', 'S', 'V']"


In [3]:
def normalize_ner(ner_raw):
    """Turns the string form of a NER column value into a list of values.

    The string is cut at every ", " and each piece is then stripped of
    the leftover list punctuation (brackets, quotes, commas, spaces),
    so that it does not make extra noise with later preprocessing.

    :arg ner_raw — (str) list of NER values to parse, all in a single
    string

    :returns ner_cleaned — (list of str) list of NER values as separate
    objects
    """
    leftovers = ", '']["
    pieces = ner_raw.split(", ")
    return [piece.strip(leftovers) for piece in pieces]

In [4]:
# Both NER columns are read as stringified lists; parsing each into a real list.
for ner_column in ("ner_lemmas", "ner_pos"):
    df[ner_column] = df[ner_column].apply(normalize_ner)
df.head()

Unnamed: 0,text,setting,entrance,exit,business,delivery,modifier,location,unknown,play,year,lemmas,clean_lemmas,pos,ner_lemmas,ner_pos
0,"Игроки, князь Звездич, Казарин и Шприх. За сто...",1,0,0,1,0,0,0,0,lermontov-maskarad,1842,"['игрок', 'князь', 'звездич', 'казарин', 'и', ...","['игрок', 'князь', 'звездич', 'казарин', 'шпри...","['S', 'S', 'S', 'S', 'S', 'S', 'V', 'S', 'V', ...","[игрок, князь, Имя, Имя, шприх, стол, метать, ...","[S, S, PERSN, PERSN, S, S, V, S, V, ADV, V]"
1,(тихо первому),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,"['тихо', 'первый']","['тихо', 'первый']","['ADV', 'ANUM']","[тихо, первый]","[ADV, ANUM]"
2,(насмешливо),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,['насмешливо'],['насмешливо'],['ADV'],[насмешливо],[ADV]
3,"(сквозь зубы, уходя)",0,0,1,0,1,0,0,0,lermontov-maskarad,1842,"['сквозь', 'зуб', 'уходить']","['сквозь', 'зуб', 'уходить']","['PR', 'S', 'V']","[сквозь, зуб, уходить]","[PR, S, V]"
4,"Князь, выпив стакан лимонаду, садится к сторон...",0,0,0,1,0,0,0,0,lermontov-maskarad,1842,"['князь', 'выпивать', 'стакан', 'лимонад', 'са...","['князь', 'выпивать', 'стакан', 'лимонад', 'са...","['S', 'V', 'S', 'S', 'V', 'S', 'V']","[князь, выпивать, стакан, лимонад, садиться, с...","[S, V, S, S, V, S, V]"


We also separate feature NER columns from direction types, as the resulting datasets will only contain NER values + one direction type tag (1/0). There will be as many datasets as there are direction types.

In [5]:
# columns holding the NER-processed lemmas and their POS tags
feature_columns = [
    "ner_lemmas",
    "ner_pos",
]
# one 1/0 column per direction type
direction_types = [
    "setting",
    "entrance",
    "exit",
    "business",
    "delivery",
    "modifier",
    "location",
    "unknown",
]

## 2 &emsp; Converting morphology tags

Part-of-speech tags are taken from the Mystem tagset, which the word2vec model does not use, so we should convert them into Universal POS tags.

Information on tagsets:

- *Mystem:* [description (rus)](https://tech.yandex.ru/mystem/doc/grammemes-values-docpage/)

- *Universal Tags:* [documentation (eng)](http://universaldependencies.org/u/pos/all.html)

In [6]:
# only for the differing tags; any tag missing from this map is passed
# through unchanged by create_wv_items
# NOTE(review): APRO has no entry and SPRO is mapped to NOUN, although the
# Universal tagset has DET and PRON — confirm against the tagset of the
# word2vec model in use.
pos_map = {
    "A": "ADJ",
    "ANUM": "ADJ",
    "ADVPRO": "ADV",
    "CONJ": "SCONJ",
    "PR": "ADP",
    "PERSN": "NOUN",
    "S": "NOUN",
    "SPRO": "NOUN",
    "V": "VERB",
    "": "NONE"
}

def create_wv_items(dir_lemmas, dir_pos):
    """Creates lemma+POS pairs so that they can be passed into
    word2vec model without further changes.

    Every tag found in `pos_map` is replaced with its mapped value;
    any other tag is used as it is.

    :arg dir_lemmas — (list of str) lemmas of a single direction
    :arg dir_pos — (list of str) parts of speech for the corresponding
    lemmas; i.e. dir_pos[1] is the part of speech for dir_lemmas[1]

    :returns final_items — (list of str) formatted "lemma_POS" tokens
    ready for use in word2vec model of choice

    :raises IndexError — if dir_pos is shorter than dir_lemmas
    """
    final_items = []
    # Iterating over the argument itself: the function must not depend on
    # whatever a module-level name `lemmas` happens to hold at call time.
    for idx, lemma in enumerate(dir_lemmas):
        tag = dir_pos[idx]
        final_items.append("{}_{}".format(lemma, pos_map.get(tag, tag)))
    return final_items

In [7]:
# pairing every direction's lemmas with its POS tags
wv_lemmas_pos = []
for lemmas, pos_tags in zip(df["ner_lemmas"].values, df["ner_pos"].values):
    wv_lemmas_pos.append(create_wv_items(lemmas, pos_tags))

df["wv_items"] = wv_lemmas_pos

Now, the `wv_items` column consists of items that are easy to use with the w2v model.

In [8]:
# previewing the rows together with the new `wv_items` column
df.head()

Unnamed: 0,text,setting,entrance,exit,business,delivery,modifier,location,unknown,play,year,lemmas,clean_lemmas,pos,ner_lemmas,ner_pos,wv_items
0,"Игроки, князь Звездич, Казарин и Шприх. За сто...",1,0,0,1,0,0,0,0,lermontov-maskarad,1842,"['игрок', 'князь', 'звездич', 'казарин', 'и', ...","['игрок', 'князь', 'звездич', 'казарин', 'шпри...","['S', 'S', 'S', 'S', 'S', 'S', 'V', 'S', 'V', ...","[игрок, князь, Имя, Имя, шприх, стол, метать, ...","[S, S, PERSN, PERSN, S, S, V, S, V, ADV, V]","[игрок_NOUN, князь_NOUN, Имя_NOUN, Имя_NOUN, ш..."
1,(тихо первому),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,"['тихо', 'первый']","['тихо', 'первый']","['ADV', 'ANUM']","[тихо, первый]","[ADV, ANUM]","[тихо_ADV, первый_ADJ]"
2,(насмешливо),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,['насмешливо'],['насмешливо'],['ADV'],[насмешливо],[ADV],[насмешливо_ADV]
3,"(сквозь зубы, уходя)",0,0,1,0,1,0,0,0,lermontov-maskarad,1842,"['сквозь', 'зуб', 'уходить']","['сквозь', 'зуб', 'уходить']","['PR', 'S', 'V']","[сквозь, зуб, уходить]","[PR, S, V]","[сквозь_ADP, зуб_NOUN, уходить_VERB]"
4,"Князь, выпив стакан лимонаду, садится к сторон...",0,0,0,1,0,0,0,0,lermontov-maskarad,1842,"['князь', 'выпивать', 'стакан', 'лимонад', 'са...","['князь', 'выпивать', 'стакан', 'лимонад', 'са...","['S', 'V', 'S', 'S', 'V', 'S', 'V']","[князь, выпивать, стакан, лимонад, садиться, с...","[S, V, S, S, V, S, V]","[князь_NOUN, выпивать_VERB, стакан_NOUN, лимон..."


## 3 &emsp; Splitting morphology tags into separate columns

So far, all the POS tags are stored in `ner_pos` column as lists of str (which are tags for corresponding lemmas). To make life easier at the machine learning stage, let's create columns for parts of speech present in `wv_items`, then count the amount of every part of speech in a single direction and save that data in its column.

In [15]:
# collecting every distinct POS tag that occurs in the `ner_pos` column
all_tags_mystem = {tag for tag_line in df["ner_pos"].values for tag in tag_line}

In [None]:
# converting to UD: tags listed in `pos_map` are replaced with their mapped
# value, all other tags are kept as they are
all_tags = {pos_map.get(tag, tag) for tag in all_tags_mystem}

In [None]:
# adding one zero-filled column per Mystem tag
n_rows = len(df["ner_pos"].values)
for mystem_tag in all_tags_mystem:
    df[mystem_tag] = [0] * n_rows

In [None]:
# Zeroed counters, one per Mystem tag.
# NOTE: nothing in this cell may rebind `direction_types` — that name holds
# the list of direction types which the export loop at the end of the
# notebook iterates over.
mystem_amounts = {mystem_tag: 0 for mystem_tag in all_tags_mystem}

In [40]:
# Trying the counting out on the first direction only: every tag of that
# direction increments its own counter.
mystem_amounts = {mystem_tag: 0 for mystem_tag in all_tags_mystem}
for item in df["ner_pos"].values[0]:
    mystem_amounts[item] += 1

In [41]:
# showing the tag counters filled in the previous cell
mystem_amounts

{'': 0,
 'PERSN': 2,
 'ADV': 1,
 'PART': 0,
 'PR': 0,
 'A': 0,
 'INTJ': 0,
 'ADVPRO': 0,
 'S': 5,
 'APRO': 0,
 'V': 3,
 'SPRO': 0,
 'ANUM': 0,
 'NUM': 0}

In [None]:
# inserting values with collections.Counter() for collecting amounts:
# every direction's tags are counted once, then each tag column is filled
# from those counts (a Counter returns 0 for a tag it has not seen, so no
# membership check is needed)
counts_per_direction = [Counter(pos_tags) for pos_tags in df["ner_pos"].values]
for tag in all_tags_mystem:
    df[tag] = [counts[tag] for counts in counts_per_direction]

In [None]:
# sanity check: the per-tag count columns should now be filled in
df.head()

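Now, renaming columns once again for consistency with the Universal POS tags. **TODO:** this step is still missing — the count columns created above are named after Mystem tags.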

## 4 &emsp; Saving datasets for each type of directions

For each type, we will be exporting data from the following columns:

- `wv_items` to get w2v vectors,
- POS columns from `all_tags` to get POS amounts.

Data is split into train, validation and test sets. The train + validation part takes 2/3 of the data (~4350 directions), of which 20% (~870) becomes the validation set. The remaining ~2230 directions are used as the test set.

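The split is performed separately for each direction type; since the features and the random seed are the same every time, the rows end up in the same subsets and only the labels differ between types.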

In [None]:
# The export folder is created up front so that a fresh checkout does not
# fail on the first write.
out_dir = "./data/ml"
os.makedirs(out_dir, exist_ok=True)


def _write_labels(labels, path):
    """Writes the given 1/0 labels to `path`, one value per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(str(label) for label in labels))


# The count columns of `df` are named after Mystem tags, whereas `all_tags`
# holds their Universal POS counterparts, so the Mystem counts are folded
# into UD columns here. Several Mystem tags can map onto one UD tag, hence
# the summation. Sorting keeps the column order stable between runs.
ud_counts = pd.DataFrame(0, index=df.index, columns=sorted(all_tags))
for mystem_tag in all_tags_mystem:
    ud_counts[pos_map.get(mystem_tag, mystem_tag)] += df[mystem_tag]

# selecting columns: the features are the same for every direction type,
# only the target differs
X = pd.concat([df[["wv_items"]], ud_counts], axis=1)

for dirtype in direction_types:
    y = df[dirtype]

    # train+val, test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=1968)
    # train, val
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=1968)

    # save to separate files; the "train" files get the reduced training
    # part (X_tr / y_tr), because X_train / y_train still contain the
    # validation rows
    splits = {
        "train": (X_tr, y_tr),
        "val": (X_val, y_val),
        "test": (X_test, y_test),
    }
    for split_name, (features, labels) in splits.items():
        features.to_csv("{}/{}_X_{}.csv".format(out_dir, dirtype, split_name),
                        sep=";", encoding="utf-8", index=False)
        _write_labels(labels,
                      "{}/{}_y_{}.csv".format(out_dir, dirtype, split_name))