# Adding linguistic features to the dataset

## 1 &emsp; Loading the dataset

We'll be using the dataset created in the [annotated-merge](./annotated-merge.ipynb) notebook.

In [74]:
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from pymystem3 import Mystem
import re
import warnings

# Single shared Mystem analyzer instance; the helper functions below read it
# as a module-level name instead of creating their own.
mystem = Mystem()
# Silence every Python warning raised from this point on.
warnings.simplefilter("ignore")
# Russian stopword list from NLTK, stored as a set so that the membership
# tests in drop_stops are cheap.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# available locally - confirm before running on a fresh machine.
nltk_stops = set(stopwords.words("russian"))

In [55]:
# Read the merged annotation table (semicolon-separated, UTF-8).
df = pd.read_csv("./data/csv/annot_merged_raw.csv", sep=";", encoding="utf-8")

# Names of the direction-type columns; every one of them is cast to int.
tei_types = ["setting", "entrance", "exit", "business", "delivery",
             "modifier", "location", "unknown"]
df = df.astype({type_column: int for type_column in tei_types})
df.head()

Unnamed: 0,text,setting,entrance,exit,business,delivery,modifier,location,unknown,play,year
0,"Игроки, князь Звездич, Казарин и Шприх. За сто...",1,0,0,1,0,0,0,0,lermontov-maskarad,1842
1,(тихо первому),0,0,0,0,1,0,0,0,lermontov-maskarad,1842
2,(насмешливо),0,0,0,0,1,0,0,0,lermontov-maskarad,1842
3,"(сквозь зубы, уходя)",0,0,1,0,1,0,0,0,lermontov-maskarad,1842
4,"Князь, выпив стакан лимонаду, садится к сторон...",0,0,0,1,0,0,0,0,lermontov-maskarad,1842


## 2 &emsp; Data preparation

### 2.1 &emsp; Lemmas and stopwords

Let's convert the texts into lists of lemmas (i.e., normalized words).

In [9]:
def extract_lemmas(direction_raw):
    """Performs lemmatization of a single direction.

    :arg direction_raw - (str) text of one direction, the way it was
    extracted from the play

    :uses mystem - an instance of Mystem, morphological analyzer, part of
    pymystem3 module (a Python wrapper for the original programme)

    :returns direction_lemmas - (list of str) lemmas of the analysed tokens,
    in the order Mystem returned them; tokens that come back without an
    analysis are skipped
    """
    words_analyses = mystem.analyze(direction_raw)
    # Take the lemma ("lex") of the first analysis of every token; the
    # .get() guard skips tokens whose "analysis" entry is missing or empty.
    direction_lemmas = [
        parse["analysis"][0]["lex"]
        for parse in words_analyses
        if parse.get("analysis")
    ]
    return direction_lemmas

In [19]:
def drop_stops(direction_lemmas, stops=nltk_stops):
    """Filters stopwords out of a direction's lemmas.

    :arg direction_lemmas — (list of str) all the lemmas from a
    single direction

    :uses stops — (iterable of str) words that should be dropped
    from the direction. By default, this is the set of stopwords
    taken from NLTK package

    :returns non_stop_lemmas — (list of str) the lemmas that are not
    in stops, in their original order
    """
    return [lemma for lemma in direction_lemmas if lemma not in stops]

### 2.2 &emsp; Morphology
To retrieve morphology information, we'll use Mystem, a rule-based morphological parser.

In [78]:
# First run of uppercase Latin letters in a Mystem grammar string.
reg_pos = re.compile("([A-Z]+)")

def get_pos(direction_lemmas):
    """Maps every lemma of a direction to a part-of-speech tag.
    No stop words are removed at this stage.

    :arg direction_lemmas — (list of str) all the lemmas from a
    single direction

    :uses mystem - an instance of Mystem, morphological analyzer, part of
    pymystem3 module (a Python wrapper for the original programme)

    :returns direction_pos — (list of str) one tag per lemma, in the
    same order as the input
    """
    def first_pos(lemma):
        # Grammar string of the first analysis of the first token returned.
        grammar = mystem.analyze(lemma)[0]["analysis"][0]["gr"]
        return reg_pos.search(grammar).group(1)

    return [first_pos(lemma) for lemma in direction_lemmas]

### 2.3 &emsp; Proper nouns

Each personal name (a token that Mystem tags as a first name, surname or patronymic) will be replaced with `Имя`, so that all names are represented consistently and the vector model doesn't pay much attention to individual names.

In [52]:
# Markers looked up in Mystem's grammar string to spot personal names
# (per Mystem's tag set: first name, surname, patronymic).
ner_tags = ["имя", "фам", "отч"]

def extract_names(direction_lemmas, direction_pos):
    """Performs a Named Entity Recognition (NER) on the direction.
    Doesn't use specially developed NER algorithms, but relies on
    Mystem morphological analyses.

    :arg direction_lemmas — (list of str) all the lemmas from a
    single direction
    :arg direction_pos — (list of str) parts of speech for every
    lemma in the direction

    :uses mystem — (pymystem3.Mystem) an instance of Mystem,
    rule-based morphological analyzer
    :uses ner_tags — (list of str) grammar markers that identify a name

    :returns ner_lemmas — (list of str) lemmas from a given direction
    with names replaced with 'Имя'
    :returns ner_pos — (list of str) corresponding parts of speech for
    lemmas that are left; those replaced with 'Имя' get a 'PERSN' tag.
    """
    ner_lemmas = []
    ner_pos = []

    for i, dir_lemma in enumerate(direction_lemmas):
        lemma_analyses = mystem.analyze(dir_lemma)
        # Grammar string of the first analysis of the first token returned.
        all_grammar = lemma_analyses[0]["analysis"][0]["gr"]

        # A lemma is a name as soon as any marker occurs in its grammar
        # string. Testing against the whole list (instead of comparing a
        # counter to a fixed 3) keeps this correct when ner_tags changes.
        if any(tag in all_grammar for tag in ner_tags):
            ner_lemmas.append("Имя")
            ner_pos.append("PERSN")
        else:
            ner_lemmas.append(dir_lemma)
            ner_pos.append(direction_pos[i])
    return ner_lemmas, ner_pos

## 3 &emsp; Applying functions to the dataframe

In [56]:
# Lemmatize every direction text; each row gets its own list of lemmas.
df["lemmas"] = [extract_lemmas(direction_text) for direction_text in df["text"]]
df.head()

Unnamed: 0,text,setting,entrance,exit,business,delivery,modifier,location,unknown,play,year,lemmas
0,"Игроки, князь Звездич, Казарин и Шприх. За сто...",1,0,0,1,0,0,0,0,lermontov-maskarad,1842,"[игрок, князь, звездич, казарин, и, шприх, за,..."
1,(тихо первому),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,"[тихо, первый]"
2,(насмешливо),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,[насмешливо]
3,"(сквозь зубы, уходя)",0,0,1,0,1,0,0,0,lermontov-maskarad,1842,"[сквозь, зуб, уходить]"
4,"Князь, выпив стакан лимонаду, садится к сторон...",0,0,0,1,0,0,0,0,lermontov-maskarad,1842,"[князь, выпивать, стакан, лимонад, садиться, к..."


In [85]:
# creating a list of clear lemmas for later NER
clean_lemmas = [drop_stops(lemma) for lemma in df["lemmas"]]
df["clean_lemmas"] = clean_lemmas
df.head()

Unnamed: 0,text,setting,entrance,exit,business,delivery,modifier,location,unknown,play,year,lemmas,clean_lemmas,pos
0,"Игроки, князь Звездич, Казарин и Шприх. За сто...",1,0,0,1,0,0,0,0,lermontov-maskarad,1842,"[игрок, князь, звездич, казарин, и, шприх, за,...","[игрок, князь, звездич, казарин, шприх, стол, ...","[S, S, S, S, S, S, V, S, V, ADV, V]"
1,(тихо первому),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,"[тихо, первый]","[тихо, первый]","[ADV, ANUM]"
2,(насмешливо),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,[насмешливо],[насмешливо],[ADV]
3,"(сквозь зубы, уходя)",0,0,1,0,1,0,0,0,lermontov-maskarad,1842,"[сквозь, зуб, уходить]","[сквозь, зуб, уходить]","[PR, S, V]"
4,"Князь, выпив стакан лимонаду, садится к сторон...",0,0,0,1,0,0,0,0,lermontov-maskarad,1842,"[князь, выпивать, стакан, лимонад, садиться, к...","[князь, выпивать, стакан, лимонад, садиться, с...","[S, V, S, S, V, S, V]"


In [88]:
# Part-of-speech tags for the stopword-free lemmas of every direction.
clean_pos = list(map(get_pos, df["clean_lemmas"]))
df["pos"] = clean_pos
df.head()

Unnamed: 0,text,setting,entrance,exit,business,delivery,modifier,location,unknown,play,year,lemmas,clean_lemmas,pos
0,"Игроки, князь Звездич, Казарин и Шприх. За сто...",1,0,0,1,0,0,0,0,lermontov-maskarad,1842,"[игрок, князь, звездич, казарин, и, шприх, за,...","[игрок, князь, звездич, казарин, шприх, стол, ...","[S, S, S, S, S, S, V, S, V, ADV, V]"
1,(тихо первому),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,"[тихо, первый]","[тихо, первый]","[ADV, ANUM]"
2,(насмешливо),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,[насмешливо],[насмешливо],[ADV]
3,"(сквозь зубы, уходя)",0,0,1,0,1,0,0,0,lermontov-maskarad,1842,"[сквозь, зуб, уходить]","[сквозь, зуб, уходить]","[PR, S, V]"
4,"Князь, выпив стакан лимонаду, садится к сторон...",0,0,0,1,0,0,0,0,lermontov-maskarad,1842,"[князь, выпивать, стакан, лимонад, садиться, к...","[князь, выпивать, стакан, лимонад, садиться, с...","[S, V, S, S, V, S, V]"


In [91]:
# Run the name replacement on every direction, then split the resulting
# (lemmas, pos) pairs into two parallel lists.
ner_pairs = [
    extract_names(lemmas, pos)
    for lemmas, pos in zip(clean_lemmas, clean_pos)
]
ner_lemmas = [pair[0] for pair in ner_pairs]
ner_pos = [pair[1] for pair in ner_pairs]

In [94]:
# Attach the name-replaced lemmas and their tags as two new columns.
df = df.assign(ner_lemmas=ner_lemmas, ner_pos=ner_pos)
df.head()

Unnamed: 0,text,setting,entrance,exit,business,delivery,modifier,location,unknown,play,year,lemmas,clean_lemmas,pos,ner_lemmas,ner_pos
0,"Игроки, князь Звездич, Казарин и Шприх. За сто...",1,0,0,1,0,0,0,0,lermontov-maskarad,1842,"[игрок, князь, звездич, казарин, и, шприх, за,...","[игрок, князь, звездич, казарин, шприх, стол, ...","[S, S, S, S, S, S, V, S, V, ADV, V]","[игрок, князь, Имя, Имя, шприх, стол, метать, ...","[S, S, PERSN, PERSN, S, S, V, S, V, ADV, V]"
1,(тихо первому),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,"[тихо, первый]","[тихо, первый]","[ADV, ANUM]","[тихо, первый]","[ADV, ANUM]"
2,(насмешливо),0,0,0,0,1,0,0,0,lermontov-maskarad,1842,[насмешливо],[насмешливо],[ADV],[насмешливо],[ADV]
3,"(сквозь зубы, уходя)",0,0,1,0,1,0,0,0,lermontov-maskarad,1842,"[сквозь, зуб, уходить]","[сквозь, зуб, уходить]","[PR, S, V]","[сквозь, зуб, уходить]","[PR, S, V]"
4,"Князь, выпив стакан лимонаду, садится к сторон...",0,0,0,1,0,0,0,0,lermontov-maskarad,1842,"[князь, выпивать, стакан, лимонад, садиться, к...","[князь, выпивать, стакан, лимонад, садиться, с...","[S, V, S, S, V, S, V]","[князь, выпивать, стакан, лимонад, садиться, с...","[S, V, S, S, V, S, V]"


In [95]:
# Write the enriched table to disk: semicolon-separated, UTF-8, without
# the index column.
output_csv = "./data/csv/directions_morphology.csv"
df.to_csv(output_csv, sep=";", encoding="utf-8", index=False)