## Udpiping and preprocessing

What is here

- we take a subset of texts from our data
- parse them with the UDPipe model russian-syntagrus-ud-2.3-181115 (the model file loaded in the code below)
- convert conllu to pd.df
- write each text in a separate .csv file
- read from each file and preprocess them (add features, filter columns)
- write each preprocessed df (text) back to its own .csv file in the ./data/texts_udpipe folder

In [1]:
import pandas as pd
import warnings
import ufal.udpipe

import numpy as np

import os
import os.path

warnings.filterwarnings('ignore')

In [2]:
# Read the subset of corpus files into memory, keyed by file name.
# TODO: replace with loading data not from local files
texts = {}

for i in range(7, 13):
    text_name = f"detcorpus ({i}).txt"
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding (the corpus is Russian text).
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(f"/Users/macbook/Downloads/{text_name}", encoding="utf-8") as f:
        texts[text_name] = f.read()

In [3]:
# Google implementation of udpipe
# TODO: replace with custom udpiper

# Path of the UDPipe model file that is handed to Model(...) when the
# pipeline runs. It is a hard-coded local path, so it has to be adjusted
# on any other machine.
udpipe_model = "/Users/macbook/Downloads/russian-syntagrus-ud-2.3-181115.udpipe"

import ufal.udpipe

class Model:
    """Convenience wrapper around a loaded ``ufal.udpipe.Model``.

    Groups the steps used by this notebook: reading/tokenizing text into
    ``ufal.udpipe.Sentence`` objects, tagging and parsing them in place, and
    serializing them back to text in a chosen output format.
    """

    def __init__(self, path):
        """Load the UDPipe model stored at ``path``.

        Raises:
            RuntimeError: if the model cannot be loaded from ``path``.
        """
        self.model = ufal.udpipe.Model.load(path)
        if not self.model:
            raise RuntimeError("Cannot load UDPipe model from file '%s'" % path)

    def tokenize(self, text):
        """Tokenize ``text`` and return a list of ufal.udpipe.Sentence-s.

        Raises:
            RuntimeError: if the model has no tokenizer, or if reading fails.
        """
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if not tokenizer:
            raise RuntimeError("The model does not have a tokenizer")
        return self._read(text, tokenizer)

    def read(self, text, in_format):
        """Load ``text`` given in ``in_format`` (conllu|horizontal|vertical).

        Returns:
            A list of ufal.udpipe.Sentence-s.

        Raises:
            ValueError: if ``in_format`` is not a known input format.
            RuntimeError: if reading fails.
        """
        input_format = ufal.udpipe.InputFormat.newInputFormat(in_format)
        if not input_format:
            raise ValueError("Cannot create input format '%s'" % in_format)
        return self._read(text, input_format)

    def _read(self, text, input_format):
        """Feed ``text`` to ``input_format`` and collect every sentence it yields."""
        input_format.setText(text)
        error = ufal.udpipe.ProcessingError()
        sentences = []

        # The reader fills the Sentence object that is passed in, and the
        # list keeps a reference to it, so a fresh object is created for
        # every call instead of reusing one.
        sentence = ufal.udpipe.Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            sentence = ufal.udpipe.Sentence()
        if error.occurred():
            raise RuntimeError(error.message)

        return sentences

    def tag(self, sentence):
        """Tag the given ufal.udpipe.Sentence (inplace)."""
        self.model.tag(sentence, self.model.DEFAULT)

    def parse(self, sentence):
        """Parse the given ufal.udpipe.Sentence (inplace)."""
        self.model.parse(sentence, self.model.DEFAULT)

    def write(self, sentences, out_format):
        """Serialize ``sentences`` in ``out_format`` (conllu|horizontal|vertical).

        Returns:
            One string: every sentence in order, followed by whatever the
            output format emits when the document is finished.

        Raises:
            ValueError: if ``out_format`` is not a known output format
                (mirrors the check done in ``read``).
        """
        output_format = ufal.udpipe.OutputFormat.newOutputFormat(out_format)
        if not output_format:
            raise ValueError("Cannot create output format '%s'" % out_format)

        # Collect the pieces and join once; repeated ``+=`` on a growing
        # string can be quadratic for long documents.
        chunks = [output_format.writeSentence(sentence) for sentence in sentences]
        chunks.append(output_format.finishDocument())

        return ''.join(chunks)

In [4]:
%%time
from conllu import parse

# Run UDPipe (tokenize -> tag -> parse) over every text and re-read the
# CoNLL-U output with `conllu.parse`.
model = Model(udpipe_model)
udpipe_sents = []

for raw_text in texts.values():
    sentences = model.tokenize(raw_text)
    for s in sentences:
        model.tag(s)
        model.parse(s)
    data = model.write(sentences, "conllu")
    udpipe_sents.append(parse(data))

# list of lists of pairs: each element is udpipe output for a text,
# i.e. a list of (1-based sentence number, word with its features)
udpipe_inds = [
    [(sent_id, word)
     for sent_id, sent in enumerate(text_sents, start=1)
     for word in sent]
    for text_sents in udpipe_sents
]

# writing all the dfs of texts into separate .csvs
# Create the target folder first so that to_csv does not fail on a fresh
# checkout where ./data/texts_udpipe does not exist yet.
os.makedirs("./data/texts_udpipe", exist_ok=True)

for text, pairs in enumerate(udpipe_inds):
    udpipe_df_inds = pd.DataFrame(pairs, columns=['sent_id', 'word'])
    # Expand each word record into one column per field ...
    udpipe_df = pd.DataFrame(udpipe_df_inds['word'].tolist())
    # ... and put the sentence number back in front, aligned on the row index.
    udpipe_df = pd.merge(udpipe_df_inds['sent_id'],
                         udpipe_df, left_index=True, right_index=True, how='right')
    udpipe_df.to_csv(f"./data/texts_udpipe/{text}_text.csv", index=False)

CPU times: user 1min 33s, sys: 883 ms, total: 1min 34s
Wall time: 1min 35s


In [5]:
# TODO: I would rather read all the features but it is computationally less efficient 
# than considering only those variables that are needed for rules
# DON'T uncomment the following line
# feature_sets = [{ind: list(vals['feats'].items())} for ind, vals in udpipe_df.iterrows() if vals['feats']]

In [2]:
def _add_rule_features(udpipe_df, uuid_sep=""):
    """Add the columns needed by the rules to one parsed text, in place.

    The ``feats`` column is cast to ``str`` and every feature is detected by
    a literal substring search for ``'Name', 'Value'`` in that text.

    Columns written, in this order: ``anim``, ``gender``, ``number``,
    ``pronoun_person``, ``hero``, ``uuid``, ``head_uuid``.

    Args:
        udpipe_df: DataFrame with the columns ``sent_id``, ``id``, ``head``,
            ``upostag``, ``xpostag``, ``feats`` and ``deprel``.
        uuid_sep: String placed between the sentence id and the token id in
            ``uuid`` / ``head_uuid``. The default ``""`` keeps the historical
            format, which is ambiguous (sentence 1 / token 11 and
            sentence 11 / token 1 both become ``"111"``); pass e.g. ``"_"``
            to get collision-free ids.

    Returns:
        The same DataFrame object, for convenience.
    """
    feats = udpipe_df.feats.astype(str)
    udpipe_df['feats'] = feats

    def has_feat(name, value):
        # Literal (non-regex) match of one feature pair in the feats text.
        return feats.str.contains(f"'{name}', '{value}'", regex=False)

    # Animate: marked as such in feats, or tagged "PRP" in xpostag.
    is_animate = has_feat("Animacy", "Anim") | (udpipe_df.xpostag == "PRP")
    udpipe_df['anim'] = np.where(is_animate, 1, 0)

    # For the label columns the second np.where wins when both match.
    # NOTE: mixing a string with np.nan in np.where yields the text "nan" for
    # the "no value" case; it is written to the CSV as `nan`.
    udpipe_df['gender'] = np.where(has_feat("Gender", "Fem"), "Fem", np.nan)
    udpipe_df['gender'] = np.where(has_feat("Gender", "Masc"), "Masc", udpipe_df['gender'])
    udpipe_df['number'] = np.where(has_feat("Number", "Sing"), "Sing", np.nan)
    udpipe_df['number'] = np.where(has_feat("Number", "Plur"), "Plur", udpipe_df['number'])

    # Pronouns are "Deictic" for 1st/2nd person and "Non-deictic" otherwise.
    is_pronoun = udpipe_df.upostag == "PRON"
    is_first_or_second = has_feat("Person", "1") | has_feat("Person", "2")
    udpipe_df['pronoun_person'] = np.where(is_pronoun, "Non-deictic", np.nan)
    udpipe_df['pronoun_person'] = np.where(is_pronoun & is_first_or_second,
                                           "Deictic", udpipe_df['pronoun_person'])

    # "Hero": an animate token whose dependency relation contains "nsubj".
    is_subject = udpipe_df.deprel.str.contains("nsubj", regex=False, na=False)
    udpipe_df['hero'] = np.where((udpipe_df.anim == 1) & is_subject, 1, 0)

    # Ids built from the sentence number plus the token id / head id.
    sent_ids = udpipe_df.sent_id.tolist()
    udpipe_df['uuid'] = [str(s) + uuid_sep + str(t)
                         for s, t in zip(sent_ids, udpipe_df.id.tolist())]
    udpipe_df['head_uuid'] = [str(s) + uuid_sep + str(h)
                              for s, h in zip(sent_ids, udpipe_df['head'].tolist())]

    return udpipe_df


def preprocessing_data(folder_name, uuid_sep=""):
    """Add the rule features to every ``.csv`` file under a folder, in place.

    The folder is walked recursively as ``f"./{folder_name}"``, i.e. relative
    to the current working directory. Each file is read, extended with the
    feature columns and written back to the same path without the index.

    Args:
        folder_name: Folder to walk, relative to the working directory.
        uuid_sep: Separator used inside ``uuid`` / ``head_uuid``. The default
            ``""`` keeps the historical (ambiguous) concatenation.
    """
    # Materialize the list before touching any file, since files are
    # rewritten in place while we go.
    files = [os.path.join(dirpath, filename)
             for dirpath, _dirnames, filenames in os.walk(f"./{folder_name}")
             for filename in filenames
             if filename.endswith(".csv")]

    for file in files:
        udpipe_df = pd.read_csv(file)
        _add_rule_features(udpipe_df, uuid_sep=uuid_sep)
        udpipe_df.to_csv(file, index=False)

In [3]:
# NOTE: preprocessing_data walks f"./{folder_name}", so this leading-slash
# argument is looked up under the current working directory
# (./data/texts_udpipe), not at the filesystem root.
preprocessing_data("/data/texts_udpipe")