# Data: preprocessing

In [1]:
# !pip install docx2txt
# !pip install doc2docx
# !pip install chardet1
# !pip install zstandard
# !pip install jsonlines
# !pip install transliterate

import os
import re
import json
from unicodedata import normalize
from itertools import chain
from collections import Counter

import pickle
import docx2txt
import chardet
import Levenshtein
import spacy
import numpy as np
import pandas as pd
import pyarrow as pa
from tqdm.auto import tqdm
from doc2docx import convert
from unidecode import unidecode
from transliterate import translit
from datasets import load_dataset

In [2]:
def walk_corpus(dirname):
    files = []
    for cont in os.walk(dirname):
        for file in cont[-1]:
            files.append(os.path.join(cont[0], file))
    return files

In [3]:
def implode_spaces(text):
    text = re.sub(r'\s*?[\r\n]+\s*?', '\n', text, flags=re.DOTALL)
    text = re.sub(r'( |\t)+', ' ', text)
    return text.strip()

In [4]:
def replace_nbsp(text):
    return normalize('NFKD', text).replace('\u2028', '\n')

In [5]:
def lower_text(text):
    if pd.isnull(text):
        return np.nan
    return text.lower().strip()

In [6]:
def capitalize_names(text):
    if pd.isnull(text) or text.strip() in {'-', '–'}:
        return np.nan
    return text.strip().title()

In [7]:
def fight_yot(text):
    if isinstance(text, str):
        return text.replace('й', 'й')
    if pd.isnull(text):
        return np.nan
    return text

In [8]:
def get_unique(col):
    vals = [val for val in col if not pd.isnull(val)]
    try:
        return list(set(vals))[0]
    except IndexError:
        return np.nan

In [9]:
def estimate_length(text):
    return len(text.split())

## Pikabu
* Pikabu dataset ([Source](https://huggingface.co/datasets/IlyaGusev/pikabu))
* L1 Russian
* informal
* no annotation
* ✔ clean meta

In [None]:
# dataset = load_dataset('IlyaGusev/pikabu', split='train', trust_remote_code=True)

In [None]:
# src_dir = 'C:\\Users\\Елизавета Клыкова\\.cache\\huggingface\\datasets\\IlyaGusev___pikabu\\default\\0.0.1\\e8b165a9559d99e3bb36dab8e48521decc2be36ad86757c472b506d7788a96f5'
# files = [os.path.join(src_dir, file) for file in os.listdir(src_dir) if file.endswith('.arrow')]
# len(files)

In [None]:
# dfs = []
# for file in tqdm(files):
#     with pa.memory_map(file) as source:
#         array = pa.RecordBatchStreamReader(source).read_all()
#         df = array.to_pandas()
#         df = df[['id', 'title', 'text_markdown', 'timestamp', 'author_id',
#                  'username', 'rating', 'pluses', 'minuses', 'url', 'tags']]
#         df['tags'] = df['tags'].map(' | '.join)
#         df = df.convert_dtypes()
#         print(df.shape)
#         dfs.append(df)

In [None]:
# data = pd.concat(dfs)
# data = data[['id', 'url', 'title', 'text_markdown', 'timestamp', 'author_id', 'pluses', 'minuses']]
# data.rename(columns={'text_markdown': 'text', 'id': 'doc_id', 'timestamp': 'date'}, inplace=True)
# data.dropna(subset=['text'], inplace=True)
# data['num_words'] = data['text'].map(estimate_length)
# data = data[data['num_words'] != 0]
# print(data.shape)
# data.head(3)

In [None]:
# data.to_csv('pikabu.tsv', sep='\t', index=False)

## Reddit
* Webis-TLDR-17 Corpus ([Source](https://zenodo.org/records/1043504#.Wzt7PbhXryo))
* L1 English
* informal
* no annotation
* ✔ clean meta

In [None]:
# keys = {'id', 'title', 'author', 'subreddit', 'subreddit_id',
#         'content', 'content_len', 'summary', 'summary_len'}
# lines = []
# with open('reddit/corpus-webis-tldr-17.json', encoding='utf-8') as f:
#     for i in tqdm(range(3848331)):
#         line = next(f)
#         one = json.loads(line)
#         if 'title' in one:
#             lines.append({key: one[key] for key in keys if key in one})

In [None]:
# data = pd.DataFrame(data)
# data.rename(columns={'id': 'doc_id', 'author': 'author_id', 'summary': 'prompt'}, inplace=True)
# print(data.shape)
# data.head()

In [None]:
# data.to_csv('reddit.tsv', sep='\t', index=False)

## LOCNESS
* Louvain Corpus of Native English Essays ([Source](https://uclouvain.be/en/research-institutes/ilc/cecl/locness.html))
* L1 English
* academic
* no annotation
* ✔ clean meta

In [9]:
files = [os.path.join('locness', file)
         for file in os.listdir('locness') if file.endswith('.txt')]

In [10]:
texts = []
for file in files:
    with open(file, 'r') as f:
        cont = f.read()
    if '<ICLE' in cont:
        parts = ['<ICLE' + part for part in cont.split('<ICLE')]
    else:
        parts = re.split(r'\n{3,}', cont)
    for part in parts:
        code = part.strip().split('\n')[0]
        text = part.replace(code, '').strip()
        if code and text:
            texts.append({'filename': file,
                          'code': code,
                          'text': text})

In [11]:
data = pd.DataFrame(texts)
print(data.shape)
data.head()

(412, 3)


Unnamed: 0,filename,code,text
0,locness\alevels1.txt,Transport 01,The basic dilema facing the UK's rail and road...
1,locness\alevels1.txt,Transport 02,Traffic jams are becoming larger and more freq...
2,locness\alevels1.txt,Transport 03,As transport has advanced over the past 200 ye...
3,locness\alevels1.txt,Transport 04,The problems facing road transport are numerou...
4,locness\alevels1.txt,Transport 05,The number of cars in the United Kingdom has g...


In [12]:
meta = pd.read_excel('locness/locness_meta.xlsx')
print(meta.shape)
meta.head()

(412, 9)


Unnamed: 0,filename,code,language,speaker_type,dialect,topic,task_type,level,location
0,locness\alevels1.txt,Transport 01,english,L1,british,Transport,argumentative,a-level,
1,locness\alevels1.txt,Transport 02,english,L1,british,Transport,argumentative,a-level,
2,locness\alevels1.txt,Transport 03,english,L1,british,Transport,argumentative,a-level,
3,locness\alevels1.txt,Transport 04,english,L1,british,Transport,argumentative,a-level,
4,locness\alevels1.txt,Transport 05,english,L1,british,Transport,argumentative,a-level,


In [13]:
data = data.merge(meta, how='left', on=['filename', 'code'])
data.replace({'': np.nan}, inplace=True)
print(data.shape)
data.head(3)

(412, 10)


Unnamed: 0,filename,code,text,language,speaker_type,dialect,topic,task_type,level,location
0,locness\alevels1.txt,Transport 01,The basic dilema facing the UK's rail and road...,english,L1,british,Transport,argumentative,a-level,
1,locness\alevels1.txt,Transport 02,Traffic jams are becoming larger and more freq...,english,L1,british,Transport,argumentative,a-level,
2,locness\alevels1.txt,Transport 03,As transport has advanced over the past 200 ye...,english,L1,british,Transport,argumentative,a-level,


In [14]:
data['text'] = data['text'].map(implode_spaces)
data = data[data['text'] != 'lost']
print(data.shape)

(411, 10)


In [15]:
data.to_json('locness.json', orient='records', force_ascii=False, indent=4)

## CoRST / КРУТ
* Corpus of Russian Student Texts / Корпус Русских Учебных Текстов ([Source](http://web-corpora.net/learner_corpus))
* L1 Russian
* academic
* annotated with MyStem, no error annotation
* ✔ clean meta

In [None]:
meta = pd.read_csv('corst/annotator_document.csv', sep='\t')
print(meta.shape)
meta.head(3)

In [None]:
anno = pd.read_csv('corst/annotator_sentence.csv', sep='\t')
print(anno.shape)
anno.head(5)

In [None]:
meta.rename(columns={'id': 'doc_id'}, inplace=True)
anno.rename(columns={'id': 'sent_id', 'doc_id_id': 'doc_id'}, inplace=True)

In [None]:
crut = anno.merge(meta, how='left', on='doc_id')
print(crut.shape)
crut.head(3)

In [None]:
crut.replace({'': np.nan}, inplace=True)

In [None]:
crut = crut[['doc_id', 'text', 'created', 'body', 'author',
             'date1', 'date2', 'genre', 'gender', 'major',
             'course', 'term', 'module', 'domain',
             'university', 'words', 'sentences']]

In [None]:
crut = crut.groupby('doc_id').agg(list)

In [None]:
crut_cols = ['created', 'body', 'author', 'date1', 'date2',
             'genre', 'gender', 'major', 'course', 'term',
             'module', 'domain', 'university', 'words', 'sentences']

for col in crut_cols:
    crut[col] = crut[col].map(get_unique)

In [None]:
crut.dropna(how='all', axis=0, inplace=True)
crut.dropna(how='all', axis=1, inplace=True)
print(crut.shape)

In [None]:
def get_text_from_body(text, body):
    if body != 'loaded from xml':
        return implode_spaces(body)
    return implode_spaces(' '.join(text))

In [None]:
crut['text'] = crut.apply(lambda x: get_text_from_body(x['text'], x['body']), axis=1)

In [None]:
def map_hse(text):
    if pd.isnull(text):
        return np.nan
    if 'вшэ' in text.lower():
        return 'НИУ ВШЭ'
    return text.strip()

In [None]:
crut.replace({'gender': {'ж': 'f', 'м': 'm'}}, inplace=True)

crut['author'] = crut['author'].map(capitalize_names)
crut['genre'] = crut['genre'].map(lower_text)
crut['course'] = crut['course'].map(lower_text)
crut['domain'] = crut['domain'].map(lower_text)
crut['university'] = crut['university'].map(map_hse)

crut['created'] = pd.to_datetime(crut['created'], format='%Y-%m-%d %H:%M:%S')
crut['date1'] = pd.to_datetime(crut['date1'], format='%Y').dt.year
crut['date2'] = pd.to_datetime(crut['date2'], format='%Y', errors='coerce').dt.year

crut['language'] = 'russian'
crut['speaker_type'] = 'L1'

In [None]:
crut.drop(columns=['body'], inplace=True)
crut.drop_duplicates(inplace=True)
crut.reset_index(inplace=True)
print(crut.shape)
crut.head(3)

In [None]:
crut.to_json('corst.json', orient='records', force_ascii=False, indent=4)

## RULEC
* Russian Learner Corpus of Academic Writing ([Source](http://www.web-corpora.net/RLC/rulec))
* L2 Russian, longitudinal
* academic
* no annotation
* ✔ clean meta

!!! group work is excluded

In [9]:
files = walk_corpus('rulec')

In [11]:
content = []
for file in tqdm(files):
    if not file.startswith('rulec\\Group work'):
        with open(file, 'r', encoding='utf-16') as f:
            cont = f.read()
            cont = cont.replace('Mode: individual\n', 'Mode: individual>\n')
            text = cont.strip()
            try:
                meta = re.search(r'<.+?>', cont, flags=re.DOTALL).group().strip()
                if meta.startswith('<Student') or meta.startswith('<Name'):
                    text = cont.replace(meta, '').strip()
                else:
                    meta = ''
            except AttributeError:
                print(file)
                continue
            content.append({'filename': os.path.basename(file),
                            'meta': meta,
                            'text': text})

  0%|          | 0/2477 [00:00<?, ?it/s]

In [12]:
data = pd.DataFrame(content)
print(data.shape)
data.head(3)

(2461, 3)


Unnamed: 0,filename,meta,text
0,Anna_HL_2009-2010_Week_19_2_paragraph+_descrip...,<Student: Anna\nGender: F\nLanguage background...,<Представьте термин из области вашей специальн...
1,Anna_HL_AL_2009-2010_Week_12_1_paragraph_summa...,<Student: Anna\nGender: F\nLanguage background...,Автор статьи излагает положительные и отрицате...
2,Anna_HL_AL_2009-2010_Week_12_2_paragraph_suppo...,<Student: Anna\nGender: F\nLanguage background...,"Я считаю, что Сталин любил этот стил архитекту..."


In [13]:
def parse_meta(meta):
    if not meta:
        return {}
    meta = meta.strip('<>')
    parts = meta.split('\n')
    split_meta = {}
    for part in parts:
        if not part:
            continue
        if part.startswith('Course '):
            part = part.replace('Course ', 'Course: ')
        if part.startswith('Level '):
            part = part.replace('Level ', 'Level: ')
        part = part.replace('Name:', 'Student:').replace('Wee:', 'Week:').replace(
            'Term:', 'Week:').replace('Term and Week:', 'Week:').replace(
            'Text typef:', 'Text type:').replace('Format:', 'Text type:')
        part = re.sub('^Type:', 'Time:', part)
        try:
            tag = part.split(':')[0].strip()
            cont = part.replace(tag, '').strip(': ')
        except ValueError:
            print(meta)
            continue
        split_meta[tag.lower()] = cont.strip()
    return split_meta

In [14]:
records = data.to_dict(orient='records')
for row in records:
    row.update(parse_meta(row['meta']))

In [15]:
data = pd.DataFrame(records)
data.replace({'': np.nan}, inplace=True)
data['text'] = data['text'].map(replace_nbsp)
data['text'] = data['text'].map(implode_spaces)

In [16]:
def unify_year(year):
    year = year.replace('–', '-').replace(' ', '').replace('\\', '')
    if not '-20' in year:
        year = year.replace('-', '-20')
    return year

In [17]:
data['year'] = data['year'].map(unify_year)

In [18]:
def unify_mode(mode):
    if pd.isnull(mode):
        return mode
    return mode.lower()

In [19]:
data['mode'] = data['mode'].map(unify_mode)

In [20]:
def unify_type(text_type):
    text_type = re.sub(r'pargraph|paragrpah|paragrapah|pararaph', 'paragraph', text_type)
    text_type = re.sub(r'sentnece|sentece', 'sentence', text_type)
    if 'list' in text_type:
        return 'list'
    if 'paragraph' in text_type:
        return 'paragraph'
    if 'sentence' in text_type:
        return 'sentence'
    return text_type

In [21]:
data['text type'] = data['text type'].map(unify_type)

In [22]:
def unify_function(func):
    func = re.sub(r'argumenation|argumentaiton|argumentative', 'argumentation', func)
    func = re.sub(r'defintiion|definitions|defintion|deinition', 'definition', func)
    func = func.replace('exository', 'expository').replace(
        'exersice', 'exercise').replace('opinioin', 'opinion').replace(
        'sumamry', 'summary')
    if func.startswith('argumentation'):
        return 'argumentation'
    if func.startswith('definition'):
        return 'definition'
    if func.startswith('compare'):
        return 'compare/contrast'
    if func.startswith('description'):
        return 'description'
    if func.startswith('expository'):
        return 'expository'
    if func.startswith('research paper'):
        return 'research paper'
    if func.startswith('supported opinion'):
        return 'supported opinion'
    if func.startswith('summary'):
        return 'summary'
    if func.startswith('plan'):
        return 'plan'
    if func.startswith('question'):
        return 'question'
    if 'notes' in func:
        return 'notes'
    return func

In [23]:
data['function'] = data['function'].map(unify_function)

In [24]:
def unify_time(timed):
    timed = timed.lower()
    if 'non' in timed:
        return 'non-timed'
    dur = timed.replace('(timed)', '').replace('timed', '').strip(' )(').replace('min', ' min')
    dur = re.sub(r'\s+', ' ', dur).replace(' - ', '-')
    if not dur:
        return 'timed'
    return f'timed, {dur}'

In [25]:
data['time'] = data['time'].map(unify_time)

In [26]:
data['gender'] = data['gender'].map(lower_text)
data['course'] = data['course'].map(capitalize_names)
data['week'] = data.apply(lambda x: x['week'].replace('-', '_').strip(), axis=1)

data['language'] = 'russian'
data.rename(columns={'language background': 'speaker_type'}, inplace=True)

In [27]:
data.drop(columns=['meta'], inplace=True)
dedupl = list(data.columns).remove('filename')
data.drop_duplicates(dedupl, inplace=True)
print(data.shape)
data.head(3)

(2461, 14)


Unnamed: 0,filename,text,student,gender,speaker_type,level,year,course,week,text type,function,time,mode,language
0,Anna_HL_2009-2010_Week_19_2_paragraph+_descrip...,<Представьте термин из области вашей специаль...,Anna,f,HL,AL,2009-2010,Russian In The Major,19_2,paragraph,description,non-timed,individual,russian
1,Anna_HL_AL_2009-2010_Week_12_1_paragraph_summa...,Автор статьи излагает положительные и отрицате...,Anna,f,HL,AL,2009-2010,American Studies,12_1,paragraph,summary,non-timed,individual,russian
2,Anna_HL_AL_2009-2010_Week_12_2_paragraph_suppo...,"Я считаю, что Сталин любил этот стил архитекту...",Anna,f,HL,AL,2009-2010,European Studies,12_2,paragraph,supported opinion,"timed, 10 min",individual,russian


In [28]:
data['task'] = data['text'].map(lambda x: re.search(r'^<.*?>\n', x).group()
                                if re.match(r'^<.*?>\n', x) else np.nan)

In [29]:
recs = data.to_dict(orient='records')
for rec in recs:
    if pd.notnull(rec['task']):
        rec['text'] = rec['text'].replace(rec['task'], '').strip()
data = pd.DataFrame(recs)

In [30]:
data.to_json('rulec.json', orient='records', force_ascii=False, indent=4)

## REALEC
* Russian Error-Annotated Learner English Corpus ([Source](https://realec.org/))
* L2 English
* academic / argumentative
* error-annotated
* ✔ clean meta

In [None]:
files = walk_corpus('realec')

In [None]:
seen_files = set()
content = []
for file in tqdm(files):
    no_ext = os.path.splitext(file)[0]
    if no_ext in seen_files:
        continue
    doc = {'filename': os.path.basename(file)}
    for ext in ['.ann', '.json', '.txt']:
        try:
            with open(no_ext+ext, 'r', encoding='utf-8') as f:
                doc[ext] = f.read()
        except FileNotFoundError:
            pass
    seen_files.update({no_ext})
    content.append(doc)

In [None]:
data = pd.DataFrame(content)
print(data.shape)
data.head()

In [None]:
data.dropna(subset=['.ann', '.txt'], how='all', inplace=True)
print(data.shape)
data.drop_duplicates(subset=['.json', '.ann', '.txt'], inplace=True)
print(data.shape)

In [None]:
def json_from_string(json_str):
    if not json_str or pd.isnull(json_str):
        return np.nan
    return json.loads(json_str)

In [None]:
data['.json'] = data['.json'].map(json_from_string)

In [None]:
recs = data.to_dict(orient='records')
for rec in recs:
    if not (pd.isnull(rec['.json'])):
        rec.update(rec['.json'])

In [None]:
data = pd.DataFrame(recs)
data.replace({'': np.nan, 'None': np.nan}, inplace=True)
data.dropna(how='all', axis=0, inplace=True)
data.dropna(how='all', axis=1, inplace=True)
print(data.shape)
data.head(3)

In [None]:
for col in ['sex', 'ielts', 'work_type', 'text_type']:
    data[col] = data[col].map(lower_text)

In [None]:
def get_first_symbol(text):
    if pd.isnull(text):
        return np.nan
    return text[0]

In [None]:
data.replace({'sex': {'&#1100;': 'm', 'а': 'f'}}, inplace=True)
data['sex'] = data['sex'].map(get_first_symbol)

In [None]:
def get_bool_from_str(text):
    if pd.isnull(text):
        return np.nan
    if 't' in text or 'yes' in text:
        return True
    return False

In [None]:
data['ielts'] = data['ielts'].map(get_bool_from_str)

In [None]:
data.replace({'text_type': {'graph descritpion': 'graph description'}}, inplace=True)
data.replace({'CEFR_level': {'B+': np.nan}}, inplace=True)
data.replace({'date': {'30.03': '30.03.2016'}}, inplace=True)

In [None]:
data['date'] = pd.to_datetime(data['date'], format='mixed', dayfirst=True)

data.loc[data['study_year'] == '2016', 'year'] = '2016'
data.loc[data['study_year'] == '2017', 'year'] = '2017'
data.loc[data['study_year'].isin({'2016', '2017'}), 'study_year'] = np.nan

data['year'] = pd.to_datetime(data['year'], format='%Y').dt.year

In [None]:
def modify_study_year(str_year):
    if pd.isnull(str_year):
        return np.nan
    if isinstance(str_year, str):
        try:
            return int(str_year.strip())
        except ValueError:
            pass
    return str_year

In [None]:
data['study_year'] = data['study_year'].map(modify_study_year)
data['ann_checked'] = pd.to_numeric(data['ann_checked'])

In [None]:
def convert_marks(value, task):
    if pd.isnull(value):
        return np.nan
    if isinstance(value, str):
        value = value.replace(',', '.').strip('%').strip()
    try:
        num_val = float(value)
        if num_val < 10:
            num_val *= 10
    except ValueError:
        num_val = value
        if '/' in value:
            vals = value.split('/')
            if task == 'graph description':
                num_val = float(vals[0])
            else:
                num_val = float(vals[1])
        elif '(overall)' in value:
            try:
                num_val = float(value.replace('(overall)', '').strip())
            except ValueError:
                value = re.sub(r'\([^0567]+?\)', '', value).strip()
                if ' и ' in value:
                    vals = value.split(' и ')
                    if task == 'graph description':
                        num_val = float(vals[0])
                    else:
                        num_val = float(vals[1])
                elif '(' not in value:
                    num_val = float(value)
                else:
                    vals = value.split('(')
                    if task == 'graph description':
                        num_val = float(vals[0])
                    else:
                        num_val = float(vals[1].strip(')'))
    return num_val

In [None]:
data['mark'] = data.apply(lambda x: convert_marks(x['mark'], x['text_type']), axis=1)

data['language'] = 'english'
data['speaker_type'] = 'L2'

data.rename(columns={'.ann': 'annotation', '.txt': 'text'}, inplace=True)
data.drop(columns=['.json', 'task_id', 'department'], inplace=True)

print(data.shape)
data.head(3)

In [None]:
data.to_json('realec.json', orient='records', force_ascii=False, indent=4)

## RLC
* Russian Learner Corpus ([Source](http://web-corpora.net/RLC/))
* L2 Russian
* informal
* error-annotated
* ✔ clean meta

`documents.csv`
- **id:** The document ID.
- **subcorpus:** The subcorpus of RLC containing the document.
- **native:** The native laguage of the author.
- **language_background:** L2 speaker (FL) or heritage speaker (HL).
- **level:** Language level of the author.
- **words:** The number of words in the document.
- **sentences:** The number of sentences in the document.

In [10]:
docs = pd.read_csv('rlc/documents.csv')
print(docs.shape)
docs.head()

(2004, 7)


Unnamed: 0,id,subcorpus,native,language_background,level,words,sentences
0,1,RULEC,eng,HL,AM,431,22
1,3,RULEC,eng,HL,AM,245,17
2,5,RULEC,eng,FL,AM,472,22
3,6,RULEC,eng,FL,IH,319,24
4,7,RULEC,eng,HL,AL,44,2


`sentences.csv`
- **id:** The sentence ID of the form **document_id**XXX, where XXX is the three-digit number of the sentence within the document.
- **document_id:** The ID of the document containing the sentence.
- **sentence_index:** The number of the sentence within the document.
- **text:** The original sentence.
- **corrected:** The corrected sentence.
- **status:** _needs correction_ if it is known that the corrected sentence is not quite right; empty, otherwise.

In [11]:
sent = pd.read_csv('rlc/sentences.csv')
print(sent.shape)
sent.head()

(31519, 6)


Unnamed: 0,id,document_id,sentence_index,text,corrected,status
0,1001,1,1,Загрязнение тяжелыми металлами Дальнегорского ...,Загрязнение тяжелыми металлами Дальнегорского ...,
1,1002,1,2,Одной из самых главных экологических проблем н...,Одной из самых главных экологических проблем н...,
2,1003,1,3,Эта проблема особеннo характерна для тех местн...,Эта проблема особеннo характерна для тех местн...,
3,1004,1,4,Согласно проведенным исследованиям Тихоокеанск...,Согласно проведенным исследованиям Тихоокеанск...,
4,1005,1,5,В ходе исследования было выявлено высокое соде...,В ходе исследования было выявлено высокое соде...,


`annotations.csv`
- **id:** The error annotation ID.
- **sentence_id:** The ID of the sentence to which the annotation applies.
- **tag:** The error type.
- **quote:** The fragment containg an error.
- **correction:** The corrected fragment.
- **start:** The start offset of the original fragment within the sentence (indices refer to spaces between tokens; the start of the sentence has zero index).
- **end:** The end offset of the original fragment within the sentence (indices refer to spaces between tokens; the start of the sentence has zero index).
- **annotation_source:** _manual_ if annotation is entered by a person; _rlc-errant_ if the entire sentence was corrected by a person, but the edits were automatically extracted and annotated by RLC-ERRANT.

In [12]:
anno = pd.read_csv('rlc/annotations.csv')
print(anno.shape)
anno.head()

(41410, 8)


Unnamed: 0,id,sentence_id,tag,quote,correction,start,end,annotation_source
0,7944,1002,ortho,окружющей,окружающей,12,13,manual
1,101216,1006,agrcase,промышленным,промышленных,4,5,manual
2,18818,1009,syntax,воздействую,воздействуя,12,13,manual
3,119778,1009,"ortho,altern",желудожно-кишечный,желудочно-кишечный,17,18,manual
4,18819,1012,"syntax,transfer",это,,7,8,manual


`rlc_test.csv` contains a small separate dataset consisting of individual sentences and their annotated corrections. Each line corresponds to a single edit.
- **text_orig:** The original sentence.
- **text_cor:** The corrected sentence.
- **quote:** The fragment containg an error.
- **correction:** The corrected fragment.
- **tag:** The error type.

In [13]:
test = pd.read_csv('rlc/rlc_test.csv', sep=';')
print(test.shape)
test.head()

(555, 5)


Unnamed: 0,text_orig,text_cor,quote,correction,tag
0,Друг - это тот человек который прямо скажет мн...,"Друг - это тот человек, который прямо скажет м...",,",",Syntax Punct
1,Друг - это тот человек который прямо скажет мн...,"Друг - это тот человек, который прямо скажет м...",неправд,неправ,Morph Deriv
2,Друг - это тот человек который прямо скажет мн...,"Друг - это тот человек, который прямо скажет м...",дальшее,дальше,Com
3,Мама готовить вкусно и я учился у нее.,Мама готовит вкусно и я учился у нее.,готовить,готовит,AgrPers
4,"Там мы говорем с друзами, делаем домиини здани...","Там мы говорим с друзьями, делаем домашние зад...",говорем,говорим,Infl


### From sentences to whole documents with metadata

In [14]:
sent['corrected'].fillna(sent['text'], inplace=True)

In [15]:
gsent = sent.groupby('document_id').agg(
    {'text': lambda x: ' '.join(x),
     'corrected': lambda x: ' '.join(x),
     'status': set}).reset_index()
print(gsent.shape)
gsent.head()

(2004, 4)


Unnamed: 0,document_id,text,corrected,status
0,1,Загрязнение тяжелыми металлами Дальнегорского ...,Загрязнение тяжелыми металлами Дальнегорского ...,{nan}
1,3,Директору магазина « Адидас» М. И. Васильченко...,Директору магазина « Адидас» М. И. Васильченко...,{nan}
2,5,"Вывод. Спасибо, ребята, за хорошие ответы. Я м...","Вывод. Спасибо, ребята, за хорошие ответы. Я м...",{nan}
3,6,Записи. Первая проблема- что производить? вто...,Записи. Первая проблема - что производить? Вт...,{nan}
4,7,Стали строил монументальные здания чтобы эти з...,"Сталин строил монументальные здания, чтобы эти...",{nan}


In [16]:
data = gsent.merge(docs, how='left', left_on='document_id', right_on='id')
data['status'] = data['status'].map(lambda x: 'needs correction' if 'needs correction' in x else np.nan)
data.drop(columns=['id'], inplace=True)
data['language'] = 'russian'
data['speaker_type'] = 'L2'

In [17]:
def extract_prompts(text):
    prompt = ''
    if re.search(r'<[^a-z]+?>>?', text):
        prompts = re.findall(r'<[^a-z]+?>>?', text)
        for found in prompts:
            text = text.replace(found, '')
        prompt += '\n'.join(prompts)
    if re.match(r'[^<]{1,300}?>', text):
        prompt += '\n'
        prompt += re.search(r'[^<]{1,300}?>', text).group()
        text = re.sub(r'[^<]{1,300}?>', '', text)
    if prompt:
        prompt = prompt.strip()
    else:
        prompt = np.nan
    return implode_spaces(text).strip(), prompt

In [18]:
data[['text', 'prompt']] = data.apply(lambda x: extract_prompts(x['text']), axis=1, result_type='expand')

In [19]:
print(data.shape)
data.head(3)

(2004, 13)


Unnamed: 0,document_id,text,corrected,status,subcorpus,native,language_background,level,words,sentences,language,speaker_type,prompt
0,1,Загрязнение тяжелыми металлами Дальнегорского ...,Загрязнение тяжелыми металлами Дальнегорского ...,,RULEC,eng,HL,AM,431,22,russian,L2,
1,3,Директору магазина « Адидас» М. И. Васильченко...,Директору магазина « Адидас» М. И. Васильченко...,,RULEC,eng,HL,AM,245,17,russian,L2,
2,5,"Вывод. Спасибо, ребята, за хорошие ответы. Я м...","Вывод. Спасибо, ребята, за хорошие ответы. Я м...",,RULEC,eng,FL,AM,472,22,russian,L2,


In [20]:
data.to_json('rlc.json', orient='records', force_ascii=False, indent=4)

## ACTR
* Corpus of the American Council of Teachers of Russian
* L2 Russian
* informal
* error-annotated
* ✘ clean meta

In [9]:
files = walk_corpus('actr')

# for file in tqdm(files):
#     if file.endswith('doc'):
#         convert(file, file.replace('.doc', '.docx'))
# files = walk_corpus('actr')

len(files)

1761

In [10]:
content = []
for file in tqdm(files):
    if re.search(r'\.(xlsx|pdf|doc|gtx|xhtml)$', file):
        continue
    if (os.path.basename(file).startswith('_') or 'Lexical Richness' in file
            or 'Ties_' in file or 'EC12_A3_2253_PROBLEM' in file or 'ECXX_NS_0112' in file
            or 'EC12_A3_2240_PROBLEM' in file or 'EC12_A4_2245_PROBLEM' in file
            or 'EC12_B1_9354' in file or 'marked' in file):
        continue
    cont = None
    enc = np.nan
    if '.docx' in file:
        cont = docx2txt.process(file)
    elif '.txt' in file:
        with open(file, 'rb') as f:
            cont = f.read()
            enc = chardet.detect(cont)['encoding'].lower().replace('windows-', 'cp')
        with open(file, 'r', encoding=enc) as f:
            cont = f.read()
    if cont:
        content.append({'filename': unidecode(file).replace('ES', 'EC'),
                        'path': file,
                        'encoding': enc,
                        'content': cont})
len(content)

  0%|          | 0/1761 [00:00<?, ?it/s]

1374

In [11]:
ignore_files = {
    'actr\\EC12_B1_txt_raw\\EC12_B1_3642_PROBLEM_FORMATTING.txt',
    'actr\\EC12_B1_txt_raw_OLDER VERSION\\EC12_B1_3642.txt',
    'actr\\EC12_B1_txt_raw_OLDER VERSION_without_tags\\EC12_B1_3642.txt',
    'actr\\EC12_B3_doc\\EC12_B3_0165_copy.docx',
    'actr\\EC12_A3_txt\\EC12_A3_PROBLEM\\EC12_A3_3085_PROBLEM.txt',
    'actr\\EC12_A3_txt\\EC12_A3_3085_PROBLEM.txt',
    'actr\\EC12_A4_txt_tagged\\EC12_A4_0699_tagged.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_2223_PROBLEM.txt',
    'actr\EC12_B3_txt_raw\EC12_B3_0769_raw.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_0145.txt',
    'actr\EC12_A4_txt_tagged\EC12_A4_0160_raw_NEEDS TAGGING.txt',
    'actr\EC12_A4_txt_tagged\EC12_A4_0161_raw_NEEDS_TAGGING.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_0256.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_2469.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_2882.txt',
    'actr\EC12_A4_txt_tagged\EC12_A4_3129_raw_NEEDS TAGGING.txt',
    'actr\EC12_A4_txt_tagged\EC12_A4_3159_raw_NEEDS TAGGING.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_6378.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_5791.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_5306.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_4953.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_4443.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_4133.txt',
    'actr\EC12_A4_txt_tagged\EC12_A4_3342_raw_NEEDS_TAGGING.txt',
    'actr\Level_4\EC12_B2_7267_tagged_ST.txt',
    'actr\Level_3\EC12_B2_8635_tagged_ST.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_4133.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_4443.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_4953.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_5306.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_5791.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_6378.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_0145.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_1126.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_1126_raw.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_1711.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_1711.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_1827.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_1827_raw.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_1953.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_2136.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_2223.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_2421.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_2469_raw.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_2495_raw.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_2495.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_2882.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_2967_raw.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_2967.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_2972.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_2972_raw.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_3089_raw.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_3089.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_3091_raw.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_3091.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_3544_raw.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_3544.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_3769.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_3769.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_3991.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_4129.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_4129.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_4803.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_4803_raw.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_4930.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_4930_raw.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_5000.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_5000_raw.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_5202.txt',
    'actr\EC12_B1_txt_raw\EC12-B1-5202_raw.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_5202.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_6143.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_6143.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_6409.txt',
    'actr\EC12_B1_txt_raw\EC12_B1_6409_raw.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION_without_tags\EC12_B1_3300.txt',
    'actr\EC12_B1_txt_raw_OLDER VERSION\EC12_B1_3300.txt',
    'actr\EC12_NS_txt\EC12_NS_0010.txt',
    'actr\EC12_B1_txt_STANDARDIZED_with_tags\EC12-B1-1086_tagged.txt',
    'actr\EC12_B2_txt_raw\EC12_B2_2797_raw.txt',
    'actr\EC12_B1_txt_STANDARDIZED_with_tags\EC12_B1_3642_tagged.txt'}

In [12]:
data = pd.DataFrame(content)
data = data[~data['filename'].isin(ignore_files)]
data = data[~data['path'].isin(ignore_files)]
data['content'] = data['content'].map(replace_nbsp)
data['content'] = data['content'].map(fight_yot)

### Split essays

In [13]:
def split_students(text):
    parts = iter([part for part in re.split(r'(Студент \d+ ?\n)', text) if part])
    texts = [el + next(parts, '') for el in parts]
    return texts

In [14]:
# делим тексты в файле Эссе_Студенты_эконом
data.loc[data['path'] == 'actr\Эссе_Студенты_эконом.docx', 'content'] = data.loc[
    data['path'] == 'actr\Эссе_Студенты_эконом.docx']['content'].map(split_students)

data = data.explode('content')
data.reset_index(drop=True, inplace=True)

# убираем лишние пробелы
data['content'] = data['content'].map(implode_spaces)
print(data.shape)
data.head()

(1336, 4)


Unnamed: 0,filename,path,encoding,content
0,actr\Esse_Studenty_ekonom.docx,actr\Эссе_Студенты_эконом.docx,,Студент 1\nвозраст: 17\nпол: ж\nстрана обучени...
1,actr\Esse_Studenty_ekonom.docx,actr\Эссе_Студенты_эконом.docx,,Студент 2\nвозраст: 18\nпол: ж\nстрана обучени...
2,actr\Esse_Studenty_ekonom.docx,actr\Эссе_Студенты_эконом.docx,,Студент 3\nвозраст: 17\nпол: ж\nстрана обучени...
3,actr\Esse_Studenty_ekonom.docx,actr\Эссе_Студенты_эконом.docx,,Студент 4\nвозраст: 18\nпол: ж\nместо обучения...
4,actr\Esse_Studenty_ekonom.docx,actr\Эссе_Студенты_эконом.docx,,Студент 5\nВозраст: 18\nПол: м\nМесто обучения...


### Isolate meta-information

In [15]:
def isolate_meta(text):

    parts = []
    for part in text.split('\n'):
        if re.match(r'<.{0,50}>$', part.strip()) and 'deleted' not in part:
            parts.append(part)
    if parts:
        return '\n'.join(parts)

    text_id = re.match(r'<[A-ZА-Я0-9_ -]+>', text)
    if text_id:
        return text_id.group()

    text_id = re.match(r'HS\d-\d+', text)
    if text_id:
        return text_id.group()

    text_id = re.match(r'NS-\d+', text)
    if text_id:
        return text.split('\n')[0]

    text_id = re.match(r'<Student: \d+.*?>', text, flags=re.DOTALL)
    if text_id:
        return text_id.group()

    text_id = re.match(r'[A-Z]\d( |_)\d{4}', text, flags=re.DOTALL)
    if text_id:
        return text_id.group()

    text_id = re.match(r'[A-Za-z. ]+\n', text)
    if text_id:
        parts = []
        for part in text.split('\n'):
            if re.search(r'[А-Яа-яЁё]', part):
                break
            parts.append(part)
        return '\n'.join(parts)

    text_id = re.match(r'Студент \d+\n', text)
    if text_id:
        parts = []
        for part in text.split('\n'):
            if not (re.match('студент|возраст|пол|страна', part.lower()) or
                    part.lower().startswith('место обучения')):
                break
            parts.append(part)
        return '\n'.join(parts)

    text_id = re.match(r'[А-Яа-яЁёй ,.0-9]{0,50}\n', text)
    if text_id and re.search(r'\d+', text.split('\n')[0]):
        return text_id.group()

    return ''

In [16]:
data['meta'] = data['content'].map(isolate_meta)
data['len_meta'] = data['meta'].map(len)

In [17]:
def remove_meta(text, meta):
    meta_parts = meta.split('\n')
    for part in meta_parts:
        text = text.replace(part.strip(), '')
    return text.strip()

In [18]:
data['text'] = data.apply(lambda x: remove_meta(x['content'], x['meta']), axis=1)
data['text'] = data['text'].map(implode_spaces)
data.drop(columns=['content'], inplace=True)
data.replace({'': np.nan}, inplace=True)

In [19]:
def format_meta(filename, meta):
    # если метаинформации нет, возвращаем как есть
    if pd.isnull(meta):
        return np.nan

    # проверяем, является ли метаинформация названием файла
    path = os.path.splitext(os.path.basename(filename))[0].lower().replace('-', '_')
    path = unidecode(path).replace('es12', 'ec12')
    form_meta = meta.split('\n')[0].lower().replace(' ', '_').replace(
        '-', '_').strip('<>').replace('ec_12', 'ec12')
    form_meta = unidecode(form_meta)
    if form_meta in path:
        if '\n' in meta:
            # удаляем первую строку метаинформации
            return '\n'.join(meta.split('\n')[1:]).strip()
        return np.nan

    # убираем название файла из метаинформации
    try:
        text_id = re.search(r'ns_\d+', path).group()
        return meta.replace(text_id.replace('ns_', 'NS-'), '').strip()
    except AttributeError:
        return meta.strip()

In [20]:
data['meta_new'] = data.apply(lambda x: format_meta(x['filename'], x['meta']), axis=1)

In [21]:
def split_meta(meta):

    # пустая метаинформация
    if pd.isnull(meta):
        return {}

    # метаинформация на русском
    if re.match(r'Студент \d+', meta):
        meta = meta.replace('Возраст ', 'Возраст: ').replace('\nпол ', '\nпол: ').replace(
            '\nПол ', '\nПол: ').replace('Страна обучения ', 'Страна обучения –').replace(
            '––', '–').replace('–-', '–')
        mets = {'speaker_id': re.search(r'\d+', meta).group()}
        for part in meta.split('\n')[1:]:
            feat, val = re.split(r'[:–-]', part)
            feat = feat.lower().strip()
            if 'страна' in feat or 'место' in feat:
                feat = 'country'
            mets[feat] = val.strip()
        return mets

    # метаинформация на английском
    if re.match(r'<Student', meta):
        mets = {'speaker_id': re.search(r'\d+', meta).group()}
        for part in meta.split('\n')[1:]:
            feat, val = part.split(':')
            val = val.strip().strip('>')
            if not val:
                val = np.nan
            mets[feat.lower().strip()] = val
        return mets

    # метаинформация без подписей
    if len(meta.split('\n')) == 4 and '<' not in meta:
        keys = ['name', 'institution', 'background', 'date']
        values = meta.split('\n')
        return dict(zip(keys, values))

    if len(meta.split('\n')) == 5 and '<' not in meta:
        keys = ['name', 'speaker_id', 'institution', 'background', 'date']
        values = meta.split('\n')
        return dict(zip(keys, values))

    # информация о первом языке и другие ремарки
    if re.match(r'^<.+>$', meta, flags=re.DOTALL):
        mets = [m.strip('<>').strip() for m in meta.lower().split('\n')]
        if re.search(r'(first|native) (language|langauge)(.+)', mets[0]):
            lang = re.search(r'(first|native) (language|langauge)(.+)', mets[0]).group(3).strip('?: ')
            return {'native_lang': lang}
        return {'remarks': '; '.join(mets)}

    # то, что осталось
    meta = re.sub(r'(\w+)[^,.0-9](\d{1,2})', '\g<1>, \g<2>', meta)
    mets = {}
    for part in meta.split(','):
        if re.match(r'^[а-яА-ЯёЁ .]+$', part.strip()):
            mets['name'] = part.strip()
        elif 'класс' in part:
            mets['school_grade'] = int(re.search(r'\d+', part).group())
        elif 'г.р.' in part:
            mets['dob'] = part.replace('г.р.', '').strip()
        else:
            mets['age'] = int(re.search(r'\d+', part).group())

    if not mets:
        print(meta)
    return mets

In [22]:
recs = data.to_dict(orient='records')
for rec in recs:
    if pd.notnull(rec['meta_new']):
        rec.update(split_meta(rec['meta_new']))

In [23]:
data = pd.DataFrame(recs)
data.head(3)

Unnamed: 0,filename,path,encoding,meta,len_meta,text,meta_new,speaker_id,возраст,пол,...,prompt,institution,remark,remarks,native_lang,name,school_grade,dob,background,date
0,actr\Esse_Studenty_ekonom.docx,actr\Эссе_Студенты_эконом.docx,,Студент 1\nвозраст: 17\nпол: ж\nстрана обучени...,52,Место книги в жизни человека\n(чтение и книги)...,Студент 1\nвозраст: 17\nпол: ж\nстрана обучени...,1,17,ж,...,,,,,,,,,,
1,actr\Esse_Studenty_ekonom.docx,actr\Эссе_Студенты_эконом.docx,,Студент 2\nвозраст: 18\nпол: ж\nстрана обучени...,52,"Эссе ""Воспоминания о детстве/школе""\n""Все мы р...",Студент 2\nвозраст: 18\nпол: ж\nстрана обучени...,2,18,ж,...,,,,,,,,,,
2,actr\Esse_Studenty_ekonom.docx,actr\Эссе_Студенты_эконом.docx,,Студент 3\nвозраст: 17\nпол: ж\nстрана обучени...,52,Эссе по теме: «Мой друг»\nПроблема дружбы нере...,Студент 3\nвозраст: 17\nпол: ж\nстрана обучени...,3,17,ж,...,,,,,,,,,,


### Format meta-information

In [24]:
def combine_cols(col1, col2):
    if pd.notnull(col1):
        return col1
    if pd.notnull(col2):
        return col2
    return np.nan

In [25]:
data['gender'] = data.apply(lambda x: combine_cols(x['пол'], x['gender']), axis=1)
data['remarks'] = data.apply(lambda x: combine_cols(x['remark'], x['remarks']), axis=1)
data['age'] = data.apply(lambda x: combine_cols(x['возраст'], x['age']), axis=1)
data['date'] = data.apply(lambda x: combine_cols(x['date'], x['year']), axis=1)
data['background'] = data.apply(lambda x: combine_cols(x['language background'], x['background']), axis=1)

data['age'] = pd.to_numeric(data['age'])
data['dob'] = pd.to_datetime(data['dob'], dayfirst=True)
data['gender'] = data['gender'].map(lower_text)
data.replace({'gender': {'ж': 'f', 'м': 'm'}}, inplace=True)

data.drop(columns=['пол', 'remark', 'возраст', 'year', 'language background', 'meta', 'meta_new'], inplace=True)

In [26]:
def extract_level(filename):

    # уровень указан в названии файла
    if re.search(r'[^A-ZА-Я][ABC]\d', filename):
        levels = [lev.strip('_-').replace('\\', '') for lev in re.findall(r'[^A-ZА-Я][ABC]\d', filename)]
        if len(set(levels)) > 1 and filename.startswith('actr\EC12_B3_doc'):
            return levels[-1]
        return list(set(levels))[0]

    # native speakers
    if 'NS' in filename or filename == 'actr\Esse_Studenty_ekonom.docx':
        return 'NS'
    # heritage speakers
    if re.search(r'Heritage \d+', filename):
        return re.search(r'Heritage \d+', filename).group()

    print(filename)
    return np.nan

In [27]:
data['level_from_path'] = data['filename'].map(extract_level)
data['level'] = data.apply(lambda x: combine_cols(x['level'], x['level_from_path']), axis=1)
data.drop(columns=['level_from_path'], inplace=True)

In [28]:
def extract_speaker(filename):
    basename = os.path.splitext(os.path.basename(filename))[0]
    if re.search(r"^[A-Za-z'._ ]+$", basename):
        return np.nan
    if re.search(r'\d{4}', basename):
        return re.search(r'\d{4}', basename).group()
    return basename

In [29]:
data['speaker_from_file'] = data['filename'].map(extract_speaker)
data['speaker_id'] = data.apply(lambda x: combine_cols(x['speaker_id'], x['speaker_from_file']), axis=1)
data.drop(columns=['speaker_from_file'], inplace=True)
print(data.shape)
data.head(3)

(1336, 19)


Unnamed: 0,filename,path,encoding,len_meta,text,speaker_id,country,gender,level,age,prompt,institution,remarks,native_lang,name,school_grade,dob,background,date
0,actr\Esse_Studenty_ekonom.docx,actr\Эссе_Студенты_эконом.docx,,52,Место книги в жизни человека\n(чтение и книги)...,1,Россия,f,NS,17.0,,,,,,,NaT,,
1,actr\Esse_Studenty_ekonom.docx,actr\Эссе_Студенты_эконом.docx,,52,"Эссе ""Воспоминания о детстве/школе""\n""Все мы р...",2,Россия,f,NS,18.0,,,,,,,NaT,,
2,actr\Esse_Studenty_ekonom.docx,actr\Эссе_Студенты_эконом.docx,,52,Эссе по теме: «Мой друг»\nПроблема дружбы нере...,3,Россия,f,NS,17.0,,,,,,,NaT,,


### Extract annotation tags

In [30]:
def get_prefix(filename):

    # получаем чистое имя файла
    raw = os.path.splitext(os.path.basename(filename))[0].lower()
    basename = re.sub(r'[^0-9a-z]', '_', raw)

    prefix = np.nan
    # если префикс есть в имени, получаем его
    if re.search(r'[a-z]$', basename):
        pref = re.search(r'[a-z_]+$', basename).group()
        if pref.startswith('_'):
            prefix = re.sub(r'_+', '_', pref)
    # иначе получаем префикс из имени папки
    elif re.search(r'_(raw|ST)', filename):
        prefix = re.search(r'_(raw|ST)', filename).group().lower()

    return prefix

In [31]:
def get_raw_filename(filename, prefix):
    raw_name = os.path.splitext(os.path.basename(filename))[0].lower()
    raw_name = re.sub(r'[^0-9a-z]', '_', raw_name)
    raw_name = re.sub(r'_+', '_', raw_name)
    if pd.notnull(prefix):
        return raw_name.replace(prefix, '').strip('_')
    return raw_name

In [32]:
data['prefix'] = data['filename'].map(get_prefix)
data['raw_name'] = data.apply(lambda x: get_raw_filename(x['filename'], x['prefix']), axis=1)
data.replace({'prefix': {'_raw_needs_tagging': '_raw',
                         '_copy': '_raw'}}, inplace=True)

### Deduplication

In [33]:
# убираем полные дубли
data.sort_values(['text', 'filename', 'len_meta'], ascending=[True, False, False], inplace=True)
data.drop_duplicates(subset=['len_meta', 'text', 'speaker_id', 'country',
                             'gender', 'level', 'age', 'prompt', 'institution',
                             'remarks', 'native_lang', 'name', 'school_grade', 'dob',
                             'background', 'date', 'prefix', 'raw_name'],
                     keep='first', inplace=True)
print(data.shape)

(1193, 21)


In [34]:
# убираем одинаковые тексты с одинаковым именем файла и префиксом
# (оставляем те, где больше мета-информации)
data.sort_values(['text', 'raw_name', 'len_meta'], ascending=[True, True, False], inplace=True)
data.drop_duplicates(subset=['text', 'raw_name', 'prefix'], keep='first', inplace=True)
print(data.shape)

(1178, 21)


### Split by speaker type

In [35]:
# эритажные говорящие
heritage = data[data['level'].isin({'Heritage 1', 'Heritage 2', 'Heritage 3'})]
heritage.shape

(183, 21)

In [36]:
# носители
native = data[data['level'].isin({'NS'})]
native.shape

(127, 21)

In [37]:
# изучающие как L2
foreign = data[~data['level'].isin({'Heritage 1', 'Heritage 2', 'Heritage 3', 'NS'})]
foreign.shape

(868, 21)

### Analyze different version of text

In [38]:
def get_corrected_tags(data, tag, tag_corr):
    # ищем, для каких текстов есть и один, и другой тег
    is_corr = data.groupby(['speaker_id', 'raw_name']).apply(
        lambda x: x['prefix'].eq(tag).any() & x['prefix'].eq(tag_corr).any()).reset_index()
    data = data.merge(is_corr, how='left').rename(columns={0: 'corr_tags'})
    data = data[~((data['prefix'] == tag) & (data['corr_tags'] == True))]
    data.reset_index(drop=True, inplace=True)
    data.drop(columns=['corr_tags'], inplace=True)
    print(data.shape)
    return data

In [39]:
foreign = get_corrected_tags(foreign, '_tagged', '_tagged_corrected')
foreign = get_corrected_tags(foreign, '_tagged_st', '_tagged_st_corrected')

(800, 21)
(788, 21)


In [40]:
foreign.replace({'prefix': {'_tagged_corrected': '_tagged',
                            '_tagged_st_corrected': '_tagged_st'}}, inplace=True)

In [41]:
foreign['prefix'] = foreign['prefix'].fillna('_other')

In [42]:
def find_untagged(tag_list):
    if '_other' in tag_list or '_done' in tag_list:
        return True
    return False

In [43]:
temp = foreign.groupby('text').agg(list).reset_index()[['text', 'prefix']]
temp['num_pref'] = temp['prefix'].map(len)
temp['has_other'] = temp['prefix'].map(find_untagged)

In [44]:
texts_with_other = set(temp[(temp['num_pref'] > 1) & (temp['has_other'] == True)]['text'].tolist())
foreign = foreign[~((foreign['text'].isin(texts_with_other)) & ((foreign['prefix'] == '_other') | (foreign['prefix'] == '_done')))]
print(foreign.shape)
foreign.head(3)

(743, 21)


Unnamed: 0,filename,path,encoding,len_meta,text,speaker_id,country,gender,level,age,...,institution,remarks,native_lang,name,school_grade,dob,background,date,prefix,raw_name
0,actr\EC12_B2_txt_raw\EC12_B2_5926_raw.txt,actr\EC12_B2_txt_raw\EC12_B2_5926_raw.txt,utf-16,7,"""""""Что такое Друг""""""\n Я хочу рассказать о том...",5926,,,B2,,...,,,,,,NaT,,,_raw,ec12_b2_5926
1,actr\EC12_B2_txt_raw\EC12_B2_0987_raw.txt,actr\EC12_B2_txt_raw\EC12_B2_0987_raw.txt,utf-16,7,"""""""Что такое друг""""""\n Друг это человек\n кото...",987,,,B2,,...,,,,,,NaT,,,_raw,ec12_b2_0987
2,actr\EC12_B2_txt_raw\EC12_B2_0616_raw.txt,actr\EC12_B2_txt_raw\EC12_B2_0616_raw.txt,utf-16,7,"""«Не имей сто рублей, а имей сто друзей.»""\n ""...",616,,,B2,,...,,,,,,NaT,,,_raw,ec12_b2_0616


In [45]:
foreign.replace({'prefix': {'_other': np.nan}}, inplace=True)

### Deduplication p.2

In [46]:
# concatenate all texts
data = pd.concat([heritage, native, foreign])
data.reset_index(drop=True, inplace=True)
print(data.shape)
data.head(3)

(1053, 21)


Unnamed: 0,filename,path,encoding,len_meta,text,speaker_id,country,gender,level,age,...,institution,remarks,native_lang,name,school_grade,dob,background,date,prefix,raw_name
0,actr\Zhenyas data\Essay Contest Heritage 1\Num...,actr\Zhenyas data\Essay Contest Heritage 1\Num...,,6,"""Место Любимое Моё""\nПрокатившись по многим г...",HS1-50,,,Heritage 1,,...,,,,,,NaT,,,,hs1_50
1,actr\Zhenyas data\Essay Contest Heritage 1\Num...,actr\Zhenyas data\Essay Contest Heritage 1\Num...,,6,"""Человек которогa я люблю""\nИногда я думаю, чт...",HS1-18,,,Heritage 1,,...,,,,,,NaT,,,,hs1_18
2,actr\Zhenyas data\Essay Contest Heritage 1\Sir...,actr\Zhenyas data\Essay Contest Heritage 1\Sir...,,43,"""Человек которогa я люблю""\nИногда я думаю, чт...",,,,Heritage 1,,...,UCLA,,,Sirarpi Mnatskanyan,,NaT,Heritage-1,02/2009,,sirarpi_mnatskanyan


In [47]:
# выделяем одинаковые тексты с разными метаданными в отдельный датафрейм
doubles = data[data.duplicated(subset=['text'], keep=False)].groupby('text').agg(list).reset_index()
data.drop_duplicates(subset=['text'], keep=False, inplace=True)
print(doubles.shape)

(71, 21)


In [48]:
def get_info_from_agg(agg_list):
    agg_list = [itm for itm in list(dict.fromkeys(agg_list)) if pd.notnull(itm)]
    if not agg_list:
        return [np.nan, np.nan, np.nan]
    if len(agg_list) == 1:
        return [agg_list[0], agg_list[0], agg_list[0]]
    if len(agg_list) == 3:
        return agg_list
    agg_list.append(np.nan)
    return agg_list

In [49]:
for col in ['filename', 'path', 'encoding', 'len_meta', 'speaker_id',
            'country', 'gender', 'level', 'age', 'prompt', 'institution',
            'remarks', 'native_lang', 'name', 'school_grade', 'dob',
            'background', 'date', 'prefix', 'raw_name']:
    doubles[col] = doubles[col].map(get_info_from_agg)

doubles = doubles.explode([
    'filename', 'path', 'encoding', 'len_meta', 'speaker_id',
    'country', 'gender', 'level', 'age', 'prompt', 'institution',
    'remarks', 'native_lang', 'name', 'school_grade', 'dob',
    'background', 'date', 'prefix', 'raw_name'])

In [50]:
doubles.sort_values(['text', 'len_meta'], ascending=[True, False], inplace=True)
doubles.drop_duplicates(subset=['text'], keep='first', inplace=True)
doubles.dropna(how='all', axis=0, inplace=True)
doubles.dropna(how='all', axis=1, inplace=True)
print(doubles.shape)
doubles.head(3)

(71, 15)


Unnamed: 0,text,filename,path,encoding,len_meta,speaker_id,country,gender,level,age,institution,name,background,date,raw_name
0,"""Место,которое я люблю""\nВ мире существует мно...",actr\NS_What I love_additional data_unprocesse...,actr\NS_What I love_additional data_unprocesse...,,16,NS-5,,,NS,19.0,,Евгений,,,evgeny
1,"""Человек которогa я люблю""\nИногда я думаю, чт...",actr\Zhenyas data\Essay Contest Heritage 1\Sir...,actr\Zhenyas data\Essay Contest Heritage 1\Sir...,,43,HS1-18,,,Heritage 1,,UCLA,Sirarpi Mnatskanyan,Heritage-1,02/2009,sirarpi_mnatskanyan
2,"Больше всего, я люблю интересные люды, страны,...",actr\Zhenyas data\Essay Contest Heritage 1\Ann...,actr\Zhenyas data\Essay Contest Heritage 1\Ann...,,48,HS1-4,,,Heritage 1,,Columbia University,Anna Kats,Heritage-1,02/2009,anna_kats


In [51]:
data = pd.concat([data, doubles], ignore_index=True, axis=0)
data.sort_index(inplace=True)
data.reset_index(drop=True, inplace=True)
data.drop(columns=['len_meta'], inplace=True)
print(data.shape)
data.head(3)

(979, 20)


Unnamed: 0,filename,path,encoding,text,speaker_id,country,gender,level,age,prompt,institution,remarks,native_lang,name,school_grade,dob,background,date,prefix,raw_name
0,actr\Zhenyas data\Essay Contest Heritage 1\Num...,actr\Zhenyas data\Essay Contest Heritage 1\Num...,,"""Место Любимое Моё""\nПрокатившись по многим г...",HS1-50,,,Heritage 1,,,,,,,,NaT,,,,hs1_50
1,actr\Zhenyas data\Essay Contest Heritage 1\Num...,actr\Zhenyas data\Essay Contest Heritage 1\Num...,,В моей жизни я встречала много разных людей. У...,HS1-30,,,Heritage 1,,,,,,,,NaT,,,,hs1_30
2,actr\Zhenyas data\Essay Contest Heritage 1\Num...,actr\Zhenyas data\Essay Contest Heritage 1\Num...,,"В прошлем году, я провела четыре месяцев в Лон...",HS1-24,,,Heritage 1,,,,,,,,NaT,,,,hs1_24


### Aggregate metadata for texts

In [52]:
heritage = data[data['level'].isin({'Heritage 1', 'Heritage 2', 'Heritage 3'})]
native = data[data['level'].isin({'NS'})]
foreign = data[~data['level'].isin({'Heritage 1', 'Heritage 2', 'Heritage 3', 'NS'})]

In [53]:
foreign = foreign.groupby(['speaker_id', 'raw_name']).agg(list).reset_index()
print(foreign.shape)
foreign.head(3)

(436, 20)


Unnamed: 0,speaker_id,raw_name,filename,path,encoding,text,country,gender,level,age,prompt,institution,remarks,native_lang,name,school_grade,dob,background,date,prefix
0,7,ec12_a1_0007,[actr\EC12_A1_txt\EC12_A1_0007.txt],[actr\EC12_A1_txt\EC12_A1_0007.txt],[utf-16],"[Я опаздеваю. Что такое друг? Падруга сестра, ...",[nan],[nan],[A1],[nan],[essay contest],[College of New Jersey],"[неправильное провописание буквы ""ы""]",[nan],[nan],[nan],[NaT],[L2],[2012],[nan]
1,11,ec12_a1_0011,[actr\EC12_A1_txt\EC12_A1_0011.txt],[actr\EC12_A1_txt\EC12_A1_0011.txt],[utf-16],"[Что такое друг?\nЯ не знал, что замечательный...",[nan],[nan],[A1],[nan],[essay contest],[Columbia University],[nan],[nan],[nan],[nan],[NaT],[L2],[2012],[nan]
2,13,ec12_a4_0013,"[actr\EC12_A4_txt_raw\EC12_A4_0013_raw.txt, ac...","[actr\EC12_A4_txt_raw\EC12_A4_0013_raw.txt, ac...","[utf-8, nan]","[Объяснить, такой “друг"", это так трудно, как ...","[nan, nan]","[nan, nan]","[A4, A4]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[NaT, NaT]","[nan, nan]","[nan, nan]","[_raw, _done]"


In [54]:
def get_info_from_agg2(agg_list):
    agg_list = [itm for itm in list(dict.fromkeys(agg_list)) if pd.notnull(itm)]
    if not agg_list:
        return np.nan
    if len(agg_list) == 1:
        return agg_list[0]
    new_agg = []
    for itm in agg_list:
        new_agg.extend(itm.split('; '))
    agg_list = [itm for itm in list(dict.fromkeys(new_agg)) if pd.notnull(itm)]
    return '; '.join(agg_list)

In [55]:
for col in ['country', 'gender', 'level', 'age', 'prompt',
            'institution', 'remarks', 'native_lang', 'name',
            'school_grade', 'dob', 'background', 'date']:
    foreign[col] = foreign[col].map(get_info_from_agg2)

### Split tagged texts into columns

In [56]:
recs = foreign.to_dict(orient='records')
for rec in recs:
    for i, pref in enumerate(rec['prefix']):
        if pd.isnull(pref):
            pref = '_no_tag'
        rec[pref] = rec['text'][i]
foreign = pd.DataFrame(recs)

In [57]:
foreign.drop(columns=['text', 'prefix', 'filename', 'encoding'], inplace=True)
foreign['path'] = foreign['path'].map(' | '.join)
print(foreign.shape)
foreign.head(3)

(436, 23)


Unnamed: 0,speaker_id,raw_name,path,country,gender,level,age,prompt,institution,remarks,...,dob,background,date,_no_tag,_raw,_done,_tagged,_st,_tagged_st,_problem
0,7,ec12_a1_0007,actr\EC12_A1_txt\EC12_A1_0007.txt,,,A1,,essay contest,College of New Jersey,"неправильное провописание буквы ""ы""",...,,L2,2012.0,"Я опаздеваю. Что такое друг? Падруга сестра, н...",,,,,,
1,11,ec12_a1_0011,actr\EC12_A1_txt\EC12_A1_0011.txt,,,A1,,essay contest,Columbia University,,...,,L2,2012.0,"Что такое друг?\nЯ не знал, что замечательный ...",,,,,,
2,13,ec12_a4_0013,actr\EC12_A4_txt_raw\EC12_A4_0013_raw.txt | ac...,,,A4,,,,,...,,,,,"Объяснить, такой “друг"", это так трудно, как о...","Объяснить, что такой “друг”, это так трудно, к...",,,,


In [58]:
foreign.to_json('actr_FL_versions.json', orient='records', force_ascii=False, indent=4)

### Extract single version for FL texts

In [59]:
recs = foreign.to_dict(orient='records')
for rec in recs:
    if pd.notnull(rec['_tagged']):
        rec['text'] = rec['_tagged']
    elif pd.notnull(rec['_done']):
        rec['text'] = rec['_done']
    elif pd.notnull(rec['_tagged_st']):
        rec['text'] = rec['_tagged_st']
    elif pd.notnull(rec['_st']):
        rec['text'] = rec['_st']
    elif pd.notnull(rec['_raw']):
        rec['text'] = rec['_raw']
    elif pd.notnull(rec['_no_tag']):
        rec['text'] = rec['_no_tag']
    elif pd.notnull(rec['_problem']):
        rec['text'] = rec['_problem']
    else:
        rec['text'] = np.nan

In [60]:
foreign = pd.DataFrame(recs)
foreign.drop(columns=['_no_tag', '_raw', '_done', '_tagged', '_st', '_tagged_st', '_problem'], inplace=True)
print(foreign.shape)
foreign.head(3)

(436, 17)


Unnamed: 0,speaker_id,raw_name,path,country,gender,level,age,prompt,institution,remarks,native_lang,name,school_grade,dob,background,date,text
0,7,ec12_a1_0007,actr\EC12_A1_txt\EC12_A1_0007.txt,,,A1,,essay contest,College of New Jersey,"неправильное провописание буквы ""ы""",,,,,L2,2012.0,"Я опаздеваю. Что такое друг? Падруга сестра, н..."
1,11,ec12_a1_0011,actr\EC12_A1_txt\EC12_A1_0011.txt,,,A1,,essay contest,Columbia University,,,,,,L2,2012.0,"Что такое друг?\nЯ не знал, что замечательный ..."
2,13,ec12_a4_0013,actr\EC12_A4_txt_raw\EC12_A4_0013_raw.txt | ac...,,,A4,,,,,,,,,,,"Объяснить, что такой “друг”, это так трудно, к..."


### Concatenate all tables and clean texts from tags

In [61]:
data = pd.concat([heritage, native, foreign])
data.reset_index(drop=True, inplace=True)
data = data[['path', 'raw_name', 'text', 'speaker_id', 'country',
             'gender', 'level', 'age', 'prompt', 'institution', 'remarks',
             'native_lang', 'name', 'school_grade', 'dob', 'background', 'date']]
data.replace({'date': {'2/2009': '02/2009'}}, inplace=True)
data['language'] = 'russian'
data['speaker_type'] = 'L1'
data.loc[data['background'] == 'L2', 'speaker_type'] = 'L2'
data.loc[data['background'].isin({'Heritage-1', 'Heritage-2', 'Heritage-3'}), 'speaker_type'] = 'HS'
data.drop(columns=['background'], inplace=True)
print(data.shape)
data.head(3)

(672, 18)


  data = pd.concat([heritage, native, foreign])


Unnamed: 0,path,raw_name,text,speaker_id,country,gender,level,age,prompt,institution,remarks,native_lang,name,school_grade,dob,date,language,speaker_type
0,actr\Zhenyas data\Essay Contest Heritage 1\Num...,hs1_50,"""Место Любимое Моё""\nПрокатившись по многим г...",HS1-50,,,Heritage 1,,,,,,,,NaT,,russian,L1
1,actr\Zhenyas data\Essay Contest Heritage 1\Num...,hs1_30,В моей жизни я встречала много разных людей. У...,HS1-30,,,Heritage 1,,,,,,,,NaT,,russian,L1
2,actr\Zhenyas data\Essay Contest Heritage 1\Num...,hs1_24,"В прошлем году, я провела четыре месяцев в Лон...",HS1-24,,,Heritage 1,,,,,,,,NaT,,russian,L1


In [62]:
def remove_tags(value):
    value = re.sub(r'<corrected deleted=[^<]+?>([^<>]*?)</corrected>?', r'\1', value)
    value = re.sub(r'<strange_symb pos=[^<]+?>([^<>]+?)</strange_symb>', r'\1', value)
    value = re.sub(r'<error=[^<]+?>([^<>]+?)</error>', r'\1', value)
    value = re.sub(r'<inserted>([^<>]+?)</inserted>', r'\1', value)
    value = re.sub(r'<allcapital>([^<>]+?)</allcapital>', r'\1'.lower(), value)
    value = re.sub(r'<possible=[^<]+?>([^<>]+?)</possible>', r'\1', value)
    value = re.sub(r'<latincharacter=[^<]+?>([^<>]+?)</latincharacter>', r'\1', value)
    value = re.sub(r'<symbol=[^<]+?>[^<>]+?</symbol>', '', value)
    value = re.sub(r'<нарисована рожица>', '', value)
    value = re.sub(r'<underlined>([^<>]+?)</underlined>', r'\1', value)
    value = re.sub(r'<deleted[^<]*?>[^<>]*?</ ?deleted>', '', value)
    value = re.sub(r'<corrected>[^<>]*?</corrected>', '', value)
    value = re.sub(r'< ?deleted>', '', value)
    value = re.sub(r'<unclear=[^<]+?>[^<>]+?</unclear', '<unclear>', value)
    value = re.sub(r'<(stress mark|diacritic)[^<]+?>', '', value)
    value = value.replace(' unclear ', ' <unclear> ').replace('[unclear]', '<unclear>').replace(
        '<unclear>>', '<unclear>').replace('___ <word missing>', '').replace(
        '<3ем>', 'третьем').replace('<distorted spelling>', '')
    return value

In [63]:
data['text'] = data['text'].map(remove_tags)
data['text'] = data['text'].map(implode_spaces)
data['text'] = data['text'].map(lambda x: translit(x, 'ru'))

In [64]:
print(data.shape)
data.head(3)

(672, 18)


Unnamed: 0,path,raw_name,text,speaker_id,country,gender,level,age,prompt,institution,remarks,native_lang,name,school_grade,dob,date,language,speaker_type
0,actr\Zhenyas data\Essay Contest Heritage 1\Num...,hs1_50,"""Место Любимое Моё""\nПрокатившись по многим г...",HS1-50,,,Heritage 1,,,,,,,,NaT,,russian,L1
1,actr\Zhenyas data\Essay Contest Heritage 1\Num...,hs1_30,В моей жизни я встречала много разных людей. У...,HS1-30,,,Heritage 1,,,,,,,,NaT,,russian,L1
2,actr\Zhenyas data\Essay Contest Heritage 1\Num...,hs1_24,"В прошлем году, я провела четыре месяцев в Лон...",HS1-24,,,Heritage 1,,,,,,,,NaT,,russian,L1


In [65]:
data.to_json('actr.json', orient='records', force_ascii=False, indent=4)