# Spell-checking with Yandex Speller & Annotation with UDPipe

In [1]:
# !pip install pyaspeller

In [2]:
import os
import re
import json

import pandas as pd
import numpy as np
import scipy.stats as stats
from tqdm.auto import tqdm

from conllu import parse
from pyaspeller import YandexSpeller
# Shared Yandex.Speller client; spellcheck_corpus() reads this module-level name.
speller = YandexSpeller()

# Model identifiers passed as the `model` argument of create_raw_files(), which
# puts them into the UDPipe request as model, tokenizer, tagger and parser.
ENG_MODEL = 'english-ewt-ud-2.12-230717'
RUS_MODEL = 'russian-syntagrus-ud-2.12-230717'
# Seed used when sampling texts from the Pikabu and Reddit dumps.
RANDOM_STATE = 42

# Switch off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [3]:
# Working directories of the pipeline: spell-checked corpora, raw files sent to
# UDPipe, and the annotated corpora.
for directory in ('_spell_checked', 'annotate', '_annotated'):
    # exist_ok replaces the check-then-create pattern and keeps the cell re-runnable.
    os.makedirs(directory, exist_ok=True)

In [4]:
def load_clean_corpus_from_json(path):
    """Read a records-oriented JSON corpus and discard rows without a text.

    Prints the shape of the cleaned frame and returns the frame.
    """
    corpus = pd.read_json(path, orient='records').dropna(subset='text')
    print(corpus.shape)
    return corpus

In [5]:
def spellcheck_corpus(df, out_file):
    """Spell-check every text in ``df['text']`` and save the result as JSON.

    The corrected texts are stored in a new ``spell_checked`` column (``df`` is
    modified in place) and the frame is written to ``out_file`` as
    records-oriented JSON.

    Spell-checking is best-effort: when the speller raises for a text, the
    original text is kept and a warning is emitted, so a single failing
    document does not abort the whole run.

    Args:
        df: DataFrame with a ``text`` column.
        out_file: Path of the JSON file to write.

    Returns:
        The same DataFrame, now with the ``spell_checked`` column.
    """
    import warnings

    spelled = []
    for position, text in enumerate(tqdm(df['text'].tolist())):
        try:
            spelled.append(speller.spelled(text))
        # The previous handler named `EncodingError`, which is neither defined
        # nor imported in the visible notebook, so any speller failure ended in
        # a NameError instead of falling back to the unchanged text.
        except Exception as error:
            warnings.warn(f'Spell-check failed for text #{position} '
                          f'({type(error).__name__}: {error}); keeping the original text.')
            spelled.append(text)
    df['spell_checked'] = spelled
    df.to_json(out_file, orient='records', force_ascii=False, indent=4)
    return df

In [6]:
def create_raw_files(texts, corpus_name, model):
    """Write one ``.txt`` file per text plus a batch script of UDPipe requests.

    Each text is saved as ``annotate/<corpus_name>/<corpus_name><index>.txt``
    with a zero-padded index. For every text a ``curl`` command is generated
    that posts the file to the UDPipe web service and redirects the response to
    a ``.json`` file with the same stem; all commands are written to
    ``annotate/<corpus_name>.bat``. File paths inside the commands are relative
    to ``annotate/``.

    Args:
        texts: Sequence of strings to annotate.
        corpus_name: Name used for the sub-directory, the file stems and the
            batch script.
        model: Model name sent as model, tokenizer, tagger and parser.

    Returns:
        The expected output paths (``annotate/<corpus_name>/...json``), in the
        same order as ``texts``.
    """
    # makedirs also creates the parent `annotate/` directory when it is missing,
    # where a plain mkdir on the nested path would fail.
    os.makedirs(f'annotate/{corpus_name}', exist_ok=True)

    # Width of the zero-padded index; it depends only on the number of texts,
    # so it is computed once rather than on every iteration.
    pad = len(str(len(texts)))

    ud_commands = []
    out_files = []
    for i, text in enumerate(texts):
        file_name = f'{corpus_name}/{corpus_name}{str(i).zfill(pad)}'
        with open(f'annotate/{file_name}.txt', 'w', encoding='utf-8') as f:
            f.write(text)
        command = (
            f'curl -F data=@{file_name}.txt -F model="{model}" '
            f'-F tokenizer="{model}" -F tagger="{model}" -F parser="{model}" '
            f'-F output=conllu http://lindat.mff.cuni.cz/services/udpipe/api/process '
            f'> {file_name}.json'
        )
        ud_commands.append(command)
        out_files.append(f'{file_name}.json')

    bat_file = f'annotate/{corpus_name}.bat'
    with open(bat_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(ud_commands))

    return [os.path.join('annotate/', file) for file in out_files]

In [7]:
def check_annotated_files(corpus_name, out_files):
    """Collect the ``.json`` outputs of a corpus and compare them with the expected ones.

    Prints ``All OK!`` when the files found in ``annotate/<corpus_name>`` are
    exactly the expected ``out_files``; otherwise prints the set of expected
    files that are missing and, if there are any, the files that were not
    expected.

    Args:
        corpus_name: Name of the sub-directory of ``annotate/`` to inspect.
        out_files: Expected output paths, as returned by ``create_raw_files``.

    Returns:
        Sorted list of the ``.json`` paths found in the corpus directory.
    """
    # os.listdir yields entries in arbitrary order. The returned list is later
    # matched to DataFrame rows by position, so it is sorted here; the
    # zero-padded indices in the file names make that the order of the texts.
    conllus = sorted(os.path.join(f'annotate/{corpus_name}/', file)
                     for file in os.listdir(f'annotate/{corpus_name}')
                     if file.endswith('.json'))

    # Compare the actual paths, not just their number: equal counts do not
    # guarantee that the right files are present.
    missing = set(out_files).difference(set(conllus))
    unexpected = set(conllus).difference(set(out_files))
    if not missing and not unexpected:
        print('All OK!')
    else:
        print(missing)
        if unexpected:
            print(f'Unexpected files: {sorted(unexpected)}')

    return conllus

In [8]:
def get_annotations(df, conllus, out_file):
    """Attach the saved UDPipe responses to the corpus and store it as JSON.

    For every path in ``conllus`` the ``result`` field of the JSON file is
    read; the values become the ``annotated`` column of ``df`` (assigned in the
    order of ``conllus``, ``df`` is modified in place). The frame is then
    written to ``out_file`` and returned.
    """
    def read_result(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)['result']

    df['annotated'] = [read_result(path) for path in conllus]
    df.to_json(out_file, orient='records', force_ascii=False, indent=4)
    return df

In [9]:
def combine_columns(*cols):
    """Return the first argument that is not null, or NaN when every one is null."""
    return next((value for value in cols if pd.notnull(value)), np.nan)

In [10]:
def get_len_words(conl):
    """Count the tokens of a parsed text, skipping those tagged PUNCT, SYM or '_'.

    ``conl`` is an iterable of sentences; every token is looked up by its
    ``'upos'`` key.

    NOTE(review): a later cell of this notebook defines another function with
    the same name that counts all tokens; on a top-to-bottom run that later
    definition replaces this one.
    """
    skipped_tags = {'PUNCT', 'SYM', '_'}
    return sum(1 for sent in conl for token in sent if token['upos'] not in skipped_tags)

## LOCNESS
* All texts are used
* No spell-checking
* Annotation DONE

In [9]:
# locness = load_clean_corpus_from_json('_clean/locness.json')

In [10]:
# out_files = create_raw_files(locness['text'].tolist(), corpus_name='locness', model=ENG_MODEL)

In [11]:
# RUN .bat with Command Prompt, then the code below
# conllus = check_annotated_files('locness', out_files)

In [12]:
# locness = get_annotations(locness, conllus, '_annotated/locness.json')

In [13]:
# Reload the saved annotated LOCNESS corpus and preview its first rows.
locness = pd.read_json('_annotated/locness.json')
locness.head(3)

Unnamed: 0,filename,code,text,language,speaker_type,dialect,topic,task_type,level,location,annotated
0,locness\alevels1.txt,Transport 01,The basic dilema facing the UK's rail and road...,english,L1,british,Transport,argumentative,a-level,,"# generator = UDPipe 2, https://lindat.mff.cun..."
1,locness\alevels1.txt,Transport 02,Traffic jams are becoming larger and more freq...,english,L1,british,Transport,argumentative,a-level,,"# generator = UDPipe 2, https://lindat.mff.cun..."
2,locness\alevels1.txt,Transport 03,As transport has advanced over the past 200 ye...,english,L1,british,Transport,argumentative,a-level,,"# generator = UDPipe 2, https://lindat.mff.cun..."


## RLC
* All texts are used
* Spell-checking is performed
* Annotation DONE

In [9]:
# rlc = load_clean_corpus_from_json('_clean/rlc.json')

(2002, 13)


In [10]:
# The spell-checking call is kept commented out; its saved result is loaded instead.
# rlc = spellcheck_corpus(rlc, '_spell_checked/rlc.json')
rlc = pd.read_json('_spell_checked/rlc.json')

In [11]:
# Write one .txt file per spell-checked text and the annotate/rlc.bat script of UDPipe requests.
out_files = create_raw_files(rlc['spell_checked'].tolist(), corpus_name='rlc', model=RUS_MODEL)

In [14]:
# RUN .bat with Command Prompt, then the code below
# (collects the .json outputs and compares them with the expected files)
conllus = check_annotated_files('rlc', out_files)

All OK!


In [15]:
# rlc = get_annotations(rlc, conllus, '_annotated/rlc.json')

In [16]:
# Reload the saved annotated RLC corpus and preview its first rows.
rlc = pd.read_json('_annotated/rlc.json')
rlc.head(3)

Unnamed: 0,document_id,text,corrected,status,subcorpus,native,language_background,level,words,sentences,language,speaker_type,prompt,spell_checked,annotated
0,1,Загрязнение тяжелыми металлами Дальнегорского ...,Загрязнение тяжелыми металлами Дальнегорского ...,,RULEC,eng,HL,AM,431,22,russian,L2,,Загрязнение тяжелыми металлами Дальнегорского ...,"# generator = UDPipe 2, https://lindat.mff.cun..."
1,3,Директору магазина « Адидас» М. И. Васильченко...,Директору магазина « Адидас» М. И. Васильченко...,,RULEC,eng,HL,AM,245,17,russian,L2,,Директору магазина « Адидас» М. И. Васильченко...,"# generator = UDPipe 2, https://lindat.mff.cun..."
2,5,"Вывод. Спасибо, ребята, за хорошие ответы. Я м...","Вывод. Спасибо, ребята, за хорошие ответы. Я м...",,RULEC,eng,FL,AM,472,22,russian,L2,,"Вывод. Спасибо, ребята, за хорошие ответы. Я м...","# generator = UDPipe 2, https://lindat.mff.cun..."


## RULEC
* Only texts of type "paragraph" are used
* Spell-checking is performed
* Annotation DONE

In [20]:
# rulec = load_clean_corpus_from_json('_clean/rulec.json')

In [21]:
def extract_task(text, task):
    """Remove task prompts and angle-bracket tags from a text.

    Task prompts are the given ``task`` (when it is a non-empty string) plus
    every ``<...>`` span of ``text`` that contains no lowercase ASCII letter.
    Each prompt is removed from the text, then any remaining ``<...>`` tag is
    stripped as well.

    Args:
        text: The raw text.
        task: Known task prompt, or a missing value (``None``/NaN/empty).

    Returns:
        ``(clean_text, tasks)`` where ``tasks`` is the newline-joined list of
        prompts that were found.
    """
    tasks = []
    # When the function is applied to DataFrame rows a missing task may arrive
    # as NaN, which is truthy but has no .strip(); only a non-empty string is
    # treated as a known task.
    if isinstance(task, str) and task:
        tasks.append(task)
    tasks.extend(re.findall(r'<[^a-z]+?>', text))
    # A separate loop name keeps the `task` parameter from being overwritten.
    for found_task in tasks:
        text = text.replace(found_task.strip(), '')
    text = re.sub(r'<.+?>', '', text)
    return text.strip(), '\n'.join(tasks)

In [22]:
def extract_title(text):
    """Split a leading title line off a text.

    The first line counts as a title when the text starts with 1-85 arbitrary
    characters followed by a letter, ``»`` or ``)`` directly before the first
    line break. Returns ``(title, rest)``, or ``(NaN, text)`` when there is no
    such line.
    """
    has_title = re.match(r'.{1,85}[А-ЯA-Zа-яa-zЁё»)]\n', text) is not None
    if not has_title:
        return np.nan, text
    title, _, body = text.partition('\n')
    return title, body

In [23]:
# rulec = rulec[rulec['text type'] == 'paragraph']  # filter texts
# rulec[['text', 'task']] = rulec.apply(lambda x: extract_task(x['text'], x['task']),
#                                       axis=1, result_type='expand')
# rulec[['title', 'text']] = rulec.apply(lambda x: extract_title(x['text']),
#                                        axis=1, result_type='expand')
# print(rulec.shape)

In [24]:
# rulec = spellcheck_corpus(rulec, '_spell_checked/rulec.json')
# rulec = pd.read_json('_spell_checked/rulec.json')

In [25]:
# out_files = create_raw_files(rulec['spell_checked'].tolist(), corpus_name='rulec', model=RUS_MODEL)

In [26]:
# RUN .bat with Command Prompt, then the code below
# conllus = check_annotated_files('rulec', out_files)

In [27]:
# rulec = get_annotations(rulec, conllus, '_annotated/rulec.json')

In [28]:
# Reload the saved annotated RULEC corpus and preview its first rows.
rulec = pd.read_json('_annotated/rulec.json')
rulec.head(3)

Unnamed: 0,filename,text,student,gender,speaker_type,level,year,course,week,text type,function,time,mode,language,task,title,spell_checked,annotated
0,Anna_HL_2009-2010_Week_19_2_paragraph+_descrip...,"Торговля людьми- популярная, широко обсуждаема...",Anna,f,HL,AL,2009-2010,Russian In The Major,192,paragraph,description,non-timed,individual,russian,<Представьте термин из области вашей специаль...,Торговля людьми,"Торговля людьми- популярная, широко обсуждаема...","# generator = UDPipe 2, https://lindat.mff.cun..."
1,Anna_HL_AL_2009-2010_Week_12_1_paragraph_summa...,Автор статьи излагает положительные и отрицате...,Anna,f,HL,AL,2009-2010,American Studies,121,paragraph,summary,non-timed,individual,russian,,,Автор статьи излагает положительные и отрицате...,"# generator = UDPipe 2, https://lindat.mff.cun..."
2,Anna_HL_AL_2009-2010_Week_12_2_paragraph_suppo...,"Я считаю, что Сталин любил этот стил архитекту...",Anna,f,HL,AL,2009-2010,European Studies,122,paragraph,supported opinion,"timed, 10 min",individual,russian,,,"Я считаю, что Сталин любил этот стиль архитект...","# generator = UDPipe 2, https://lindat.mff.cun..."


## REALEC
* Only texts with a specified task (graph/essay) are used
* Only texts with either a grade or a CEFR level are used
* Spell-checking is performed
* Annotation DONE

In [29]:
# realec = load_clean_corpus_from_json('_clean/realec.json')

In [30]:
# filter texts
# realec = realec[((realec['mark'].notnull()) | (realec['CEFR_level'].notnull())) &
#                 (realec['text_type'].notnull())]
# print(realec.shape)

In [31]:
# realec = spellcheck_corpus(realec, '_spell_checked/realec.json')
# realec = pd.read_json('_spell_checked/realec.json')

In [32]:
# out_files = create_raw_files(realec['spell_checked'].tolist(), corpus_name='realec', model=ENG_MODEL)

In [33]:
# RUN .bat with Command Prompt, then the code below
# conllus = check_annotated_files('realec', out_files)

In [34]:
# realec = get_annotations(realec, conllus, '_annotated/realec.json')

In [35]:
# Reload the saved annotated REALEC corpus and preview its first rows.
realec = pd.read_json('_annotated/realec.json')
realec.head(3)

Unnamed: 0,filename,annotation,text,sex,mark,study_year,date,ielts,work_type,text_type,ann_checked,CEFR_level,year,essay_title,language,speaker_type,spell_checked,annotated
0,esl_00011.ann,T3\tAbsence_comp_sent 279 283\tdeal\n#3\tAnnot...,This episode is about a very interesting case ...,f,80,4.0,2016-10-10,1,exam,opinion essay,,,,,english,L2,Tus episode is about a very interesting case i...,"# generator = UDPipe 2, https://lindat.mff.cun..."
1,AAl_10_1.ann,T1\tAgreement_errors 14 23\tvisualize\nT4\tRed...,Given diagram visualize the proportion of popu...,m,55,2.0,2014-03-28,1,exam,graph description,,,,,english,L2,Given diagram visualize the proportion of popu...,"# generator = UDPipe 2, https://lindat.mff.cun..."
2,AAl_10_2.ann,T1\tSpelling 389 391\tan\nT2\tSpelling 407 410...,I strongly disagree with the given assumption ...,m,65,2.0,2014-03-28,1,exam,opinion essay,,,,,english,L2,I strongly disagree with the given assumption ...,"# generator = UDPipe 2, https://lindat.mff.cun..."


## ACTR
* Group work is excluded
* Spell-checking is performed
* Annotation DONE

In [36]:
# actr = load_clean_corpus_from_json('_clean/actr.json')

In [37]:
def extract_actr_title(text):
    """Split a leading title line off an ACTR text.

    The first line is taken as the title when either
    * it is shorter than 100 characters and does not look like a sentence
      (it does not end in ``,`` or ``.``), or
    * it is shorter than 46 characters and does not end in ``,``.

    Returns ``(title, rest)``, or ``(NaN, text)`` when no title is detected.
    """
    first, _, remainder = text.partition('\n')
    ends_like_sentence = re.search('^-?.+?[,.]$', first) is not None
    short_non_sentence = len(first) < 100 and not ends_like_sentence
    very_short_line = len(first) < 46 and not first.endswith(',')
    if short_non_sentence or very_short_line:
        return first, remainder
    return np.nan, text

In [38]:
# actr[['title', 'text']] = actr.apply(lambda x: extract_actr_title(x['text']),
#                                      axis=1, result_type='expand')

In [39]:
# actr = spellcheck_corpus(actr, '_spell_checked/actr.json')
# actr = pd.read_json('_spell_checked/actr.json')

In [40]:
# out_files = create_raw_files(actr['spell_checked'].tolist(), corpus_name='actr', model=RUS_MODEL)

In [41]:
# RUN .bat with Command Prompt, then the code below
# conllus = check_annotated_files('actr', out_files)

In [42]:
# actr = get_annotations(actr, conllus, '_annotated/actr.json')

In [43]:
# Reload the saved annotated ACTR corpus and preview its first rows.
actr = pd.read_json('_annotated/actr.json')
actr.head(3)

Unnamed: 0,path,raw_name,text,speaker_id,country,gender,level,age,prompt,institution,...,native_lang,name,school_grade,dob,date,language,speaker_type,title,spell_checked,annotated
0,actr\Zhenyas data\Essay Contest Heritage 1\Num...,hs1_50,Прокатившись по многим городам и странам за по...,HS1-50,,,Heritage 1,,,,...,,,,,NaT,russian,L1,"""Место Любимое Моё""",Прокатившись по многим городам и странам за по...,"# generator = UDPipe 2, https://lindat.mff.cun..."
1,actr\Zhenyas data\Essay Contest Heritage 1\Num...,hs1_30,В моей жизни я встречала много разных людей. У...,HS1-30,,,Heritage 1,,,,...,,,,,NaT,russian,L1,,В моей жизни я встречала много разных людей. У...,"# generator = UDPipe 2, https://lindat.mff.cun..."
2,actr\Zhenyas data\Essay Contest Heritage 1\Num...,hs1_24,"В прошлем году, я провела четыре месяцев в Лон...",HS1-24,,,Heritage 1,,,,...,,,,,NaT,russian,L1,,"В прошлом году, я провела четыре месяцев в Лон...","# generator = UDPipe 2, https://lindat.mff.cun..."


## CoRST / КРУТ
* Only texts of certain types are used
* No spell-checking
* Annotation DONE

In [44]:
# corst = load_clean_corpus_from_json('_clean/corst.json')

In [45]:
# filter texts
# corst = corst[corst['genre'].isin({'аннотация', 'аннотация проекта', 'эссе',
#                                    'коммерческое предложение', 'семестровая работа'})]
# print(corst.shape)

In [46]:
# out_files = create_raw_files(corst['text'].tolist(), corpus_name='corst', model=RUS_MODEL)

In [47]:
# RUN .bat with Command Prompt, then the code below
# conllus = check_annotated_files('corst', out_files)

In [48]:
# corst = get_annotations(corst, conllus, '_annotated/corst.json')

In [49]:
# Reload the saved annotated CoRST corpus and preview its first rows.
corst = pd.read_json('_annotated/corst.json')
corst.head(3)

Unnamed: 0,doc_id,text,created,author,date1,date2,genre,gender,major,course,term,module,domain,university,words,sentences,language,speaker_type,annotated
0,2,задание№ 1 В самом раннем детстве меня привлек...,1435876038000,Судакова Мария,2013.0,2014.0,эссе,f,Дизайн,2 курс бак,2.0,4.0,разнородная тематика,,518,42,russian,L1,"# generator = UDPipe 2, https://lindat.mff.cun..."
1,5,Многие называют Мальту игрушечной страной. И п...,1435876038000,Бардина Анастасия,2013.0,2014.0,эссе,f,Дизайн,2 курс бак,2.0,4.0,разнородная тематика,,291,23,russian,L1,"# generator = UDPipe 2, https://lindat.mff.cun..."
2,6,"Так , что же такое дизайн вообще и дизайн сред...",1435876038000,Калашян Юлия,2013.0,2014.0,эссе,f,Дизайн,2 курс бак,2.0,4.0,разнородная тематика,,535,24,russian,L1,"# generator = UDPipe 2, https://lindat.mff.cun..."


## Combining dataframes to estimate length distribution

In [9]:
def parse_whole_corpus(texts):
    """Run ``parse`` on every text, showing a progress bar, and return the results as a list."""
    parsed = []
    for text in tqdm(texts):
        parsed.append(parse(text))
    return parsed

In [10]:
def get_len_words(conl):
    """Return the total number of tokens over all sentences, without any tag filtering.

    NOTE(review): this redefines the ``get_len_words`` declared earlier in the
    notebook, which skips tokens tagged PUNCT, SYM or '_'. On a top-to-bottom
    run this version is the one in effect from here on; confirm which count is
    intended.
    """
    return sum(len(sent) for sent in conl)

In [11]:
# All annotated corpora except the Reddit and Pikabu samples.
# os.listdir returns entries in arbitrary order, so the paths are sorted to make
# the loading order (and the row order of the combined frame) reproducible.
corpora = sorted(os.path.join('_annotated/', file) for file in os.listdir('_annotated')
                 if 'reddit' not in file and 'pikabu' not in file)
corpora

['_annotated/actr.json',
 '_annotated/corst.json',
 '_annotated/locness.json',
 '_annotated/realec.json',
 '_annotated/rlc.json',
 '_annotated/rulec.json']

In [12]:
# Load every corpus file and label its rows with the file stem.
corp_dfs = []
for file in tqdm(corpora):
    corpus_name = os.path.splitext(os.path.basename(file))[0]
    corpus = pd.read_json(file, orient='records')
    corpus['corpus'] = corpus_name
    corp_dfs.append(corpus)

  0%|          | 0/6 [00:00<?, ?it/s]

In [13]:
# Each corpus frame is read with its own index (the previews above all start at
# 0), so a plain concat would repeat index labels across corpora; ignore_index
# gives the combined frame one unique running index.
data = pd.concat(corp_dfs, ignore_index=True)
data.dropna(how='all', axis=0, inplace=True)  # rows without any value
data.dropna(how='all', axis=1, inplace=True)  # columns without any value
data.fillna(np.nan, inplace=True)
print(data.shape)

(9403, 63)


In [14]:
# Keep only the corpus metadata and the annotation needed for the length estimate.
data = data[['corpus', 'subcorpus', 'language', 'speaker_type', 'annotated']]
print(data.shape)
data.head(3)

(9403, 5)


Unnamed: 0,corpus,subcorpus,language,speaker_type,annotated
0,actr,,russian,L1,"# generator = UDPipe 2, https://lindat.mff.cun..."
1,actr,,russian,L1,"# generator = UDPipe 2, https://lindat.mff.cun..."
2,actr,,russian,L1,"# generator = UDPipe 2, https://lindat.mff.cun..."


In [15]:
# Parse every annotation string with parse_whole_corpus and keep the result per text.
data['conllu'] = parse_whole_corpus(data['annotated'].tolist())

  0%|          | 0/9403 [00:00<?, ?it/s]

In [16]:
# Length of every text as computed by get_len_words, followed by summary statistics.
data['num_words'] = data['conllu'].map(get_len_words)
data['num_words'].describe()

count    9403.000000
mean      351.011486
std       377.599445
min         0.000000
25%       165.000000
50%       254.000000
75%       380.000000
max      7931.000000
Name: num_words, dtype: float64

### Pikabu
* 20k texts with similar length distribution
* No spell-checking
* Annotation DONE

In [61]:
# Read the Pikabu dump in chunks of 500,000 rows (with a progress bar) and join the chunks.
pikabu_reader = pd.read_csv('_clean/pikabu_large.tsv', sep='\t', chunksize=500000)
chunks = list(tqdm(pikabu_reader))
pikabu = pd.concat(chunks)
print(pikabu.shape)
pikabu.head(3)

0it [00:00, ?it/s]

(3005828, 9)


Unnamed: 0,doc_id,url,title,text,date,author_id,pluses,minuses,num_words
0,6991642,https://pikabu.ru/story/chto_mozhno_kupit_v_ki...,Что можно купить в Китае за цену нового iPhone...,Осенью в России стартовали продажи очередной м...,1571221527,2900955,9,13,1029
1,7178566,https://pikabu.ru/story/posledniy_ostavshiysya...,Последний оставшийся в живых освободитель Осве...,В канун 75-летия освобождения концлагеря и V В...,1579586602,1723707,1498,159,1690
2,7021067,https://pikabu.ru/story/zima_v_tyumen_prishla_...,Зима в Тюмень пришла.,И в честь этого я сочинил свой первый пирожок....,1572537738,2473821,517,64,18


In [62]:
# Constant metadata shared by all Pikabu texts.
for column, value in (('corpus', 'pikabu'), ('language', 'russian'), ('speaker_type', 'L1')):
    pikabu[column] = value
# Columns that are not carried over into the sample.
pikabu.drop(columns=['pluses', 'minuses', 'url'], inplace=True)

### Reddit
* 20k texts with similar length distribution
* No spell-checking
* Annotation DONE

In [63]:
# Read the Reddit dump in chunks of 500,000 rows (with a progress bar) and join the chunks.
reddit_reader = pd.read_csv('_clean/reddit_large.tsv', sep='\t', chunksize=500000)
chunks = list(tqdm(reddit_reader))
reddit = pd.concat(chunks)
print(reddit.shape)
reddit.head(3)

0it [00:00, ?it/s]

(1763029, 9)


Unnamed: 0,subreddit_id,content_len,summary_len,text,title,doc_id,prompt,subreddit,author_id
0,t5_2qore,404,7,"You are talking about the Charsi imbue, right?...",D2 help?,c6acxvc,Class only items dropped from high-lvl monsters.,Diablo,NuffZetPand0ra
1,t5_2qore,96,23,Change out force armor for pinpoint barrier (+...,Initial Impressions of CM Wizards in 1.0.5,c6dvxzf,"CC, LoH, and APoC before DPS (in my opinion). ...",Diablo,zlevine
2,t5_2qore,380,46,Guess I'll throw my .02 in here. Spec(Spectral...,Initial Impressions of CM Wizards in 1.0.5,c6dm81w,Spectral Blades - Deep Cuts still procs CM fro...,Diablo,Nekenieh


In [64]:
# Constant metadata shared by all Reddit texts.
for column, value in (('corpus', 'reddit'), ('language', 'english'), ('speaker_type', 'L1')):
    reddit[column] = value
# Use the same length column name as the Pikabu data.
reddit.rename(columns={'content_len': 'num_words'}, inplace=True)
# Columns that are not carried over into the sample.
reddit.drop(columns=['summary_len', 'subreddit', 'subreddit_id'], inplace=True)

### Get length distribution & sample from datasets
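We round `num_words` to the nearest multiple of 5, 10 or 25 (depending on the text length) and treat each rounded value as a separate class, then sample 20,000 texts from both Pikabu and Reddit, weighting the texts so that the samples follow the class distribution of the other corpora.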

In [65]:
def round_to_classes(x):
    """Round a word count to its length class.

    Counts below 500 are rounded to a multiple of 5, counts below 1000 to a
    multiple of 10, and larger counts to a multiple of 25 (using the built-in
    ``round``).
    """
    if x < 500:
        step = 5
    elif x < 1000:
        step = 10
    else:
        step = 25
    return step * round(x / step)

In [84]:
data['num_class'] = data['num_words'].map(round_to_classes)
# Number of texts per length class, as {class: count}. The dict is built directly
# from value_counts() so that it does not depend on the column names that
# reset_index() produces, which differ between pandas versions.
classes = data['num_class'].value_counts().to_dict()

In [85]:
# Assign every Pikabu and Reddit text to a length class with round_to_classes.
pikabu['num_class'] = pikabu['num_words'].map(round_to_classes)
reddit['num_class'] = reddit['num_words'].map(round_to_classes)

In [86]:
# Number of texts per length class in `data` (the target distribution).
target_counts = pd.Series(classes)

# Weight of a text = target count of its class / count of that class in the source.
pikabu_class_weights = target_counts / pikabu['num_class'].value_counts()
pikabu_weights = pikabu['num_class'].map(pikabu_class_weights)
psample = pikabu.sample(20000, random_state=RANDOM_STATE, weights=pikabu_weights)

reddit_class_weights = target_counts / reddit['num_class'].value_counts()
reddit_weights = reddit['num_class'].map(reddit_class_weights)
rsample = reddit.sample(20000, random_state=RANDOM_STATE, weights=reddit_weights)

In [87]:
# psample.to_json('_clean/pikabu.json', orient='records', force_ascii=False, indent=4)
# rsample.to_json('_clean/reddit.json', orient='records', force_ascii=False, indent=4)

### Annotate Pikabu & Reddit

In [9]:
# Load the saved Pikabu and Reddit samples.
psample = pd.read_json('_clean/pikabu.json', orient='records')
rsample = pd.read_json('_clean/reddit.json', orient='records')

In [10]:
# Write one .txt file per Pikabu text and the annotate/pikabu.bat script of UDPipe requests.
out_files = create_raw_files(psample['text'].tolist(), corpus_name='pikabu', model=RUS_MODEL)

In [11]:
# RUN .bat with Command Prompt, then the code below
# (collects the .json outputs and compares them with the expected files)
conllus = check_annotated_files('pikabu', out_files)

All OK!


In [12]:
# psample = get_annotations(psample, conllus, '_annotated/pikabu.json')

In [13]:
# Reload the saved annotated Pikabu sample and preview its first rows.
psample = pd.read_json('_annotated/pikabu.json')
psample.head(3)

Unnamed: 0,doc_id,title,text,date,author_id,num_words,corpus,language,speaker_type,num_class,annotated
0,2512869,Герман Обухов: Сбитая Россия!!!,В августе 1983 года во время правления\nЮрия А...,2014-07-26 06:33:36,681246,926,pikabu,russian,L1,930,"# generator = UDPipe 2, https://lindat.mff.cun..."
1,3815335,Почему инопланетяне не выходят с нами на контакт,"На дороге лежит червяк, и вы проходите мимо не...",2015-11-30 08:22:37,1128238,201,pikabu,russian,L1,200,"# generator = UDPipe 2, https://lindat.mff.cun..."
2,5530814,Ремарка про молодое поколение,Сделаю пост в качестве места тыка для тех кто ...,2017-12-03 17:03:41,704721,264,pikabu,russian,L1,265,"# generator = UDPipe 2, https://lindat.mff.cun..."


In [14]:
# Write one .txt file per Reddit text and the annotate/reddit.bat script of UDPipe requests.
out_files = create_raw_files(rsample['text'].tolist(), corpus_name='reddit', model=ENG_MODEL)

In [15]:
# RUN .bat with Command Prompt, then the code below
# (collects the .json outputs and compares them with the expected files)
conllus = check_annotated_files('reddit', out_files)

All OK!


In [16]:
# rsample = get_annotations(rsample, conllus, '_annotated/reddit.json')

In [17]:
# Reload the saved annotated Reddit sample and preview its first rows.
rsample = pd.read_json('_annotated/reddit.json')
rsample.head(3)

Unnamed: 0,num_words,text,title,doc_id,prompt,author_id,corpus,language,speaker_type,num_class,annotated
0,450,If you've visited the comments on any recent O...,The Ouya-hate phenomenon: one theory.,t3_1a8mz4,Ouya has received disproportionate criticism a...,[deleted],reddit,english,L1,450,"# generator = UDPipe 2, https://lindat.mff.cun..."
1,256,"Hi everyone, \n I run a small web-site – AGRO...",Request for volunteer writers for agricultural...,t3_1rint4,We run a website for farmers in developing cou...,AgroamTech,reddit,english,L1,255,"# generator = UDPipe 2, https://lindat.mff.cun..."
2,216,"Firstly, I'd like to say how grateful I am to ...",[meta] Posts in this sub are starting to breac...,t3_220e9l,Plenty of videos do not seem to relate to Lit....,Wallstonecraft,reddit,english,L1,215,"# generator = UDPipe 2, https://lindat.mff.cun..."


## Combining datasets

In [11]:
# Every file in _annotated/. os.listdir returns entries in arbitrary order, so
# the paths are sorted to make the row order of the combined corpus reproducible.
corpora = sorted(os.path.join('_annotated/', file) for file in os.listdir('_annotated'))
corpora

['_annotated/actr.json',
 '_annotated/corst.json',
 '_annotated/locness.json',
 '_annotated/pikabu.json',
 '_annotated/realec.json',
 '_annotated/reddit.json',
 '_annotated/rlc.json',
 '_annotated/rulec.json']

In [12]:
# Load every corpus file and label its rows with the file stem.
corp_dfs = []
for file in tqdm(corpora):
    corpus_name = os.path.splitext(os.path.basename(file))[0]
    corpus = pd.read_json(file, orient='records')
    corpus['corpus'] = corpus_name
    corp_dfs.append(corpus)

  0%|          | 0/8 [00:00<?, ?it/s]

In [13]:
# Each corpus frame is read with its own index (the previews above all start at
# 0), so a plain concat would repeat index labels across corpora; ignore_index
# gives the combined frame one unique running index.
data = pd.concat(corp_dfs, ignore_index=True)
data.dropna(how='all', axis=0, inplace=True)  # rows without any value
data.dropna(how='all', axis=1, inplace=True)  # columns without any value
data.fillna(np.nan, inplace=True)
print(data.shape)

(49401, 66)


In [14]:
# Give corpus-specific columns the names used in the combined corpus.
data.rename(columns={'annotation': 'error_annotation',
                     'corrected': 'corrected_text',
                     'time': 'timed'}, inplace=True)

# `course` mixes study-year labels with other values. Copy the study-year labels
# into their own column and blank them in `course`, so that the two kinds of
# value end up in different output columns below.
data['study_year2'] = data['course']
data.loc[~data['study_year2'].isin({'2 курс бак', '1 курс бак', '4 курс бак',
                                    '2 курс спец', '3 курс бак'}), 'study_year2'] = np.nan
data.loc[data['study_year2'].notnull(), 'course'] = np.nan
# Leave out texts whose mode is 'pairs' or 'group'.
data = data[~data['mode'].isin({'pairs', 'group'})]
# Rows of the RULEC subcorpus are labelled as the rulec corpus.
data.loc[data['subcorpus'] == 'RULEC', 'corpus'] = 'rulec'

# Output column -> source columns in priority order; combine_columns keeps the
# first non-null value of each row. The entries are processed in this order.
merged_columns = {
    'gender': ('gender', 'sex'),
    'L1': ('native_lang', 'native'),
    'author_name': ('name', 'author', 'student'),
    'institution': ('institution', 'location', 'university'),
    'doc_id': ('doc_id', 'document_id'),
    # `author_id` itself comes first: the Pikabu and Reddit files already have
    # this column, and rebuilding it from speaker_id/code alone replaced their
    # values with NaN.
    'author_id': ('author_id', 'speaker_id', 'code'),
    'prompt': ('title', 'task', 'topic', 'prompt', 'domain'),
    'text_type': ('text_type', 'text type', 'genre', 'task_type'),
    'date': ('date', 'date1', 'date2', 'year'),
    'study_year': ('study_year', 'study_year2'),
    'programme': ('major', 'course'),
}
for target, sources in merged_columns.items():
    data[target] = data.apply(
        lambda row, sources=sources: combine_columns(*(row[col] for col in sources)),
        axis=1)

In [15]:
# Column layout of the combined corpus.
final_columns = [
    # corpus and language
    'corpus', 'subcorpus', 'language', 'speaker_type', 'dialect', 'language_background',
    # text versions and assessment
    'text', 'spell_checked', 'annotated', 'error_annotation', 'corrected_text', 'mark',
    # task
    'text_type', 'prompt', 'function', 'date', 'timed',
    # author
    'author_id', 'author_name', 'gender', 'age', 'L1', 'level',
    # study context
    'institution', 'programme', 'study_year', 'term', 'module', 'week',
]
data = data[final_columns]
print(data.shape)
data.head(3)

(49397, 29)


Unnamed: 0,corpus,subcorpus,language,speaker_type,dialect,language_background,text,spell_checked,annotated,error_annotation,...,gender,age,L1,level,institution,programme,study_year,term,module,week
0,actr,,russian,L1,,,Прокатившись по многим городам и странам за по...,Прокатившись по многим городам и странам за по...,"# generator = UDPipe 2, https://lindat.mff.cun...",,...,,,,Heritage 1,,,,,,
1,actr,,russian,L1,,,В моей жизни я встречала много разных людей. У...,В моей жизни я встречала много разных людей. У...,"# generator = UDPipe 2, https://lindat.mff.cun...",,...,,,,Heritage 1,,,,,,
2,actr,,russian,L1,,,"В прошлем году, я провела четыре месяцев в Лон...","В прошлом году, я провела четыре месяцев в Лон...","# generator = UDPipe 2, https://lindat.mff.cun...",,...,,,,Heritage 1,,,,,,


In [16]:
# Save the combined corpus as a tab-separated file, without the index column.
data.to_csv('SyntCompCorpus.tsv', sep='\t', index=False)