## Using the RNC to look at prepositions that occur within token windows around verbs

In [2]:
import random

In [1]:
from corus import load_morphoru_rnc

# Read the corpus file and keep every record in a list so it can be
# measured, sampled from and iterated several times by the cells below.
path = 'RNCgoldInUD_Morpho.conll'
records = load_morphoru_rnc(path)
rnccorpus = list(records)

In [3]:
# number of records (sentences) loaded into the corpus list
len(rnccorpus)

98892

In [8]:
# example of data: print one record picked at random from the corpus
# (no random seed is set in the cells shown, so the pick changes between runs)
print(random.choice(rnccorpus))

MorphoSent(tokens=[MorphoToken(text='2', lemma='2', pos='NUM', feats={'NumForm': 'Digit'}, feats2={}), MorphoToken(text=')', lemma=')', pos='PUNCT', feats={}, feats2={}), MorphoToken(text='Хоть', lemma='хоть', pos='CONJ', feats={}, feats2={}), MorphoToken(text='и', lemma='и', pos='CONJ', feats={}, feats2={}), MorphoToken(text='на', lemma='на', pos='ADP', feats={}, feats2={}), MorphoToken(text='данном', lemma='данный', pos='ADJ', feats={'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing', 'Variant': 'Full'}, feats2={}), MorphoToken(text='этапе', lemma='этап', pos='NOUN', feats={'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing'}, feats2={}), MorphoToken(text='этот', lemma='этот', pos='DET', feats={'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}, feats2={}), MorphoToken(text='сюжет', lemma='сюжет', pos='NOUN', feats={'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}, feats2={}), MorphoToken(text='действительно', lemma='действительно', pos='H', feats

### script for parsing corpus

In [16]:
# Collect the distinct part-of-speech tags used anywhere in the corpus.
sample = rnccorpus
postags = {token.pos for sent in sample for token in sent.tokens}
postags

{'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'H',
 'INTJ',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'PUNCT',
 'SYM',
 'VERB',
 'X'}

In [41]:
def build_sentence(tokens, currentindex, adpindex):
    """Join tokens into one display string with two tokens upper-cased.

    Args:
        tokens: sequence of objects exposing ``.text`` and ``.pos``.
        currentindex: position of the verb token to highlight.
        adpindex: position of the preposition token to highlight.

    Returns:
        The token texts concatenated with single spaces, except that tokens
        whose ``pos`` is ``'PUNCT'`` are attached directly to the preceding
        text. The tokens at ``currentindex`` and ``adpindex`` are upper-cased,
        a token whose text is ``None`` contributes an empty string, and the
        result is stripped of leading/trailing whitespace.
    """
    highlighted = (currentindex, adpindex)
    pieces = []
    for i, token in enumerate(tokens):
        # Replace a missing text *before* calling any string method on it;
        # upper-casing first would raise AttributeError for a highlighted
        # token whose text is None.
        text = '' if token.text is None else token.text
        if i in highlighted:
            text = text.upper()
        # punctuation is glued to whatever precedes it
        space = '' if token.pos == 'PUNCT' else ' '
        pieces.append(space + text)
    return ''.join(pieces).strip()

# For every window size 1..4, count the prepositions (ADP tokens) found within
# that many tokens on either side of each verb, and keep the distinct example
# sentences with the verb and preposition highlighted.
# windowList ends up holding one dictionary per window size:
#   {verb lemma: {'counts': {prep lemma: n}, 'sentences': {prep lemma: [str, ...]}}}
sample = rnccorpus
windowList = []
for window in range(1, 5):
    dataDict = {}
    for sent in sample:
        tokens = sent.tokens
        for verbpos, token in enumerate(tokens):
            if token.pos != 'VERB':
                continue
            # every verb gets an entry, even if no preposition is ever found near it
            entry = dataDict.setdefault(token.lemma, {'counts': {}, 'sentences': {}})
            # clamp the window to the sentence boundaries
            first = max(verbpos - window, 0)
            last = min(verbpos + window, len(tokens) - 1)
            for adppos in range(first, last + 1):
                neighbour = tokens[adppos]
                if neighbour.pos != 'ADP':
                    continue
                prep = neighbour.lemma
                if prep not in entry['counts']:
                    entry['counts'][prep] = 0
                    entry['sentences'][prep] = []
                entry['counts'][prep] += 1
                example = build_sentence(tokens, verbpos, adppos)
                # store each highlighted sentence only once per verb/preposition pair
                if example not in entry['sentences'][prep]:
                    entry['sentences'][prep].append(example)
    print(f'window size of {window} complete...')
    windowList.append(dataDict)

window size of 1 complete...
window size of 2 complete...
window size of 3 complete...
window size of 4 complete...


In [42]:
# removing uncommon verbs (<10 occurrences)
# A verb's total is the sum of its preposition counts inside one window dictionary.
# uncommonVerbs is shared across the dictionaries (as before), so a verb flagged in
# an earlier dictionary is also dropped from every later one.
uncommonVerbs = set()
for windowDict in windowList:
    for verb, verbdata in windowDict.items():
        if sum(verbdata['counts'].values()) < 10:
            uncommonVerbs.add(verb)
    # deletion happens after the scan so the dictionary is never resized while iterating
    for verbtoremove in uncommonVerbs:
        # pop with a default instead of `del`: a verb accumulated from an earlier
        # dictionary may be absent from this one, which would raise KeyError
        windowDict.pop(verbtoremove, None)

In [43]:
# number of verbs remaining in the first window dictionary after the filtering above
len(windowList[0])

433

In [44]:
# Verbs that survived the frequency filter (keys of the first window dictionary);
# iterating a dict yields its keys in insertion order.
okayverbs = [verb for verb in windowList[0]]
print(okayverbs[:10])

['сказать', 'найти', 'рассказать', 'находить', 'знать', 'плакать', 'смеяться', 'рассказывать', 'ввести', 'вводить']


In [11]:
# show the current working directory; the relative output paths used below are resolved against it
pwd

'/Users/paigelee/Desktop/spring2021/clancy/verbinvestigation/data'

In [12]:
# Persist the list of per-window dictionaries as a single JSON document.

import json
filepath = '../data.json'

# ensure_ascii=False writes the Cyrillic lemmas and sentences as-is instead of \u escapes
with open(filepath, mode='w', encoding='utf8') as out_handle:
    json.dump(windowList, out_handle, ensure_ascii=False)

print(filepath,'written successfully.')

../data.json written successfully.


## breaking down by cases as well

In [45]:
def build_sentence(tokens, currentindex, adpindex):
    """Render a token sequence as text, upper-casing the verb and the preposition.

    This notebook defines the same helper in an earlier cell as well; the
    definition here replaces it when this cell runs.

    Args:
        tokens: sequence of objects exposing ``.text`` and ``.pos``.
        currentindex: position of the verb token to highlight.
        adpindex: position of the preposition token to highlight.

    Returns:
        The token texts joined by single spaces, with ``'PUNCT'`` tokens
        attached directly to the preceding text, the two highlighted tokens
        upper-cased, ``None`` texts rendered as empty strings, and surrounding
        whitespace stripped.
    """
    marked = {currentindex, adpindex}
    chunks = []
    for position, token in enumerate(tokens):
        raw = token.text
        # Guard against a missing text first: calling .upper() on None for a
        # highlighted token would raise AttributeError.
        if raw is None:
            raw = ''
        shown = raw.upper() if position in marked else raw
        # no separating space in front of punctuation
        prefix = '' if token.pos == 'PUNCT' else ' '
        chunks.append(prefix + shown)
    return ''.join(chunks).strip()

# For each single signed offset from -4 to +4 (note: `window` is an offset here,
# not a window size), count the prepositions (ADP tokens) found at exactly that
# distance from each retained verb and keep the distinct highlighted sentences.
# windowList ends up holding one dictionary per offset:
#   {verb lemma: {'counts': {prep lemma: n}, 'sentences': {prep lemma: [str, ...]}}}
# sample = random.sample(rnccorpus,50)
sample = rnccorpus
# okayverbs is a list; a set makes the per-token membership test constant time
okayverbset = set(okayverbs)
windowList = []
for window in range(-4, 5):
    dataDict = dict()
    for sent in sample:
        # enumerate keeps the token position correct even when a token is skipped
        for currentindex, token in enumerate(sent.tokens):
            if token.pos != 'VERB':
                continue
            verblemma = token.lemma
            if verblemma not in okayverbset:
                # Skip only this verb. The previous `break` abandoned the whole
                # sentence, so retained verbs appearing after an excluded verb
                # were never counted.
                continue
            if verblemma not in dataDict:
                dataDict[verblemma] = dict()
                dataDict[verblemma]['counts'] = dict()
                dataDict[verblemma]['sentences'] = dict()
            index = currentindex + window
            if index < 0 or index >= len(sent.tokens):
                # the offset points outside the sentence
                continue
            thistoken = sent.tokens[index]
            if thistoken.pos != 'ADP':
                continue
            adplemma = thistoken.lemma
            counts = dataDict[verblemma]['counts']
            sentences = dataDict[verblemma]['sentences']
            if adplemma not in counts:
                counts[adplemma] = 0
                sentences[adplemma] = []
            counts[adplemma] += 1
            formattedsent = build_sentence(sent.tokens, currentindex, index)
            # store each highlighted sentence only once per verb/preposition pair
            if formattedsent not in sentences[adplemma]:
                sentences[adplemma].append(formattedsent)
    print(f'window size of {window} complete...')
    windowList.append(dataDict)

window size of -4 complete...
window size of -3 complete...
window size of -2 complete...
window size of -1 complete...
window size of 0 complete...
window size of 1 complete...
window size of 2 complete...
window size of 3 complete...
window size of 4 complete...


In [35]:
# show the current working directory; the relative output path in the next cell is resolved against it
pwd

'/Users/paigelee/Desktop/spring2021/clancy/verbinvestigation/data'

In [47]:
# Save the list of per-offset dictionaries as one JSON document.

import json
filepath = '../individual_window_data1.json'

# ensure_ascii=False keeps Cyrillic text readable in the output file
with open(filepath, mode='w', encoding='utf8') as out_handle:
    json.dump(windowList, out_handle, ensure_ascii=False)

print(filepath,'written successfully.')

../individual_window_data1.json written successfully.


In [None]:
# rest is just exploratory parsing...

In [263]:
# Every preposition lemma that has a count for any verb in any dictionary of windowList.
allpreps = {
    prep
    for windowdict in windowList
    for verbdata in windowdict.values()
    for prep in verbdata['counts']
}

In [264]:
# number of distinct preposition lemmas collected in allpreps
len(allpreps)

94

In [267]:
# List the collected preposition lemmas in sorted order, one per line
# (sorted() accepts the set directly and returns a new list).
for prep in sorted(allpreps):
    print(prep)

У
без
безо
благодаря
близ
в
вблизи
ввиду
вдогонку
вдоль
взамен
включая
вместо
вне
внизу
внутри
внутрь
во
возле
вокруг
вопреки
вперед
впереди
вроде
вслед
вследствие
высокий
выше
для
до
за
из
из-за
из-под
изо
исключая
к
касательно
ко
кроме
меж
между
мимо
на
над
надо
накануне
наперекор
наподобие
напротив
насчет
о
об
обо
около
от
относительно
ото
перед
передо
плюс
по
поверх
под
подле
подобно
позади
помимо
поперек
посеред
посередине
после
посреди
посредине
посредством
превыше
прежде
при
про
против
путем
ради
с
сверх
свыше
сзади
сквозь
со
согласно
спустя
среди
типа
у
через


In [159]:
import json
# Write the verb label records to a JSON file.
# NOTE(review): `full_json_list` is not defined in any cell visible in this notebook;
# presumably it was built in a cell that is no longer here. Confirm this cell still
# works after Restart Kernel -> Run All.
filepath = 'verblabels.json'

# ensure_ascii=False keeps Cyrillic text readable in the output file
with open(filepath, mode='w', encoding='utf8') as out_handle:
    json.dump(full_json_list, out_handle, ensure_ascii=False)

print(filepath,'written successfully.')

[{'verb': 'нравиться', 'countdata': {}},
 {'verb': 'сказать', 'countdata': {'при': 1, 'в': 1}},
 {'verb': 'подавать', 'countdata': {'при': 1}},
 {'verb': 'принести', 'countdata': {'из': 1}},
 {'verb': 'хотеться', 'countdata': {'в': 1, 'об': 2}},
 {'verb': 'нести', 'countdata': {'у': 1}},
 {'verb': 'присоединять', 'countdata': {'к': 1}},
 {'verb': 'идти',
  'countdata': {'у': 1, 'на': 1, 'о': 2, 'в': 1, 'по': 1, 'без': 1, 'за': 1}},
 {'verb': 'заниматься', 'countdata': {}},
 {'verb': 'мочь', 'countdata': {'о': 1, 'на': 2, 'в': 3, 'к': 1, 'по': 1}},
 {'verb': 'закончить', 'countdata': {}},
 {'verb': 'расхохотаться', 'countdata': {}},
 {'verb': 'определить', 'countdata': {}},
 {'verb': 'иметь', 'countdata': {'к': 1, 'в': 1}},
 {'verb': 'проводить', 'countdata': {'до': 1}},
 {'verb': 'поцеловать', 'countdata': {'на': 1}},
 {'verb': 'быть',
  'countdata': {'у': 12,
   'по': 4,
   'на': 7,
   'для': 7,
   'без': 1,
   'за': 2,
   'в': 12,
   'о': 2,
   'над': 1,
   'между': 1,
   'при': 1,
 