In [1]:
import random
from corus import load_morphoru_rnc

path = 'RNCgoldInUD_Morpho.conll'

### Load corpus

In [2]:
records = load_morphoru_rnc(path)

corpus = dict()
for rec in records:
    # check for category
    for item in rec.attrs:
        if '.xhtml' in item:
            category = item.replace('==> ','').replace('.xhtml <==','')
    corpus.setdefault(category, [])
    corpus[category].append(rec)

### Load Natasha tools for Dependency Parsing

In [3]:
from ipymarkup import show_dep_ascii_markup as show_markup
from razdel import sentenize, tokenize
from navec import Navec
from slovnet import Syntax
from slovnet import Morph

In [4]:
navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
syntax = Syntax.load('slovnet_syntax_news_v1.tar')
syntax.navec(navec)
morph = Morph.load('slovnet_morph_news_v1.tar', batch_size=4)
morph.navec(navec)
print('Navec and Slovnet Syntax and Morphology loaded.')

Navec and Slovnet Syntax and Morphology loaded.


### Testing the Natasha tools

In [5]:
text = 'Однако когда мы приблизились к ним и Мила уже протянула руку , чтобы погладить их , они стремительно сорвались с места и исчезли в чаще .'

In [6]:
chunk = []
for sent in sentenize(text):
    tokens = [_.text for _ in tokenize(sent.text)]
    chunk.append(tokens)
print(chunk)

dependencies = next(syntax.map(chunk))
morphologies = next(morph.map(chunk))

[['Однако', 'когда', 'мы', 'приблизились', 'к', 'ним', 'и', 'Мила', 'уже', 'протянула', 'руку', ',', 'чтобы', 'погладить', 'их', ',', 'они', 'стремительно', 'сорвались', 'с', 'места', 'и', 'исчезли', 'в', 'чаще', '.']]


In [7]:
# Convert CoNLL-style format to source, target indices
words, deps = [], []
for token in dependencies.tokens:
    words.append(token.text)
    source = int(token.head_id) - 1
    target = int(token.id) - 1
    if source > 0 and source != target:  # skip root, loops
        deps.append([source, target, token.rel])
show_markup(words, deps)

          ┌──────► Однако       advmod
          │   ┌──► когда        mark
          │   │ ┌► мы           nsubj
    ┌──►┌─│ ┌─└─└─ приблизились advcl
  ┌►│   │ │ │      к            case
  │ │   │ │ └────► ним          obl
  │ │   │ │     ┌► и            cc
  │ │   │ │   ┌►└─ Мила         nsubj
  │ │   │ │   │ ┌► уже          advmod
┌─│ │ ┌─│ └───└─└─ протянула    
│ │ │ │ │     └──► руку         obj
│ │ │ │ │     ┌──► ,            punct
│ │ │ │ │     │ ┌► чтобы        mark
│ │ │ │ │   ┌►└─└─ погладить    advcl
│ │ │ │ │   │ └──► их           obj
│ │ │ │ └──►│      ,            punct
│ │ │ │     │ ┌──► они          nsubj
│ │ │ │     │ │ ┌► стремительно advmod
│ │ └─│     └─└─└─ сорвались    
│ │   │     │   ┌► с            case
│ └───│     └──►└─ места        obl
│     │         ┌► и            cc
│     └──────►┌─└─ исчезли      conj
│             │ ┌► в            case
│             └►└─ чаще         obl
└────────────────► .            punct


In [8]:
### find the heads of different words. if there is a preposition in the head,
# then expect it to be cased for that reason

sent_len = len(dependencies.tokens)
for i in range(sent_len):

    pos = morphologies.tokens[i].pos
    itok = morphologies.tokens[i]

    if 'Case' in itok.feats and pos in ['NOUN','PRON']:
        print(itok.text)
        
        # check to see if this token is the head of some adp
        adpfound = False
        adpset = set()
        for j in range(sent_len):

            jtok = morphologies.tokens[j]
            headid = int(dependencies.tokens[j].head_id)
            jhead = morphologies.tokens[headid - 1]
            
            if itok == jhead:
                # check if adp
                if jtok.pos == 'ADP':
                    print('PREPOSITION:',jtok.text, itok.text)
                    adpfound = True
                    adpset.add((i-j,jtok.text))

        if not adpfound:
            print('none found')
        print()

мы
none found

ним
none found

руку
none found

их
none found

они
none found

места
PREPOSITION: к места
PREPOSITION: с места



#### Grouping prepositions

In [10]:
ppgrouper = {
    'в' : 'в/во',
    'во' : 'в/во',
    'с' : 'с/со',
    'со' : 'с/со',
    'к' : 'к/ко',
    'ко' : 'к/ко',
    'без' : 'без/безо',
    'безо' : 'без/безо',
    'о' : 'о/об/обо',
    'об' : 'о/об/обо',
    'обо' : 'о/об/обо',
}

In [11]:
ppgrouper['во']

'в/во'

# Parse corpus

In [12]:
ct = 0

lemmacases = dict()

cases = ['Nom','Acc','Gen','Loc','Dat','Ins']
    
for cat in corpus:
    print('parsing',cat)
    for rec in corpus[cat]:
        # get sentence
        sent = rec.tokens
        sent_text = ' '.join([t.text for t in rec.tokens if t.text != None])

        # parse sentence
        chunk = []
        for sent in sentenize(sent_text):
            tokens = [_.text for _ in tokenize(sent.text)]
            chunk.append(tokens)
        if chunk == [[]]:
            continue
        # generate dependencies and morphology
        dependencies = next(syntax.map(chunk))
        morphologies = next(morph.map(chunk))
        
        # loop over sent tokens
        for i, morph_token in enumerate(morphologies.tokens):
            bad_morph_token = False
            
            if 'Case' in morph_token.feats and morph_token.pos in ['PRON','NOUN'] and 'Number' in morph_token.feats:
                # get lemma accurately from rnc source
                for rec_token in rec.tokens:
                    if rec_token.text == morph_token.text:
                        lemma = rec_token.lemma
                        if 'Case' in rec_token.feats:
                            thiscase = rec_token.feats['Case'].upper()
                        else:
                            bad_morph_token = True
                        break
                if bad_morph_token:
                    continue
                    
                # set up dictionary
#                 thiscase = morph_token.feats['Case']
#                 if thiscase in ['Par','Voc']:
#                     continue
            
                lemmacases.setdefault(lemma,dict())
                for case in cases:
                    uppercase = case.upper()
                    lemmacases[lemma].setdefault(uppercase,dict())
                    for number in ['Sing','Plur']:
                        lemmacases[lemma][uppercase].setdefault(number,dict())
                        # set up noprep
                        lemmacases[lemma][uppercase][number].setdefault('NO_PREPOSITION',dict())
                        lemmacases[lemma][uppercase][number]['NO_PREPOSITION'].setdefault('count',0)
                        lemmacases[lemma][uppercase][number]['NO_PREPOSITION'].setdefault('examples',[])
                        lemmacases[lemma][uppercase][number]['NO_PREPOSITION'].setdefault('form','')
                # search for ADPositions
                itok = morphologies.tokens[i]

                # check to see if this token is the head of some adp
                adpfound = False
                adpset = set()
                for j in range(len(morphologies.tokens)):

                    jtok = morphologies.tokens[j]
                    headid = int(dependencies.tokens[j].head_id)
                    if headid == 0:
                        jhead = 'ROOT'
                    else:
                        jhead = morphologies.tokens[headid - 1]

                    if itok == jhead:
                        # check if adp
                        if jtok.pos == 'ADP':
                            prepphrase = f'{jtok.text.lower()} + {thiscase.upper()}'
                            adpfound = True
                            adpset.add((i-j, jtok.text))

                if not adpfound:
                    prepphrase = 'NO_PREPOSITION'
                else:
                    mindist, prepphrase = list(adpset)[0]
                    for distance, adptoken in adpset:
                        if distance < mindist:
                            mindist = distance
                            prepphrase = adptoken
                    prepphrase = prepphrase.lower()
                    if prepphrase in ppgrouper:
                        prepphrase = ppgrouper[prepphrase]
                        
                qty = morph_token.feats['Number']
                
                lemmacases[lemma][thiscase][qty].setdefault(prepphrase, dict())
                lemmacases[lemma][thiscase][qty][prepphrase].setdefault('examples',[])
                lemmacases[lemma][thiscase][qty][prepphrase].setdefault('count',0)

                lemmacases[lemma][thiscase][qty][prepphrase]['form'] = itok.text
                lemmacases[lemma][thiscase][qty][prepphrase]['examples'].append(sent_text)
                lemmacases[lemma][thiscase][qty][prepphrase]['count'] += 1
                
        ct += 1
        
print(f'Parsed all {ct} records in abridged RNC.')

parsing blogs
parsing fiction
parsing public
parsing science
parsing speech
Parsed all 98891 records in abridged RNC.


In [13]:
uppercases = [case.upper() for case in cases]
uppercases

['NOM', 'ACC', 'GEN', 'LOC', 'DAT', 'INS']

In [14]:
keyword = 'облако'

for case in uppercases:
    print(case)
    for number in ['Sing','Plur']:
        for pp in lemmacases[keyword][case][number]:
            print(pp, number, lemmacases[keyword][case][number][pp]['form'], lemmacases[keyword][case][number][pp]['count'])
        print()
    print()

NOM
NO_PREPOSITION Sing облако 1

NO_PREPOSITION Plur облака 7


ACC
NO_PREPOSITION Sing  0

NO_PREPOSITION Plur облака 2


GEN
NO_PREPOSITION Sing облака 1
из Sing облака 1

NO_PREPOSITION Plur облаков 7
из Plur облаков 1
до Plur облаков 1


LOC
NO_PREPOSITION Sing  0
на Sing облаке 1
в/во Sing облацех 1

NO_PREPOSITION Plur  0
в/во Plur облаках 2


DAT
NO_PREPOSITION Sing  0

NO_PREPOSITION Plur  0


INS
NO_PREPOSITION Sing  0

NO_PREPOSITION Plur облаками 1




### Convert to JSON form for radar charts

In [22]:
# find case/pps which do not seem relevant to include
numcount = dict()
for lemma in lemmacases:
    numcount[lemma] = dict()
    for case in ['ACC','NOM','INS','DAT','LOC','GEN']:
        numcount[lemma][case] = dict()
        for pp in set(list(lemmacases[lemma][case]['Sing'].keys())+list(lemmacases[lemma][case]['Plur'].keys())):
            numcount[lemma][case][pp] = 0
            for number in ['Sing','Plur']:
                if pp in lemmacases[lemma][case][number]:
                    numcount[lemma][case][pp] += lemmacases[lemma][case][number][pp]['count']

In [34]:
jsonlist = []

totals = dict()

for lemma in lemmacases:
    thisdict = dict()
    thisdict['label'] = lemma
    
    thisdict['data'] = {}
    
    totals[lemma] = dict()
    
    basic = dict()
    detailed = dict()
    
    examplecount = 0
    
    for number in ['Sing','Plur']:
        totals[lemma][number] = dict()
        
        basic[number] = dict()
#         basic[number]['numlabel'] = lemmacases[lemma]['Nom'][number]['NO_PREPOSITION']['form']
        basic[number]['data'] = []
        
        detailed[number] = dict()
#         detailed[number]['numlabel'] = lemmacases[lemma]['Nom'][number]['NO_PREPOSITION']['form']
        detailed[number]['data'] = []
        
        for case in ['ACC','NOM','INS','DAT','LOC','GEN']:
            
            # get totals for simple chart
            totals[lemma][number][case] = dict()
            totals[lemma][number][case]['count'] = 0
            totals[lemma][number][case]['form'] = ''
            # loop over pps for totals
            for pp in lemmacases[lemma][case][number]:
                if numcount[lemma][case][pp] < 3:
                    continue
                totals[lemma][number][case]['count'] += lemmacases[lemma][case][number][pp]['count']
                totals[lemma][number][case]['form'] = lemmacases[lemma][case][number][pp]['form']
            # populate basic dict
            form = totals[lemma][number][case]['form']
            if form == 'ё-таки':
                continue
                
            basic[number]['data'].append({
                'axis' : case.upper(),
                'value' : totals[lemma][number][case]['count'],
                'form' : totals[lemma][number][case]['form'].lower()
            })
            
            # detailed dict
            for pp in set(list(lemmacases[lemma][case]['Sing'].keys())+list(lemmacases[lemma][case]['Plur'].keys())):
                # to REMOVE ANY PREPOSITIONS THAT HAVE LESS THAN 3 ENTRIES!
                if numcount[lemma][case][pp] < 3:
                    continue
                    
                if pp not in lemmacases[lemma][case][number]:
                    count = 0
                    form = ''
                else:
                    count = lemmacases[lemma][case][number][pp]['count']
                    form = lemmacases[lemma][case][number][pp]['form'].lower()
                    
                    # add to count
                    examplecount += lemmacases[lemma][case][number][pp]['count']
                
                axis = pp
                    
                if pp == 'ё-таки':
                    continue
                if pp == 'NO_PREPOSITION':
                    axis = case.upper()
                else:
                    axis = f'{axis} + {case}'
                
                
                detailed[number]['data'].append({
                    'axis' : axis,
                    'value' : count,
                    'form' : form
                })
            
    if examplecount < 10:
        continue
    thisdict['data']['basic'] = basic
    thisdict['data']['detailed'] = detailed
    
    jsonlist.append(thisdict)
print('Successful creation of JSON list.')

Successful creation of JSON list.


In [35]:
# for example
random.choice(jsonlist)

{'label': 'подружка',
 'data': {'basic': {'Sing': {'data': [{'axis': 'ACC', 'value': 0, 'form': ''},
     {'axis': 'NOM', 'value': 7, 'form': 'подружка'},
     {'axis': 'INS', 'value': 3, 'form': 'подружкой'},
     {'axis': 'DAT', 'value': 0, 'form': ''},
     {'axis': 'LOC', 'value': 0, 'form': ''},
     {'axis': 'GEN', 'value': 1, 'form': 'подружки'}]},
   'Plur': {'data': [{'axis': 'ACC', 'value': 0, 'form': ''},
     {'axis': 'NOM', 'value': 4, 'form': 'подружки'},
     {'axis': 'INS', 'value': 1, 'form': 'подружками'},
     {'axis': 'DAT', 'value': 0, 'form': ''},
     {'axis': 'LOC', 'value': 0, 'form': ''},
     {'axis': 'GEN', 'value': 3, 'form': 'подружек'}]}},
  'detailed': {'Sing': {'data': [{'axis': 'NOM',
      'value': 7,
      'form': 'подружка'},
     {'axis': 'с/со + INS', 'value': 3, 'form': 'подружкой'},
     {'axis': 'GEN', 'value': 1, 'form': 'подружки'}]},
   'Plur': {'data': [{'axis': 'NOM', 'value': 4, 'form': 'подружки'},
     {'axis': 'с/со + INS', 'value': 1,

### Write JSON to file

In [37]:
# make json file
import json 

filepath = 'caseradarcharts/1-11-22_case-radar-data.json'

with open(filepath, 'w', encoding='utf8') as json_file:
    json.dump(jsonlist, json_file, ensure_ascii=False)
    
print(filepath,'written successfully.')

caseradarcharts/1-11-22_case-radar-data.json written successfully.


### Separate JSON with examples

In [39]:
# lemma -> case -> number -> examples
# lemmacases[lemma][case][number][pp]['form']
examples = dict()

totalct = 0
for jsonrow in jsonlist:
    lemma = jsonrow['label']
# for lemma in lemmacases:
    examples.setdefault(lemma, dict())
    for case in uppercases:
        examples[lemma][case] = dict()
#         examples[lemma][case]['forms'] = set()
#         examples[lemma][case]['examples'] = dict()
        for pp in set(list(lemmacases[lemma][case]['Sing'].keys())+list(lemmacases[lemma][case]['Plur'].keys())):
            if numcount[lemma][case][pp] < 3:
                continue
            if pp not in lemmacases[lemma][case]['Sing']:
                singlabeled = []
            else:
                singexamples = lemmacases[lemma][case]['Sing'][pp]['examples']
                singlabeled = [ex + ' [Sing]' for ex in singexamples] 
            if pp not in lemmacases[lemma][case]['Plur']:
                plurlabeled = []
            else:
                plurexamples = lemmacases[lemma][case]['Plur'][pp]['examples']
                plurlabeled = [ex + ' [Plur]' for ex in plurexamples]
            random.shuffle(singlabeled)
            random.shuffle(plurlabeled)
            examples[lemma][case][pp] = singlabeled + plurlabeled
#             examples[lemma][case]['forms'].add(lemmacase[lemma][case]['Sing'][pp]['form'])
#             examples[lemma][case]['forms'].add(lemmacase[lemma][case]['Plur'][pp]['form'])

            totalct += len(singlabeled) + len(plurlabeled)

#                 examples[lemma][case][pp] = random.shuffle(examples[lemma][case][pp])

In [40]:
totalct

265964

In [42]:
# folder with sentences of each
for lemma in examples:
    filepath = 'caseradarcharts/1-11_examples2/'+lemma+'.json'

    with open(filepath, 'w', encoding='utf8') as json_file:
        json.dump(examples[lemma], json_file, ensure_ascii=False)