## Using the RNC to study the constructions that occur within fixed token windows around verbs

In [1]:
import random

In [2]:
from corus import load_morphoru_rnc

# Read the whole corpus into a list so it can be traversed repeatedly, then
# display how many records were loaded.
path = 'RNCgoldInUD_Morpho.conll'
records = load_morphoru_rnc(path)
rnccorpus = list(records)
len(rnccorpus)

98892

## build sentences

In [12]:
def build_sentence(tokens, upperlist=()):
    """Join tokens into one display string, upper-casing selected positions.

    Args:
        tokens: sequence of token objects exposing ``pos`` and ``text``.
        upperlist: indices into ``tokens`` whose text is upper-cased in the
            result. Defaults to no highlighting.

    Returns:
        The tokens concatenated in order. Every token is preceded by a single
        space except those tagged ``PUNCT``; surrounding whitespace is
        stripped from the final string.
    """
    highlighted = set(upperlist)
    pieces = []
    for i, token in enumerate(tokens):
        # A token without a surface form contributes nothing. This is decided
        # before .upper() so a highlighted token with text None cannot raise.
        text = token.text if token.text is not None else ''
        if i in highlighted:
            text = text.upper()
        space = '' if token.pos == 'PUNCT' else ' '
        pieces.append(space + text)
    return ''.join(pieces).strip()
# Spelling variants of the same preposition share one label, so that e.g.
# 'в' and 'во' are counted as a single construction.
adpdict = {
    variant: label
    for label, variants in (
        ('в/во', ('в', 'во')),
        ('с/со', ('с', 'со')),
        ('о/об', ('о', 'об', 'обо')),
    )
    for variant in variants
}
# Build one table per window offset from -3 to 3. For every VERB token, the
# token `window` positions away is inspected; when it yields a construction
# label, the label is counted under the verb's lemma and the sentence is kept
# as an example. windowList[k] holds the table for offset k - 3.
# The two commented-out lines are alternative inputs: a random sample of 1000
# sentences, or a single sentence.
# sample = random.sample(rnccorpus,1000)
# sample = rnccorpus[35557:35558]
sample = rnccorpus
windowList = []
for window in range(-3, 4):
    # dataDict: verb lemma -> {'counts': {label: n}, 'sentences': {label: [str]}}
    dataDict = dict()
    for sent in sample:
        # index of `token` in sent.tokens; advanced by hand at the end of the token loop
        currentindex = 0
        for token in sent.tokens:
            cxfound = False
            if token.pos == 'VERB':
                verblemma = token.lemma
                # every verb lemma gets an entry, even if no construction is found for it
                dataDict.setdefault(verblemma, dict())
                dataDict[verblemma].setdefault('counts',dict())
                dataDict[verblemma].setdefault('sentences',dict())
                
                adpindex = currentindex + window
                if adpindex >= 0 and adpindex < len(sent.tokens):
                    adptoken = sent.tokens[adpindex]
                    # NOTE(review): the index is moved past non-comma punctuation, but
                    # adptoken is not re-read, so every test below still looks at the
                    # punctuation token. casetoken IS re-read after its own skip further
                    # down - confirm whether the same was intended here.
                    if adptoken.text != ',' and adptoken.pos == 'PUNCT' and adpindex + 1 >= 0 and adpindex + 1 < len(sent.tokens):
                        adpindex = adpindex + 1
                    if adptoken.pos == 'ADP':
                        # preposition: label is '<preposition> + <CASE>', with the case read
                        # from the token after it (skipping at most one punctuation token)
                        adplemma = adptoken.lemma
                        # lemmatize prepositions
                        if adplemma in adpdict:
                            adplemma = adpdict[adplemma]
                        caseindex = adpindex + 1
                        if caseindex >= 0 and caseindex < len(sent.tokens):
                            casetoken = sent.tokens[caseindex]
                            if casetoken.pos == 'PUNCT' and caseindex + 1 >= 0 and caseindex + 1 < len(sent.tokens):
                                caseindex = caseindex + 1
                            casetoken = sent.tokens[caseindex]
                            try:
                                case = casetoken.feats['Case']
                                # the label uses 'Inst' where the feature value is 'Ins'
                                if case == 'Ins':
                                    case = 'Inst'
                                adpandcase = adplemma + ' + ' + case.upper()
                                cxfound = True    
                            except:
                                # any failure while reading the case (e.g. no 'Case' entry)
                                # means no construction; the bare except hides other errors too
                                cxfound = False
                    elif window == 1 and adptoken.text == ',':
                        # comma right after the verb followed by 'что' -> label ', что'
                        shtoindex = adpindex + 1
                        if shtoindex >= 0 and shtoindex < len(sent.tokens):
                            shtotoken = sent.tokens[shtoindex]
                            if shtotoken.text == 'что':
                                adpandcase = ', что'
                                caseindex = shtoindex
                                cxfound = True
                    else:
                        if window in [-1,1]:
                            # not an adposition directly before or after verb
                            try: 
                                # NOTE(review): when adpindex is 0 this guard is false, so a
                                # sentence-initial neighbour is never examined, and since
                                # nothing raises, the infinitive fallback is skipped as well.
                                if adpindex - 1 >= 0:
                                    # need to make sure prev token is not adp
                                    if sent.tokens[adpindex - 1].pos != 'ADP':
                                        case = adptoken.feats['Case']
                                        if case == 'Ins':
                                            case = 'Inst'
                                        # bare case label, e.g. 'NOM'
                                        adpandcase = case.upper()
                                        # no separate case token: reuse the verb's own index
                                        caseindex = currentindex
                                        cxfound = True    
                            except:
                                # reached only when the try body raised, e.g. the neighbour
                                # has no 'Case' entry
                                # check if infinitive, if so, set caseindex to current index
                                if adptoken.pos == 'VERB':
                                    if 'VerbForm' in adptoken.feats and (adptoken.feats['VerbForm'] == 'Inf'):
                                        adpandcase = 'INFINITIVE'
                                        caseindex = currentindex
                                        cxfound = True
                if cxfound:
                    dataDict[verblemma]['counts'].setdefault(adpandcase, 0)
                    dataDict[verblemma]['sentences'].setdefault(adpandcase, [])
                    dataDict[verblemma]['counts'][adpandcase] += 1
                    # example sentence with the verb, the window token and the case token upper-cased
                    formattedsent = build_sentence(sent.tokens, [currentindex, adpindex, caseindex])
                    # examples are de-duplicated; the count above is incremented regardless
                    if formattedsent not in dataDict[verblemma]['sentences'][adpandcase]:
                        dataDict[verblemma]['sentences'][adpandcase].append(formattedsent)
            currentindex += 1
    print(f'window size of {window} complete...')
    windowList.append(dataDict)

window size of -3 complete...
window size of -2 complete...
window size of -1 complete...
window size of 0 complete...
window size of 1 complete...
window size of 2 complete...
window size of 3 complete...


In [13]:
# Show the first three stored ', что' examples for 'знать'; index 4 is the
# table built for window offset +1.
windowList[4]['знать']['sentences'][', что'][:3]

['Я ЗНАЮ, ЧТО с этим делать',
 'а если нет- то хотя бы примерно ЗНАТЬ, ЧТО там',
 'В общем, я не ЗНАЮ, ЧТО там, и не знаю, будешь ли ты их кому то- то передавать дальше- но не говори о том что это сделал я, особенно моей группе.']

In [14]:
# Find the uncommon verbs: those whose construction hits, summed over every
# window table, total fewer than 5. They are only collected and counted here.
totalcount = dict()
for windowDict in windowList:
    for verb, entry in windowDict.items():
        totalcount[verb] = totalcount.get(verb, 0) + sum(entry['counts'].values())
uncommonVerbs = {verb for verb, total in totalcount.items() if total < 5}
print(len(uncommonVerbs))

5898


In [75]:
# Drop every uncommon verb from each window table. dict.pop with a default
# quietly skips verbs a given table does not contain, without a blanket
# `except` that would also hide unrelated errors or interrupts.
for windowDict in windowList:
    for verbtoremove in uncommonVerbs:
        windowDict.pop(verbtoremove, None)

In [76]:
# Number of verb lemmas present in the table for window offset -2 (index 1).
len(windowList[1])

3410

## Build the construction counts for CSV export

In [77]:
# Every construction label recorded for any verb in any window table.
allcxlabels = list({
    cxlabel
    for windowDict in windowList
    for entry in windowDict.values()
    for cxlabel in entry['counts']
})

In [78]:
# Keep only construction labels that appear in more than 10 (window, verb)
# entries. Each entry contributes 1, however often the label occurred in it.
# NOTE(review): the threshold is strict (`> 10`), so a label with exactly 10
# entries is dropped as well - confirm this is the intended cut-off.
totalcxcounts = dict()
for windowDict in windowList:
    for entry in windowDict.values():
        for cx in entry['counts']:
            totalcxcounts[cx] = totalcxcounts.get(cx, 0) + 1
greaterthan10cx = [cx for cx, ct in totalcxcounts.items() if ct > 10]

In [79]:
# One row per (window, verb): the verb, the window offset, then the count for
# each retained construction label (0 where the verb never took that label).
cxused = greaterthan10cx
rows = []
for windowSize, windowDict in enumerate(windowList, start=-3):
    for verb, entry in windowDict.items():
        countdict = entry['counts']
        rows.append([verb, windowSize] + [countdict.get(cxlabel, 0) for cxlabel in cxused])

In [80]:
# Spot-check: print one randomly chosen row.
r = random.choice(rows)
print(r)

['усвоить', 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, 0, 0, 0, 2]


In [81]:
# Header row: the two key columns, then one column per retained label.
fields = ['Verb', 'WindowSize', *greaterthan10cx]

In [62]:
import csv
# Write the header followed by all data rows.
filename = '8-16-21csvdata.csv'
# newline='' hands line-ending control to the csv module (otherwise extra
# blank lines can appear on some platforms); the encoding is stated explicitly
# because the rows contain Cyrillic verb lemmas.
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(rows)

In [64]:
# NOTE(review): a bare `pwd` presumably relies on IPython automagic (%pwd) to
# display the working directory; it is not valid as a plain Python script line.
pwd

'/Users/paigelee/Desktop/spring2021/clancy/verbhistograms'

In [82]:
# window offset -> verb -> label -> at most the first 10 example sentences.
sentenceDict = dict()
for windowSize, windowDict in enumerate(windowList, start=-3):
    sentenceDict[windowSize] = {
        verb: {cx: sents[:10] for cx, sents in entry['sentences'].items()}
        for verb, entry in windowDict.items()
    }

In [83]:
# Total number of stored example sentences across all tables, verbs and labels.
totalsents = sum(
    len(sents)
    for windowDict in windowList
    for entry in windowDict.values()
    for sents in entry['sentences'].values()
)
print(totalsents)

154787


In [84]:
# Re-key the example sentences by verb first:
# verb -> window offset -> label -> copy of the full example list.
verbDict = dict()
for windowSize, windowDict in enumerate(windowList, start=-3):
    for verb, entry in windowDict.items():
        perwindow = verbDict.setdefault(verb, dict())
        perwindow[windowSize] = {cx: list(sents) for cx, sents in entry['sentences'].items()}

In [85]:
import json
# One JSON file per verb under sentdata8/, with Cyrillic written as-is rather
# than \u-escaped.
for verb, perwindow in verbDict.items():
    with open(f'sentdata8/{verb}.json', mode='w', encoding='utf8') as jsonfile:
        json.dump(perwindow, jsonfile, ensure_ascii=False)

# get most common constructions

In [95]:
def build_sentence(tokens, upperlist=()):
    """Join tokens into one display string, upper-casing selected positions.

    Args:
        tokens: sequence of token objects exposing ``pos`` and ``text``.
        upperlist: indices into ``tokens`` whose text is upper-cased in the
            result. Defaults to no highlighting.

    Returns:
        The tokens concatenated in order. Every token is preceded by a single
        space except those tagged ``PUNCT``; surrounding whitespace is
        stripped from the final string.
    """
    highlighted = set(upperlist)
    pieces = []
    for i, token in enumerate(tokens):
        # A token without a surface form contributes nothing. This is decided
        # before .upper() so a highlighted token with text None cannot raise.
        text = token.text if token.text is not None else ''
        if i in highlighted:
            text = text.upper()
        space = '' if token.pos == 'PUNCT' else ' '
        pieces.append(space + text)
    return ''.join(pieces).strip()
# Spelling variants of the same preposition share one label, so that e.g.
# 'в' and 'во' are counted as a single construction.
adpdict = {
    variant: label
    for label, variants in (
        ('в/во', ('в', 'во')),
        ('с/со', ('с', 'со')),
        ('о/об', ('о', 'об', 'обо')),
    )
    for variant in variants
}
# Single pass over the corpus that, for every VERB token, scans window offsets
# -3..3 and fills three tables keyed by verb lemma:
#   wordDict    - neighbouring word form -> {'counts', 'sentences'}
#   orderedDict - ordered pattern string -> {'counts', 'sentences'}
#   commonDict  - single construction label -> {'counts', 'sentences'}
# The two commented-out lines are alternative inputs: a random sample of 1000
# sentences, or a single sentence.
# sample = random.sample(rnccorpus,1000)
# sample = rnccorpus[35557:35558]
sample = rnccorpus

orderedDict = dict()
wordDict = dict()
commonDict = dict()
for i, sent in enumerate(sample):
    # progress report every 10000 sentences
    if i % 10000 == 0:
        print(f'{i}/{len(sample)} sentences parsed...')
    # index of `token` in sent.tokens; advanced by hand at the end of the token loop
    currentindex = 0
    for token in sent.tokens:
        # for each verb found
        if token.pos == 'VERB':
            verblemma = token.lemma
            orderedDict.setdefault(verblemma, dict())
            wordDict.setdefault(verblemma, dict())
            commonDict.setdefault(verblemma, dict())
            # (label, adpindex, caseindex) triples found so far for this verb
            # token: all of them, those at negative offsets, and the rest
            allcxx = []
            before = []
            after = []
            for window in range(-3,4):
                cxfound = False
                adpindex = currentindex + window
                
                if adpindex >= 0 and adpindex < len(sent.tokens):
                    adptoken = sent.tokens[adpindex]
                    try:
                        # record the lower-cased neighbour word form (the verb itself,
                        # offset 0, is skipped) with an example sentence
                        if window != 0:
                            wordtoken = adptoken.text.lower()
                            wordDict[verblemma].setdefault(wordtoken, dict())
                            wordDict[verblemma][wordtoken].setdefault('counts',0)
                            wordDict[verblemma][wordtoken].setdefault('sentences',[])
                            wordDict[verblemma][wordtoken]['counts'] += 1
                            wordDict[verblemma][wordtoken]['sentences'].append(build_sentence(sent.tokens, [currentindex, adpindex]))
                    except:
                        # any failure while recording the neighbour is ignored
                        False
                    # NOTE(review): the index is moved past non-comma punctuation, but
                    # adptoken is not re-read, so the tests below still look at the
                    # punctuation token - confirm whether a re-fetch was intended.
                    if adptoken.text != ',' and adptoken.pos == 'PUNCT' and adpindex + 1 >= 0 and adpindex + 1 < len(sent.tokens):
                        adpindex = adpindex + 1
                    if adptoken.pos == 'ADP':
                        # preposition: label is '<preposition> + <CASE>', with the case read
                        # from the token after it (skipping at most one punctuation token)
                        adplemma = adptoken.lemma
                        # lemmatize prepositions
                        if adplemma in adpdict:
                            adplemma = adpdict[adplemma]
                        caseindex = adpindex + 1
                        if caseindex >= 0 and caseindex < len(sent.tokens):
                            casetoken = sent.tokens[caseindex]
                            if casetoken.pos == 'PUNCT' and caseindex + 1 >= 0 and caseindex + 1 < len(sent.tokens):
                                caseindex = caseindex + 1
                            casetoken = sent.tokens[caseindex]
                            try:
                                case = casetoken.feats['Case']
                                # the label uses 'Inst' where the feature value is 'Ins'
                                if case == 'Ins':
                                    case = 'Inst'
                                adpandcase = adplemma + ' + ' + case.upper()
                                cxfound = True    
                            except:
                                # any failure while reading the case means no construction
                                cxfound = False
                    elif window == 1 and adptoken.text == ',':
                        # comma right after the verb followed by 'что' -> label '[ , что ]'
                        shtoindex = adpindex + 1
                        if shtoindex >= 0 and shtoindex < len(sent.tokens):
                            shtotoken = sent.tokens[shtoindex]
                            if shtotoken.text == 'что':
                                adpandcase = '[ , что ]'
                                caseindex = shtoindex
                                cxfound = True
                    else:
                        if window in [-1,1]:
                            # not an adposition directly before or after verb
                            try: 
                                # NOTE(review): this guard is always true here (adpindex was
                                # range-checked above); when adpindex is 0 the lookup below
                                # reads tokens[-1], i.e. the LAST token of the sentence.
                                if adpindex >= 0:
                                    # need to make sure prev token is not adp
                                    if sent.tokens[adpindex - 1].pos != 'ADP':
                                        case = adptoken.feats['Case']
                                        if case == 'Ins':
                                            case = 'Inst'
                                        # bare case label, e.g. 'NOM'
                                        adpandcase = case.upper()
                                        # no separate case token: reuse the verb's own index
                                        caseindex = currentindex
                                        cxfound = True
                            except:
                                # reached only when the try body raised, e.g. the neighbour
                                # has no 'Case' entry
                                # check if infinitive, if so, set caseindex to current index
                                if adptoken.pos == 'VERB':
                                    if 'VerbForm' in adptoken.feats and (adptoken.feats['VerbForm'] == 'Inf'):
                                        adpandcase = 'INFINITIVE'
                                        caseindex = currentindex
                                        cxfound = True
                
                if cxfound:
                    allcxx.append((adpandcase,adpindex,caseindex))
                    if window < 0:
                        before.append((adpandcase, adpindex, caseindex))
                    else:
                        after.append((adpandcase, adpindex, caseindex))
                    # Rebuild the ordered pattern from everything found so far:
                    # each 'before' label followed by ' + ', then the verb lemma, then
                    # the 'after' labels (first joined with ' + ', later ones with ' / ').
                    # Labels containing '+' are wrapped in '[ ... ]'.
                    # The loops below rebind adpindex/caseindex (and the inner
                    # enumerate rebinds `i`); adpindex is recomputed at the next offset.
                    orderedcx = ''
                    allindices = [currentindex]
                    if before != []:
                        for cx, adpindex, caseindex in before:
                            if '+' in cx:
                                cx = '[ '+cx+' ]'
                            orderedcx = orderedcx + cx + ' + '
                            allindices.append(adpindex)
                            allindices.append(caseindex)
                    orderedcx = orderedcx + verblemma
                    if after != []:
                        for i, (cx, adpindex, caseindex) in enumerate(after):
                            if '+' in cx:
                                cx = '[ '+cx+' ]'
                            if i == 0:
                                orderedcx = orderedcx + ' + ' + cx
                            else:
                                orderedcx = orderedcx + ' / ' + cx
                            allindices.append(adpindex)
                            allindices.append(caseindex)

                    # NOTE(review): this runs once per construction found, so every
                    # intermediate (partial) pattern is counted too - confirm intended.
                    orderedDict[verblemma].setdefault(orderedcx, dict())
                    orderedDict[verblemma][orderedcx].setdefault('counts',0)
                    orderedDict[verblemma][orderedcx].setdefault('sentences',set())
                    orderedDict[verblemma][orderedcx]['counts'] += 1
                    orderedDict[verblemma][orderedcx]['sentences'].add(build_sentence(sent.tokens, allindices))
                
                    # NOTE(review): the whole of allcxx is walked every time a new
                    # construction is found, so labels found at earlier offsets are
                    # counted again on each later hit - confirm intended.
                    for (cx, adpindex, caseindex) in allcxx:
                        if '+' in cx:
                            cx = '[ '+cx+' ]'
                        commonDict[verblemma].setdefault(cx, dict())
                        commonDict[verblemma][cx].setdefault('counts',0)
                        commonDict[verblemma][cx].setdefault('sentences',set())
                        commonDict[verblemma][cx]['counts'] += 1
                        commonDict[verblemma][cx]['sentences'].add(build_sentence(sent.tokens, [currentindex, adpindex, caseindex]))
        currentindex += 1

0/98892 sentences parsed...
10000/98892 sentences parsed...
20000/98892 sentences parsed...
30000/98892 sentences parsed...
40000/98892 sentences parsed...
50000/98892 sentences parsed...
60000/98892 sentences parsed...
70000/98892 sentences parsed...
80000/98892 sentences parsed...
90000/98892 sentences parsed...


In [66]:
import nltk
# Make sure the NLTK 'stopwords' package is available locally.
nltk.download("stopwords")
from nltk.corpus import stopwords
# Russian stopword list; used further down to filter the related-word lists.
russian_stopwords = stopwords.words("russian")
print(russian_stopwords[:5])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/paigelee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [103]:
# Summarise every verb that is not in uncommonVerbs:
#   'aggregate'    - the lemma combined with its most frequent construction labels
#   'relatedwords' - up to 20 most frequent neighbouring word forms
#   'separated'    - the three most frequent ordered patterns
jsonDict = dict()
for verb, verbcxx in commonDict.items():
    if verb in uncommonVerbs:
        continue
    summary = jsonDict.setdefault(verb, dict())

    # Aggregate label: start from the lemma and attach each of the top three
    # labels seen at least 5 times; NOM goes in front, everything else behind.
    sortedlst = sorted(
        ((info['counts'], cx) for cx, info in verbcxx.items()),
        reverse=True,
    )
    newcx = verb
    for ct, cx in sortedlst[:3]:
        if ct >= 5:
            if cx == 'NOM':
                newcx = cx + ' + ' + newcx
            else:
                newcx = newcx + ' + ' + cx
    # Label still has no 'NOM' in it: prepend NOM when it ranks in the top five.
    if 'NOM' not in newcx and any(cx == 'NOM' for ct, cx in sortedlst[:5]):
        newcx = 'NOM + ' + newcx
    summary['aggregate'] = newcx

    # Related words: alphabetic, non-stopword neighbours seen more than 4 times.
    wordlst = [
        (info['counts'], word)
        for word, info in wordDict[verb].items()
        if word.isalpha() and word not in russian_stopwords and info['counts'] > 4
    ]
    sortedwordlst = sorted(wordlst, reverse=True)
    summary['relatedwords'] = sortedwordlst[:20]

    # Ordered patterns seen at least 5 times, leaving out the plain
    # 'NOM + <verb>' pattern.
    plainpattern = 'NOM + ' + verb
    cxlst = [
        (info['counts'], cx)
        for cx, info in orderedDict[verb].items()
        if cx != plainpattern and info['counts'] >= 5
    ]
    sortedcxlst = sorted(cxlst, reverse=True)
    # NOTE(review): cxxlist is built but nothing visible here reads it; the
    # unfiltered list is what gets stored below.
    cxxlist = [pair for pair in sortedcxlst if ' ' in pair[1]]
    summary['separated'] = sortedcxlst[:3]

jsonList = [jsonDict]

In [106]:
import json
# Save the summaries as a one-element JSON array, with Cyrillic written as-is
# rather than \u-escaped.
outname = 'common_constructions_8-17.json'
with open(outname, mode='w', encoding='utf8') as jsonfile:
    json.dump(jsonList, jsonfile, ensure_ascii=False)