In [3]:
import pandas as pd
import random
import json
import csv
from tqdm import tqdm

In [4]:
df = pd.read_excel('../Excel_files/Full_Poem_Dataset_12-17.xlsx')
df = df.drop(columns=['Unnamed: 0'])
records = df.to_dict('records')
random.choice(records)

{'Text': 'москвичи по газону не ходят\nмосквичи под газоном лежат\nо копающий ямочку котик\nдай оттудова лапу пожать\nесли надо копать то конечно же\nвзяв лопату копай молчи\nв этой почве есть твёрдые нежные\nкирпичи\nчерепа\nмосквичи',
 'Author': 'Андрей Чемоданов',
 'Before or after': 'Before',
 'Source': 'essentialpoetry',
 'Date posted': datetime.datetime(2019, 11, 1, 0, 0),
 'UniqueIndex': 2800}

## Word Co-occurrence

### Initialize Spacy models

In [5]:
import spacy

In [5]:
# !python -m spacy download uk_core_news_lg

In [6]:
# !python -m spacy download ru_core_news_lg

In [6]:
nlp = spacy.load("ru_core_news_lg", disable=['attribute_ruler','ner'])

### Generate Docs

In [11]:

for rec in tqdm(records):
    recTxt = rec['Text']
    if not isinstance(recTxt,str):
        recTxt = str(recTxt)
    doc = nlp(recTxt)
    rec['spacydoc'] = doc

100%|███████████████████████████████████████| 3289/3289 [02:44<00:00, 19.98it/s]


### Run cooc parser

In [12]:
import itertools
import string

In [18]:
from spacy.lang.ru import stop_words
ru_stops = stop_words.STOP_WORDS

from spacy.lang.en import stop_words
en_stops = stop_words.STOP_WORDS

from spacy.lang.uk import stop_words
uk_stops = stop_words.STOP_WORDS

from spacy.lang.es import stop_words
es_stops = stop_words.STOP_WORDS

from spacy.lang.pl import stop_words
pl_stops = stop_words.STOP_WORDS

stopwords = list(ru_stops)+list(uk_stops)
foreign_stops = list(en_stops)+list(es_stops)+list(pl_stops)
short_stops = [s for s in stopwords if len(s) < 3]
short_ru_stops = [s for s in ru_stops if len(s) < 3]

In [14]:
def isLemma(txt):
    for p in string.punctuation+'—'+'–':
        if p in txt:
            return False
    if txt.isspace():
        return False
    if txt in foreign_stops:
        return False
    # if txt in short_stops:
    #     return False
    if txt in stopwords:
        return False
    return True

In [17]:
isLemma('говорить')

False

In [42]:
lens=set()
lenCts = dict()

networkJson = dict()
networkJson['nodes'] = []
networkJson['links'] = []

curatedNodes = set()
linkCounter = dict()
lemmaCounter = dict()

print('finding sufficient nodes...')

for rec in tqdm(records):
    recTxt = rec['Text']

    doc = rec['spacydoc']
    for t1 in doc:
        l1 = t1.lemma_.lower()
        lemmaCounter.setdefault(l1, 0)
        lemmaCounter[l1] += 1

# find those nodes which occur enough times
sufficientNodes = set()
for lemma in lemmaCounter:
    if lemmaCounter[lemma] >= 5 and isLemma(lemma):
        sufficientNodes.add(lemma.lower())


label2lines = dict()
label2recs = dict()
label2authors = dict()

print('parsing links...')
for rec in tqdm(records):
    doc = rec['spacydoc']
    newdoc = [t for t in doc if (t.text.isalpha() or t.text == '\n')]
    doclen = len(newdoc)
    # # line jumping
    allLines = []
    newLine = []
    for i, t in enumerate(newdoc):
        if t.text == '\n':
            allLines.append(newLine)
            newLine = []
        elif i == doclen-1:
            allLines.append(newLine)
        else:
            newLine.append(t)
    numLines = len(allLines)
    windowlength = 2

    labelFoundFromLine = dict()
    for i1 in range(numLines-windowlength):
        # all tokens in the desired range
        tokensInWindow = []
        excerptLines = []
        for line in allLines[i1:i1+windowlength]:
            excerptLines.append(' '.join([t.text for t in line]))
            for token in line:
                tokensInWindow.append(token)
        tiw = len(tokensInWindow)
        lens.add(tiw)
        lenCts.setdefault(tiw,0)
        lenCts[tiw] += 1
        if tiw > 50:
            continue

        for ic, (t1, t2) in enumerate(itertools.combinations(tokensInWindow,2)):
            l1 = t1.lemma_.lower()
            l2 = t2.lemma_.lower()
            if l1 != l2 and l1 in sufficientNodes and l2 in sufficientNodes:
                label = 'AND'.join(sorted([l1, l2]))

                ### label already found in line?
                repeatedInstance = False
                for line in excerptLines:
                    labelFoundFromLine.setdefault((label, line), False)
                    if labelFoundFromLine[(label, line)]:
                        repeatedInstance = True
                    else:
                        labelFoundFromLine[(label, line)] = True
                if repeatedInstance:
                    continue

                label2authors.setdefault(label, set())
                label2authors[label].add(rec['Author'])

                label2recs.setdefault(label, [])
                if rec not in label2recs[label]:
                    label2recs[label].append(rec)

                excerpt = '\n'.join(excerptLines)
                label2lines.setdefault(label, [])
                if excerpt not in [e['excerpt'] for e in label2lines[label]]:
                    # increase link Ct
                    linkCounter.setdefault(label, 0)
                    linkCounter[label] += 1
                    period = rec['Before or after']
                    if pd.isna(period):
                        period = ''
                    author = rec['Author']
                    if pd.isna(author):
                        author = 'Unknown'
                    label2lines[label].append({
                        'excerpt' : excerpt,
                        'author' : author,
                        'period' : period,
                        'uniqueIndex' : rec['UniqueIndex']
                    })

node2id = dict()
for i, node in enumerate(sufficientNodes):
    networkJson['nodes'].append({
        'id' : node,
        'totalinstances' : lemmaCounter[node]
    })
    node2id[node] = i

for label in linkCounter:
    source = label.split('AND')[0]
    target = label.split('AND')[1]
    if linkCounter[label] > 4:
        networkJson['links'].append({
            'source' : node2id[source],
            'target' : node2id[target],
            'sourceLemma' : source,
            'targetLemma' : target,
            'linkCt' : linkCounter[label],
            'authorCt' : list(label2authors[label])
        })
print(sorted([l['linkCt'] for l in networkJson['links']],reverse=True)[:10])
with open(f'wordnet_2_lines_all_stops_21.json', 'w') as json_file:
    json.dump(networkJson, json_file, ensure_ascii = False, indent=4)
with open(f'label2lines_21.json', 'w') as json_file:
    json.dump(label2lines, json_file, ensure_ascii=False, indent=4)

finding sufficient nodes...


100%|█████████████████████████████████████| 3289/3289 [00:00<00:00, 3388.17it/s]


parsing links...


100%|██████████████████████████████████████| 3289/3289 [00:12<00:00, 253.82it/s]


[72, 51, 47, 44, 43, 41, 40, 40, 40, 34]


In [43]:
sum(sorted([l[''] for l in networkJson['links']],reverse=True))/len(networkJson['links'])

7.4519522557151525

In [44]:
len(networkJson['links'])

4943

In [35]:
a = random.choice([l for l in networkJson['links'] if l['linkCt'] > 1])
print(a)
print()
for l in label2lines[a['sourceLemma']+'AND'+a['targetLemma']]:
    print(l['excerpt'])
    print()

{'source': 4502, 'target': 1881, 'sourceLemma': 'менять', 'targetLemma': 'форма', 'linkCt': 2, 'authorCt': ['Виктор Плавский', 'Василий Бородин']}

Видимо когда что то привычное
меняет свою форму

неразрывного одного объема
чуть меняющего форму когда в него бьют



In [36]:
label2lines[a['sourceLemma']+'AND'+a['targetLemma']]

[{'excerpt': 'Видимо когда что то привычное\nменяет свою форму',
  'author': 'Виктор Плавский',
  'period': 'After',
  'uniqueIndex': 172},
 {'excerpt': 'неразрывного одного объема\nчуть меняющего форму когда в него бьют',
  'author': 'Василий Бородин',
  'period': 'Before',
  'uniqueIndex': 2896}]