In [None]:
import random
import numpy as np
import pandas as pd
import time
import re
import datetime
import os
# import matplotlib.pyplot as plt
# import gensim
# import spacy

In [None]:
import spacy
# One-time setup if the package or the Russian model is missing:
# pip install -U spacy
# !python -m spacy download ru_core_news_lg
# Load the 'ru_core_news_lg' pipeline; later cells call it as `nlp`.
nlp = spacy.load('ru_core_news_lg')

In [None]:
# Load the poem dataset and remove the stray index column written by a previous export.
df = pd.read_excel('Excel_files/Full_Poem_Dataset_9-30_0.xlsx')
print(len(df))
df = df.drop('Unnamed: 0', axis=1)
# drop_duplicates returns a new frame; without the assignment the call is a no-op
# and the two printed lengths are always identical.
df = df.drop_duplicates(subset=['Text']).reset_index(drop=True)
print(len(df))
# List of per-poem dicts; later cells attach parsed spaCy docs to these records.
records = df.to_dict('records')

df.head()

In [None]:
# Count how many rows carry each value of the 'Before or after' column.
df['Before or after'].value_counts()

In [None]:
# Rows per 'Source' among the poems labelled 'Before'.
df.loc[df['Before or after'] == 'Before', 'Source'].value_counts()

In [None]:
# Rows per 'Source' among the poems labelled 'After'.
df.loc[df['Before or after'] == 'After', 'Source'].value_counts()

In [None]:
# ### Print some key authors before and after
# with open('херсонскийbefore.txt','w') as f:
#     for b in df[df['Author'] == 'Борис Херсонский'][df['Before or after'] == 'Before'].sample(10).to_dict('records'):
#         f.write('New Poem from '+ b['Source']+'\n\n'+b['Text']+'\n\n')

## Run spaCy

In [None]:
%%time
# Parse every poem line by line with spaCy and store the resulting Docs on the
# record under 'docLines'. Records that cannot be parsed are removed afterwards.
badIdxs = []
for i, rec in enumerate(records):
    if i % 100 == 0:
        print(f'{i}/{len(records)} parsed by Spacy.')
    try:
        rec['docLines'] = [nlp(line) for line in rec['Text'].split('\n')]
        # rec['doc'] = nlp(rec['Text'])
    except Exception:
        # Best-effort: skip records whose text cannot be split/parsed
        # (presumably non-string 'Text' cells). `Exception` instead of a bare
        # `except` so that interrupting the kernel still stops the loop.
        badIdxs.append(i)

# Pop from the end backwards: removing in ascending order would shift every
# later index by one and delete the wrong records.
for i in reversed(badIdxs):
    records.pop(i)

In [None]:
# how many tokens?
# rec['docLines'] holds one spaCy Doc per line of the poem, so the token count
# of a poem is the sum of the Doc lengths (len(rec['docLines']) alone would be
# the number of lines, not tokens).
bTotalTokens = 0
aTotalTokens = 0
for rec in records:
    tokenCount = sum(len(lineDoc) for lineDoc in rec['docLines'])
    if rec['Before or after'] == 'Before':
        bTotalTokens += tokenCount
    else:
        aTotalTokens += tokenCount
bTotalTokens, aTotalTokens

## Text cleaning

In [None]:
def initializeRec(recText):
    """Return the whitespace-stripped text, or False when the value is not a string."""
    return recText.strip() if isinstance(recText, str) else False

def skipLine(line, idx):
    """Decide whether a line of a poem should be dropped during cleaning.

    Parameters
    ----------
    line : str
        One line of the poem text.
    idx : int
        Position of the line within the poem. Currently unused; kept so
        existing callers keep working.

    Returns
    -------
    bool
        True if the line is a hashtag line, an attribution line, a decorative
        separator, or contains no alphabetic character. Blank lines return
        False (they are kept).
    """
    stripped = line.strip()
    if not stripped:
        return False

    # throw hashtag line
    if stripped[0] == '#':
        return True

    # throw attribution line
    lowered = line.lower()
    for attr in ['из личного', 'личный блог', 'источник:', 'авторский блог']:
        if attr in lowered:
            return True

    # Decorative separator at the beginning of the line (e.g. "* * *", "___").
    # Anchored with re.match: an unanchored search also discarded ordinary
    # lines that merely contain "+ " or "_ " somewhere in the middle.
    # The character set is the one the old pattern effectively matched
    # ('^-_' inside the class was a range covering only '^' and '_').
    if re.match(r"[*+^_][*+^_= ]+", stripped):
        return True

    # is none of it alphabetic
    if not any(char.isalpha() for char in line):
        return True

    return False


def processRec(rec):
    """Return the cleaned text of a record.

    The record's 'Text' is normalised with initializeRec, split into lines,
    and every line for which skipLine returns True is dropped. The remaining
    lines are joined with newlines. A missing/non-string/empty text yields ''.
    """
    stripped = initializeRec(rec['Text'])
    if not stripped:
        return ''
    kept = [
        ln
        for pos, ln in enumerate(stripped.split('\n'))
        if not skipLine(ln, pos)
    ]
    return '\n'.join(kept)

## Testing out the NLP capabilities

In [None]:
# NER
# Run spaCy NER over 10 randomly sampled, cleaned poems and tally, per entity
# label, how often each entity lemma occurs and in which texts it was found.
NERcounter = {'counts' : dict(), 'texts' : dict()}
for rec in random.sample(records, 10):
    text = processRec(rec)
    # Lines are joined with '; ' so the poem is parsed as one document.
    doc = nlp(text.replace('\n', '; '))
    for ent in doc.ents:
        lemma = ent.lemma_
        entType = ent.label_
        typeCounts = NERcounter['counts'].setdefault(entType, dict())
        typeTexts = NERcounter['texts'].setdefault(entType, dict())
        typeCounts[lemma] = typeCounts.get(lemma, 0) + 1
        typeTexts.setdefault(lemma, []).append(text)

In [None]:
# Display the per-entity-label lemma counts gathered by the NER cell above.
NERcounter['counts']

In [None]:
# proper nouns?
# Check which tokens spaCy still tags as PROPN once the text is lower-cased,
# and tally those proper-noun lemmas together with the texts they came from.
PNcounter = {'counts' : dict(), 'texts' : dict()}
for rec in random.sample(records, 10):
    text = processRec(rec)
    doc = nlp(text.replace('\n', '; ').lower())
    for token in doc:
        pos = token.pos_
        lemma = token.lemma_
        # Only proper nouns are counted; previously the tally ran for every
        # token because it sat outside this branch.
        if pos == 'PROPN':
            print(token, lemma, pos)
            PNcounter['counts'][lemma] = PNcounter['counts'].get(lemma, 0) + 1
            PNcounter['texts'].setdefault(lemma, []).append(text)

## Train new NER dataset

In [None]:
# !pip3 install nerus

### Load NERUS corpus

In [None]:
from nerus import load_nerus

In [None]:
# Path to the compressed NERUS corpus file, resolved relative to the working directory.
NERUS = 'nerus_lenta.conllu.gz'
docs = load_nerus(NERUS)
# Peek at one document. NOTE: next() advances the iterator, so any later
# `for doc in docs` loop over this same object starts after this document.
doc = next(docs)

In [None]:
%time
trainingSamples = []
i = 0
for doc in docs:
    if i % 10000 == 0:
        print(f'{i} docs parsed.')
    for sent in doc.sents:
        text = sent.text
        entityDict = dict()
        entityDict['entities'] = []
        for span in sent.ner.spans:
            entityDict['entities'].append((span.start, span.stop, span.type))

        tokens = text.split(' ')
        newTokens = []
        for t in tokens:
            if t.isupper() and len(t) > 1:
                newCase = t
            else:
                newCase = t.lower()
            newTokens.append(newCase)
        newText = ' '.join(newTokens)

        datum = (newText, entityDict)
        trainingSamples.append(datum)
    i += 1

### Train spaCy pipeline

In [None]:
# Open question: how to clean the text in such a way that named entities are still detected?
# Fetch the pipeline's existing "ner" component; labels are added to it below.
ner=nlp.get_pipe("ner")

In [None]:
# Spot-check one randomly chosen (text, {'entities': [...]}) training sample.
random.choice(trainingSamples)

In [None]:
# Expected format of each sample:
#     ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]})
# TRAIN_DATA was never assigned, so this cell failed with NameError on a fresh
# kernel; use the samples built from NERUS. A shallow copy is taken so the
# in-place shuffling during training does not reorder trainingSamples.
TRAIN_DATA = list(trainingSamples)

# Register every entity label that occurs in the training data (each one once).
trainLabels = {
    ent[2]
    for _, annotations in TRAIN_DATA
    for ent in annotations.get("entities", [])
}
for label in sorted(trainLabels):
    ner.add_label(label)

In [None]:
# Component names excluded from `unaffected_pipes`; every other component of
# the pipeline ends up in that list, in pipeline order.
pipe_exceptions = [
    "ner",
    "trf_wordpiecer",
    "trf_tok2vec",
]
unaffected_pipes = [
    name
    for name in nlp.pipe_names
    if name not in pipe_exceptions
]

In [None]:
# Import requirements
import random
from pathlib import Path

from spacy.training import Example
from spacy.util import minibatch, compounding

N_ITERATIONS = 30  # passes over the training data
DROPOUT = 0.5      # dropout - make it harder to memorise data

# TRAINING THE MODEL
# Only the components listed in pipe_exceptions stay enabled while updating.
with nlp.select_pipes(disable=unaffected_pipes):
    for iteration in range(N_ITERATIONS):
        # shuffle the examples before every iteration
        random.shuffle(TRAIN_DATA)
        losses = dict()
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # spaCy v3 (required by the ru_core_news_lg model) updates from
            # Example objects; the v2 call nlp.update(texts, annotations, ...)
            # is rejected.
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(
                examples,
                drop=DROPOUT,
                losses=losses,
            )
        # `losses` accumulates over the iteration, so report it once per pass
        # instead of once per batch.
        print("Losses", losses)

In [None]:
# Quick check of the updated pipeline on a sample sentence.
doc = nlp("I was driving a Alto")
entities = [(ent.text, ent.label_) for ent in doc.ents]
print("Entities", entities)

### Save model

In [None]:
# Write the pipeline to disk.
# NOTE(review): '/content/' is an absolute path — confirm it exists in the
# environment where this notebook is run.
output_dir = Path('/content/')
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

# Reload the pipeline from the same directory and run it on a sample sentence
# to confirm the saved model can be loaded and used.
print("Loading from", output_dir)
nlp_updated = spacy.load(output_dir)
doc = nlp_updated("Fridge can be ordered in FlipKart" )
reloadedEntities = [(ent.text, ent.label_) for ent in doc.ents]
print("Entities", reloadedEntities)