In [1]:
import json
import pandas as pd
from pymystem3 import Mystem

# !pip3 install spacy 
# !python -m spacy download ru_core_news_sm
import spacy
import re
from string import punctuation
import random

In [2]:
# Single shared Mystem instance; lemm_phrase below calls its lemmatize() method.
mystem = Mystem()
def lemm_phrase(x):
    """Lemmatize a phrase with the shared ``mystem`` analyzer.

    The last element returned by ``mystem.lemmatize`` is always dropped and
    whitespace-only tokens are skipped. The remaining tokens are joined with
    single spaces exactly as returned (they are not stripped themselves).
    """
    lemmas = mystem.lemmatize(x)
    kept = []
    for lemma in lemmas[:-1]:
        # Skip tokens that consist only of whitespace (or are empty).
        if lemma.strip():
            kept.append(lemma)
    return " ".join(kept)

# Work with generated data

In [None]:
df = pd.read_csv("dataset/generated_data.csv")

In [None]:
# Quotes, dots, commas, slashes, hyphens and parentheses are deleted from every phrase.
pattern = re.compile(r'"|\'|\.|,|\\|\/|-|\(|\)|,')

# new_text is derived from the lower-cased original; text keeps its original casing.
df['new_text'] = df['text'].apply(lambda phrase: pattern.sub('', phrase.lower()).strip())
df['text'] = df['text'].apply(lambda phrase: pattern.sub('', phrase).strip())

In [None]:
# Replace each cleaned phrase with its lemmatized form.
df['new_text'] = df['new_text'].apply(lemm_phrase)

In [None]:
df

In [None]:
# df.to_csv('dataset/final_data_lower.csv', index=False)

# Read New Dataset

In [None]:
df = pd.read_csv("dataset/final_data_lower.csv")

In [None]:
# Surround every colon with spaces in both text columns.
pattern = re.compile(r':')
for column in ('text', 'new_text'):
    df[column] = df[column].apply(lambda phrase: pattern.sub(' : ', phrase).strip())

In [None]:
# Collapse every run of whitespace into a single space (split() never yields empty strings).
for column in ('new_text', 'text'):
    df[column] = df[column].apply(lambda phrase: ' '.join(phrase.split()))

In [None]:
df

# Use spacy

In [None]:
from spacy.lang.ru import Russian
from spacy.pipeline import EntityRuler

In [None]:
# Trigger phrases that announce the kind of task. They are passed to
# create_training_data together with tags['Task'] to build entity-ruler patterns.
# NOTE(review): the phrases look like lemmatized forms (to match the lemmatized
# `new_text` column) — confirm.
tasks = [
    'создавать задача',
    'создавать заметка',
    'создавать задание',
    'создавать поручение',
    'напоминать',
    'запланировать встреча',
    'назначать обязанность',
    'ставить задача',
    'задавать задача',
    'давать поручение',
    'давать задача',
    'добавлять задача',
    'поставлять задача'
]

# Maps a readable entity name to the label string stored in the patterns.
# NOTE(review): only "Task", "Person" and "Time" are looked up in the visible
# cells; "ToDo" and "Garbage" are not referenced here.
tags = {
    "Task": 'task_type',
    "ToDo": 'todo',
    "Person": 'person',
    "Time": 'time',
    "Garbage": 'garbage'
}

In [None]:
def create_training_data(data, type):
    """Turn an iterable of phrases into entity-ruler pattern dicts.

    Every item of ``data`` becomes ``{"label": type, "pattern": item}``;
    the resulting list keeps the order of ``data``.
    """
    return [{"label": type, "pattern": entry} for entry in data]

In [None]:
# Blank Russian pipeline that will hold the rule-based entity ruler.
nlp = Russian()
# add_pipe creates the ruler and returns the instance that is registered in the
# pipeline, so no separately constructed EntityRuler object is needed.
ruler = nlp.add_pipe("entity_ruler")
def generate_rules(patterns, name):
    """Register ``patterns`` with the shared entity ruler.

    ``patterns`` is a list of pattern dicts such as those produced by
    create_training_data. ``name`` is currently unused; it is only referenced
    by the commented-out ``nlp.to_disk(name)`` call below.
    """
    ruler.add_patterns(patterns)
    # nlp.to_disk(name)

In [None]:
def test_model(model, text):
    doc = model(text)
    results = []
    entities = []

    for ent in doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))
    if len(entities) > 0:
        results = [text, {"entities": entities}]
        return results
"""
TRAIN DATA FOR SPACY = [
        (   text, 
            {"entities": 
                [(start, end, label), ...]
            }
        )
    ]
"""

In [None]:
def get_train_type(model, x, arr):
    """Append the annotated sample for ``x`` to ``arr`` if any entity was found.

    ``test_model(model, x)`` is called; when it returns something other than
    ``None`` that value is appended to ``arr`` (mutated in place). Nothing is
    returned.
    """
    results = test_model(model, x)
    # Identity check: None is a singleton, and `!=` could be overridden by the
    # compared object.
    if results is not None:
        arr.append(results)

# Выделение типа задачи

In [None]:
task_patterns = create_training_data(tasks, tags['Task'])
generate_rules(patterns=task_patterns, name='task_types')

# Выделение имен

In [None]:
# df_fem = pd.read_csv('dataset/female_names.csv')
# df_male = pd.read_csv('dataset/male_names.csv')
# df_names = df_fem.append(df_male)
# df_names.to_csv('dataset/names.csv', index=False)

In [None]:

# df_names['name'] = df_names['name'].apply(lambda x: x.lower())
# df_names['name'] = df_names['name'].apply(lambda x: lemm_phrase(x))

In [None]:
df_names = pd.read_csv('dataset/names.csv')
person_patterns = create_training_data(list(df_names['name']), tags['Person'])
generate_rules(patterns=person_patterns, name='person')

# Выделение времени

In [None]:
df_time = pd.read_csv('dataset/with_months.csv')

In [None]:
# df_time.to_csv('dataset/time.csv', index=False)

In [None]:
# df_time['time'] = df_time['time'].apply(lambda x: x.lower())
# df_time['lower_time'] = df_time['time'].apply(lambda x: lemm_phrase(x))

In [None]:
# Build patterns labelled tags['Time'] from the 'lower_time' column and add them to the ruler.
# NOTE(review): in the visible cells 'lower_time' is only created by the
# commented-out lemmatization code above, so dataset/with_months.csv presumably
# already contains that column — confirm.
time_patterns = create_training_data(list(df_time['lower_time']), tags['Time'])
generate_rules(patterns=time_patterns, name='time')

# Model with all entities

In [None]:
nlp.to_disk('final_all_entities')

# Train Data

In [None]:
# Run the rule-based pipeline over every lemmatized phrase and keep the samples
# in which at least one entity was matched.
TRAIN_DATA = []
for phrase in df['new_text']:
    # get_train_type works purely by appending to TRAIN_DATA, so a plain loop is
    # used; Series.apply would build (and display) a throwaway Series of None.
    get_train_type(nlp, phrase, TRAIN_DATA)

In [None]:
from spacy.training.example import Example

In [None]:
def train(data, epochs=30):
    """Train a blank Russian spaCy pipeline containing a single NER component.

    Parameters
    ----------
    data : sequence of (text, annotations) pairs
        ``annotations`` is a dict whose "entities" entry is a list of
        ``(start_char, end_char, label)`` tuples.
    epochs : int
        Number of passes over the training data.

    Returns
    -------
    The trained ``nlp`` object.
    """
    # Shallow copy so that shuffling below does not reorder the caller's list.
    train_data = list(data)

    nlp = spacy.blank("ru")
    # add_pipe returns the component instance that actually lives in the
    # pipeline; labels have to be registered on that instance, not on a
    # separately created one.
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")

    for _, annotations in train_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])  # ent is (start, end, label), e.g. label "person"

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

    # Only the NER component is trained; every other pipe is disabled meanwhile.
    with nlp.select_pipes(disable=other_pipes):
        optimizer = nlp.initialize()
        for itr in range(epochs):
            print("Start Epoch - " + str(itr))
            random.shuffle(train_data)
            losses = {}
            for batch in spacy.util.minibatch(train_data, size=2):
                # One update per minibatch instead of one per example.
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(
                    examples,
                    drop=0.2,
                    sgd=optimizer,
                    losses=losses
                )
            print(losses)
    return nlp


In [None]:
# Train the NER model on the collected samples and save it to 'final_2'.
# Note: this rebinds `nlp`, which until here referred to the rule-based
# entity-ruler pipeline.
nlp = train(TRAIN_DATA)
nlp.to_disk('final_2')

# Use the saved final model (without retraining)

In [3]:
def clean_str(phrase):
    """Normalize a raw phrase the same way the training text was prepared.

    The phrase is lower-cased, quotes / dots / commas / slashes / hyphens /
    parentheses are deleted, surrounding whitespace is trimmed and the result
    is passed through lemm_phrase.
    """
    punctuation_re = re.compile(r'"|\'|\.|,|\\|\/|-|\(|\)|,')
    lowered = phrase.lower()
    stripped = punctuation_re.sub('', lowered).strip()
    return lemm_phrase(stripped)

In [4]:
# nlp.to_disk('result_final_all')
nlp = spacy.load('final_2')

In [5]:
# Present the recognition result in a readable form
def get_results(text):
    """Extract labelled entities and the remaining task text from ``text``.

    The phrase is cleaned with clean_str and run through the global ``nlp``
    model. Each word of every detected entity is mapped back to the original
    words whose cleaned counterpart contains it (substring match).

    Returns a tuple ``(entities, task)``: ``entities`` maps each label to the
    matching original words (several entities with the same label are joined
    with ", "), and ``task`` is the space-joined original words that were not
    attributed to any entity.
    """
    # Prepare the phrase and pair every original word with its cleaned form.
    cleaned = clean_str(text)
    pairs = list(zip(text.split(), cleaned.replace(' : ', ':').split()))

    labelled = []
    used_positions = []
    for ent in nlp(cleaned).ents:
        originals = []
        for word in cleaned[ent.start_char:ent.end_char].split():
            for position, (original, lemma) in enumerate(pairs):
                if word in lemma:
                    used_positions.append(position)
                    originals.append(original)
        labelled.append((' '.join(originals), ent.label_))

    # Whatever was not tagged is the "task name", i.e. the assignment itself.
    task = ' '.join(
        original
        for position, (original, _) in enumerate(pairs)
        if position not in used_positions
    )

    # Merge the entities that share a label.
    merged = {}
    for phrase, label in sorted(labelled, key=lambda item: item[1]):
        if label in merged:
            merged[label] = merged[label] + ', ' + phrase
        else:
            merged[label] = phrase
    return merged, task

In [6]:
texts = [
    'Поставь задачу: приготовить ужин в 7 вечера на Александра и Дмитрия',
    'Напомни выгулять собаку 19 декабря',
    'Создай задачу на Екатерину провести планирование в 5 часов'
    ]

for text in texts:
    print(get_results(text))

({'person': 'Александра, Дмитрия', 'task_type': 'Поставь задачу:', 'time': '7, вечера'}, 'приготовить ужин в на и')
({'task_type': 'Напомни', 'time': '19, декабря'}, 'выгулять собаку')
({'person': 'Екатерину', 'task_type': 'Создай задачу', 'time': '5, часов'}, 'на провести планирование в')
