In [7]:
!pip install transformers torch

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "dylandilu/EventExtraction-BART-large"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def extract_events(text):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

text = """
A magnitude 6.2 earthquake struck California on Monday,
causing several buildings to collapse and injuring dozens.
"""

print(extract_events(text))


^C


KeyboardInterrupt: 

In [None]:
from transformers import pipeline

qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

text = """
The hurricane destroyed several houses near the coast in Florida.
Many families were evacuated by emergency services.
"""

questions = [
    "What event happened?",
    "Who caused the event?",
    "What was destroyed?",
    "Where did the event happen?",
    "Who was affected?",
]

for q in questions:
    print(q, "->", qa(question=q, context=text)["answer"])


In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

matcher = Matcher(nlp.vocab)

# Пример шаблона: глагол + объект = событие
event_pattern = [
    {"POS": "VERB"},
    {"POS": "NOUN", "OP": "?"},
    {"POS": "PROPN", "OP": "?"}
]

matcher.add("EVENT_PATTERN", [event_pattern])

def extract_rule_based(text):
    doc = nlp(text)
    matches = matcher(doc)

    events = []
    for match_id, start, end in matches:
        span = doc[start:end]
        events.append(span.text)
    return events

text = "An explosion damaged three cars near the central station."

print(extract_rule_based(text))


In [None]:
!pip install transformers datasets seqeval

from datasets import load_dataset
dataset = load_dataset("maven")

from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from seqeval.metrics import classification_report

labels = dataset["train"].features["labels"].feature.names
id2label = {i: l for i, l in enumerate(labels)}
label2id = {l: i for i, l in enumerate(labels)}

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(example):
    tokenized = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, word_id in enumerate(tokenized.word_ids()):
        if word_id is None:
            labels.append(-100)
        else:
            labels.append(example["labels"][word_id])
    tokenized["labels"] = labels
    return tokenized

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased", num_labels=len(labels), id2label=id2label, label2id=label2id
)

args = TrainingArguments(
    "event-extraction",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

trainer.train()


In [10]:

from transformers import pipeline

qa_ru = pipeline("question-answering", model="DeepPavlov/rubert-base-cased")

text = """
В результате взрыва на заводе пострадали пять человек.
Пожарные и спасатели прибыли на место происшествия.
"""

questions = [
    "Что произошло?",
    "Где произошло событие?",
    "Кто пострадал?",
    "Кто участвовал в ликвидации?",
]

for q in questions:
    print(q, "->", qa_ru(question=q, context=text)["answer"])


config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


Что произошло? -> Пожарные и спасатели прибыли на место
Где произошло событие? -> Пожарные и спасатели прибыли на место
Кто пострадал? -> Пожарные и спасатели прибыли на место
Кто участвовал в ликвидации? -> Пожарные и спасатели прибыли на место
