# Снятие омонимии с использованием CRF

## Импорты

In [None]:
#!pip install datasets

In [None]:
#!pip install sklearn-crfsuite

In [None]:
from sklearn_crfsuite import CRF
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from tqdm import tqdm


import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import os
import time
import tqdm
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

## from datasets import load_dataset
dataset = load_dataset("conll2003")

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
# https://huggingface.co/datasets/conll2003

In [None]:
import pandas as pd

train_df = dataset["train"].to_pandas()
validation_df = dataset["validation"].to_pandas()
test_df = dataset["test"].to_pandas()
df = pd.concat([train_df, validation_df, test_df], ignore_index=True)
print(df.info())

df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20744 entries, 0 to 20743
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          20744 non-null  object
 1   tokens      20744 non-null  object
 2   pos_tags    20744 non-null  object
 3   chunk_tags  20744 non-null  object
 4   ner_tags    20744 non-null  object
dtypes: object(5)
memory usage: 810.4+ KB
None


Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,0,"[EU, rejects, German, call, to, boycott, Briti...","[22, 42, 16, 21, 35, 37, 16, 21, 7]","[11, 21, 11, 12, 21, 22, 11, 12, 0]","[3, 0, 7, 0, 0, 0, 7, 0, 0]"
1,1,"[Peter, Blackburn]","[22, 22]","[11, 12]","[1, 2]"
2,2,"[BRUSSELS, 1996-08-22]","[22, 11]","[11, 12]","[5, 0]"
3,3,"[The, European, Commission, said, on, Thursday...","[12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 3...","[11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 1...","[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, ..."
4,4,"[Germany, 's, representative, to, the, Europea...","[22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 2...","[11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 1...","[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, ..."


## Подготовка данных

In [None]:
train_df = dataset["train"].to_pandas()
validation_df = dataset["validation"].to_pandas()
test_df = dataset["test"].to_pandas()

In [None]:
# Словари для замены числовых индексов на теги
pos_tags_dict = {
    0: '"', 1: "''", 2: '#', 3: '$', 4: '(', 5: ')', 6: ',', 7: '.', 8: ':', 9: '``',
    10: 'CC', 11: 'CD', 12: 'DT', 13: 'EX', 14: 'FW', 15: 'IN', 16: 'JJ', 17: 'JJR',
    18: 'JJS', 19: 'LS', 20: 'MD', 21: 'NN', 22: 'NNP', 23: 'NNPS', 24: 'NNS',
    25: 'NN|SYM', 26: 'PDT', 27: 'POS', 28: 'PRP', 29: 'PRP$', 30: 'RB', 31: 'RBR',
    32: 'RBS', 33: 'RP', 34: 'SYM', 35: 'TO', 36: 'UH', 37: 'VB', 38: 'VBD',
    39: 'VBG', 40: 'VBN', 41: 'VBP', 42: 'VBZ', 43: 'WDT', 44: 'WP', 45: 'WP$', 46: 'WRB'
}

chunk_tags_dict = {
    0: 'O', 1: 'B-ADJP', 2: 'I-ADJP', 3: 'B-ADVP', 4: 'I-ADVP', 5: 'B-CONJP', 6: 'I-CONJP',
    7: 'B-INTJ', 8: 'I-INTJ', 9: 'B-LST', 10: 'I-LST', 11: 'B-NP', 12: 'I-NP', 13: 'B-PP',
    14: 'I-PP', 15: 'B-PRT', 16: 'I-PRT', 17: 'B-SBAR', 18: 'I-SBAR', 19: 'B-UCP', 20: 'I-UCP',
    21: 'B-VP', 22: 'I-VP'
}

ner_tags_dict = {
    0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC',
    7: 'B-MISC', 8: 'I-MISC'
}

train_df['pos_tags'] = train_df['pos_tags'].apply(lambda tags: [pos_tags_dict[tag] for tag in tags])
train_df['chunk_tags'] = train_df['chunk_tags'].apply(lambda tags: [chunk_tags_dict[tag] for tag in tags])
train_df['ner_tags'] = train_df['ner_tags'].apply(lambda tags: [ner_tags_dict[tag] for tag in tags])

validation_df['pos_tags'] = validation_df['pos_tags'].apply(lambda tags: [pos_tags_dict[tag] for tag in tags])
validation_df['chunk_tags'] = validation_df['chunk_tags'].apply(lambda tags: [chunk_tags_dict[tag] for tag in tags])
validation_df['ner_tags'] = validation_df['ner_tags'].apply(lambda tags: [ner_tags_dict[tag] for tag in tags])

test_df['pos_tags'] = test_df['pos_tags'].apply(lambda tags: [pos_tags_dict[tag] for tag in tags])
test_df['chunk_tags'] = test_df['chunk_tags'].apply(lambda tags: [chunk_tags_dict[tag] for tag in tags])
test_df['ner_tags'] = test_df['ner_tags'].apply(lambda tags: [ner_tags_dict[tag] for tag in tags])

In [None]:
train_df.head()


Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,0,"[EU, rejects, German, call, to, boycott, Briti...","[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]","[B-NP, B-VP, B-NP, I-NP, B-VP, I-VP, B-NP, I-N...","[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]"
1,1,"[Peter, Blackburn]","[NNP, NNP]","[B-NP, I-NP]","[B-PER, I-PER]"
2,2,"[BRUSSELS, 1996-08-22]","[NNP, CD]","[B-NP, I-NP]","[B-LOC, O]"
3,3,"[The, European, Commission, said, on, Thursday...","[DT, NNP, NNP, VBD, IN, NNP, PRP, VBD, IN, JJ,...","[B-NP, I-NP, I-NP, B-VP, B-PP, B-NP, B-NP, B-V...","[O, B-ORG, I-ORG, O, O, O, O, O, O, B-MISC, O,..."
4,4,"[Germany, 's, representative, to, the, Europea...","[NNP, POS, NN, TO, DT, NNP, NNP, POS, JJ, NN, ...","[B-NP, B-NP, I-NP, B-PP, B-NP, I-NP, I-NP, B-N...","[B-LOC, O, O, O, O, B-ORG, I-ORG, O, O, O, B-P..."


In [None]:
# объединяем данные из validation и test в одну выборку test
test_df = pd.concat([validation_df, test_df], ignore_index=True)

In [None]:
test_df.head()

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,0,"[CRICKET, -, LEICESTERSHIRE, TAKE, OVER, AT, T...","[NNP, :, NNP, NNP, IN, NNP, NNP, NNP, NNP, NN, .]","[B-NP, O, B-NP, I-NP, B-PP, B-NP, I-NP, I-NP, ...","[O, O, B-ORG, O, O, O, O, O, O, O, O]"
1,1,"[LONDON, 1996-08-30]","[NNP, CD]","[B-NP, I-NP]","[B-LOC, O]"
2,2,"[West, Indian, all-rounder, Phil, Simmons, too...","[NNP, NNP, NN, NNP, NNP, VBD, CD, IN, CD, IN, ...","[B-NP, I-NP, I-NP, I-NP, I-NP, B-VP, B-NP, B-P...","[B-MISC, I-MISC, O, B-PER, I-PER, O, O, O, O, ..."
3,3,"[Their, stay, on, top, ,, though, ,, may, be, ...","[PRP$, NN, IN, NN, ,, RB, ,, MD, VB, JJ, IN, N...","[B-NP, I-NP, B-PP, B-NP, O, B-ADVP, O, B-VP, I...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-ORG,..."
4,4,"[After, bowling, Somerset, out, for, 83, on, t...","[IN, VBG, NNP, RP, IN, CD, IN, DT, NN, NN, IN,...","[B-PP, B-NP, I-NP, B-PRT, B-PP, B-NP, B-PP, B-...","[O, O, B-ORG, O, O, O, O, O, O, O, O, B-LOC, I..."


In [None]:
X_train = train_df['tokens'].tolist()
y_train = train_df['pos_tags'].tolist()

X_test = test_df['tokens'].tolist()
y_test = test_df['pos_tags'].tolist()

In [None]:
def prepare_dataset(dataframe):
    dataset = []
    for i in range(len(dataframe)):
        sentence = []
        for j in range(len(dataframe['tokens'][i])):
            d = (dataframe['tokens'][i][j], dataframe['pos_tags'][i][j])
            sentence.append(d)
        dataset.append(sentence)
    return dataset

In [None]:
train_ds = prepare_dataset(train_df)
test_ds = prepare_dataset(test_df)

In [None]:
train_ds[0], test_ds[0]

([('EU', 'NNP'),
  ('rejects', 'VBZ'),
  ('German', 'JJ'),
  ('call', 'NN'),
  ('to', 'TO'),
  ('boycott', 'VB'),
  ('British', 'JJ'),
  ('lamb', 'NN'),
  ('.', '.')],
 [('CRICKET', 'NNP'),
  ('-', ':'),
  ('LEICESTERSHIRE', 'NNP'),
  ('TAKE', 'NNP'),
  ('OVER', 'IN'),
  ('AT', 'NNP'),
  ('TOP', 'NNP'),
  ('AFTER', 'NNP'),
  ('INNINGS', 'NNP'),
  ('VICTORY', 'NN'),
  ('.', '.')])

### Расшифровка пос тегов


In [None]:
": Quote (opening)
"'': Quote (closing)
#: Hash
$: Dollar
(: Left Parenthesis
): Right Parenthesis
,: Comma
.: Period
:: Colon
``: Quote (opening, alternate)
CC: Coordinating Conjunction
CD: Cardinal Number
DT: Determiner
EX: Existential There
FW: Foreign Word
IN: Preposition or Subordinating Conjunction
JJ: Adjective or Ordinal Number
JJR: Adjective, comparative
JJS: Adjective, superlative
LS: List Item Marker
MD: Modal
NN: Noun, singular or mass
NNP: Proper Noun, singular
NNPS: Proper Noun, plural
NNS: Noun, plural
NN|SYM: Noun or Symbol
PDT: Predeterminer
POS: Possessive Ending
PRP: Personal Pronoun
PRP$: Possessive Pronoun
RB: Adverb
RBR: Adverb, comparative
RBS: Adverb, superlative
RP: Particle
SYM: Symbol
TO: "to" as preposition or infinitive marker
UH: Interjection
VB: Verb, base form
VBD: Verb, past tense
VBG: Verb, gerund or present participle
VBN: Verb, past participle
VBP: Verb, non-3rd person singular present
VBZ: Verb, 3rd person singular present
WDT: Wh-determiner
WP: Wh-pronoun
WP$: Possessive wh-pronoun
WRB: Wh-adverb






## Определяем признаки

In [None]:
# попробуем разные наборы параметров
# m1 – базовые признаки слова, без префиксов и суффиксов
# m2 – базовые признаки слова, а также префиксы и суффиксы
# m3 – базовые признаки слова, префиксы и суффиксы, а также дополнительная информация о предыдущих и последующих словах

def word2features(sent, i, m):
    word = sent[i][0]
    if m == 2: # с префиксами и суффиксами
        features = {
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word[-3:]': word[-3:], # последние 3 буквы слова
            'word[-2:]': word[-2:], # последние 2 буквы
            'word[+2:]': word[2:], # следующие 2 буквы после слова
            'word[+3:]': word[3:],  # следующие 3 буквы после слова
            'word.isupper()': word.isupper(), # все буквы заглавные
            'word.istitle()': word.istitle(), # начинается ли с заглавной
            'word.isdigit()': word.isdigit(), # состоит ли из цифр
        }
    elif m == 3: # с префиксами и суффиксами, и информацией о предыдущих и следующих словах
        features = {
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word[-3:]': word[-3:],
            'word[-2:]': word[-2:],
            'word[+2:]': word[2:],
            'word[+3:]': word[3:],
            'word.isupper()': word.isupper(),
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
        }
        if i > 0:
            word1, postag1 = sent[i-1]
            features.update({
                '-1:word.lower()': word1.lower(), # предыдущее слово
                '-1:word.istitle()': word1.istitle(),  # предыдущее слово начинается ли с заглавной
                '-1:word.isupper()': word1.isupper(), # состоит из заглавных
                '-1:postag': postag1, # постег предыдущего слова
                '-1:postag[:2]': postag1[:2],# первые два символа тега части речи слова слева
            })
        else:
            features['BOS'] = True

        if i < len(sent)-1:
            word1, postag1 = sent[i+1]
            features.update({
                '+1:word.lower()': word1.lower(), # следующее слово
                '+1:word.istitle()': word1.istitle(),
                '+1:word.isupper()': word1.isupper(),
                '+1:postag': postag1,
                '+1:postag[:2]': postag1[:2],
            })
        else:
            features['EOS'] = True
    else:
        features = { # бэйзалайн
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word.isupper()': word.isupper(),
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
        }

    return features


def sent2features(sent, m):
    return [word2features(sent, i, m) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

## Обучаем CRF

In [None]:
model1 = sklearn_crfsuite.CRF(

    algorithm = 'l2sgd',
    c2 = 0.1,
    max_iterations = 100,
    all_possible_transitions = True
)

model2 = sklearn_crfsuite.CRF(

    algorithm = 'l2sgd',
    c2 = 0.1,
    max_iterations = 100,
    all_possible_transitions = True
)

model3 = sklearn_crfsuite.CRF(

    algorithm = 'l2sgd',
    c2 = 0.1,
    max_iterations = 100,
    all_possible_transitions = True
)

In [None]:
%%time
X_train_m1 = [sent2features(s, m = 1) for s in train_ds]
X_train_m2 = [sent2features(s, m = 2) for s in train_ds]
X_train_m3 = [sent2features(s, m = 3) for s in train_ds]
y_train = [sent2labels(s) for s in train_ds]

X_test_m1 = [sent2features(s, 1) for s in test_ds]
X_test_m2 = [sent2features(s, 2) for s in test_ds]
X_test_m3 = [sent2features(s, 3) for s in test_ds]
y_test = [sent2labels(s) for s in test_ds]

CPU times: user 1.6 s, sys: 192 ms, total: 1.79 s
Wall time: 1.79 s


In [None]:
X_train_m3[0]

[{'bias': 1.0,
  'word.lower()': 'eu',
  'word[-3:]': 'EU',
  'word[-2:]': 'EU',
  'word[+2:]': '',
  'word[+3:]': '',
  'word.isupper()': True,
  'word.istitle()': False,
  'word.isdigit()': False,
  'BOS': True,
  '+1:word.lower()': 'rejects',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'VBZ',
  '+1:postag[:2]': 'VB'},
 {'bias': 1.0,
  'word.lower()': 'rejects',
  'word[-3:]': 'cts',
  'word[-2:]': 'ts',
  'word[+2:]': 'jects',
  'word[+3:]': 'ects',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  '-1:word.lower()': 'eu',
  '-1:word.istitle()': False,
  '-1:word.isupper()': True,
  '-1:postag': 'NNP',
  '-1:postag[:2]': 'NN',
  '+1:word.lower()': 'german',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:postag': 'JJ',
  '+1:postag[:2]': 'JJ'},
 {'bias': 1.0,
  'word.lower()': 'german',
  'word[-3:]': 'man',
  'word[-2:]': 'an',
  'word[+2:]': 'rman',
  'word[+3:]': 'man',
  'word.isupper()': False,
  '

In [None]:
from tqdm import tqdm

def train_model_with_progress(model, X_train, y_train, subset_size=None):
    if subset_size is not None:
        X_train_subset = X_train[:subset_size]
        y_train_subset = y_train[:subset_size]
    else:
        X_train_subset = X_train
        y_train_subset = y_train

    for iteration in tqdm(range(1, model.max_iterations + 1)):
        model.fit(X_train_subset, y_train_subset)

In [None]:
train_model_with_progress(model1, X_train_m1, y_train, subset_size=1000)

100%|██████████| 100/100 [16:04<00:00,  9.65s/it]


In [None]:
train_model_with_progress(model2, X_train_m2, y_train, subset_size=1000)

100%|██████████| 100/100 [16:33<00:00,  9.93s/it]


In [None]:
train_model_with_progress(model3, X_train_m3, y_train, subset_size=1000)

100%|██████████| 100/100 [20:11<00:00, 12.12s/it]


In [None]:
labels1 = list(model1.classes_)
labels2 = list(model2.classes_)
labels3 = list(model3.classes_)

In [None]:
y_pred1 = model1.predict(X_test_m1)
f1_1 = metrics.flat_f1_score(y_test, y_pred1,
                      average='weighted', labels=labels1)

y_pred2 = model2.predict(X_test_m2)
f1_2 = metrics.flat_f1_score(y_test, y_pred2,
                      average='weighted', labels=labels2)

y_pred3 = model3.predict(X_test_m3)
f1_3 = metrics.flat_f1_score(y_test, y_pred3,
                      average='weighted', labels=labels3)
print(f1_1, f1_2, f1_3)

0.8440513801639149 0.8955385730864966 0.8977554747476696


## Матрица метрик модели 1

In [None]:
from sklearn.metrics import classification_report

y_test_flat = [label for sublist in y_test for label in sublist]
y_pred1_flat = [label for sublist in y_pred1 for label in sublist]

print(classification_report(y_test_flat, y_pred1_flat, labels=labels1))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         NNP       0.85      0.96      0.90     17153
         VBZ       0.95      0.61      0.74      1013
          JJ       0.65      0.60      0.62      5436
          NN       0.64      0.80      0.71     10957
          TO       1.00      0.99      1.00      1724
          VB       0.80      0.77      0.78      2052
           .       1.00      1.00      1.00      3509
          CD       0.87      0.93      0.90     10258
          DT       0.99      0.98      0.99      6320
         VBD       0.77      0.82      0.79      3929
          IN       0.96      0.97      0.97      8995
         PRP       1.00      0.89      0.94      1467
         NNS       0.83      0.45      0.58      4677
         VBP       0.84      0.58      0.69       696
          MD       0.93      0.90      0.91       568
         VBN       0.58      0.54      0.56      1859
         POS       0.96      0.99      0.97       770
         JJR       0.52    

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
def predict_pos_tags(sentence, model):
    tokens = sentence.split()
    features = [sent2features([(token, '')], m=3)[0] for token in tokens]
    predicted_labels = model.predict([features])
    return list(zip(tokens, predicted_labels[0]))

In [None]:
test_sentence = "I will present you a present"
predicted_tags = predict_pos_tags(test_sentence, model1)
for token, label in predicted_tags:
    print(f"{token}: {label}")

I: PRP
will: MD
present: VB
you: VBN
a: DT
present: NN


In [None]:
test_sentence = "The wind is too strong to wind the sail"
predicted_tags = predict_pos_tags(test_sentence, model1)
for token, label in predicted_tags:
    print(f"{token}: {label}")

The: DT
wind: NN
is: VBZ
too: RB
strong: JJ
to: TO
wind: VB
the: DT
sail: NN


In [None]:
test_sentence = "I want to book a book"
predicted_tags = predict_pos_tags(test_sentence, model1)
for token, label in predicted_tags:
    print(f"{token}: {label}")


I: PRP
want: VB
to: TO
book: VB
a: DT
book: NN


## Матрица метрик модели 2

In [None]:
y_pred2_flat = [label for sublist in y_pred2 for label in sublist]

print(classification_report(y_test_flat, y_pred2_flat, labels=labels2))


              precision    recall  f1-score   support

         NNP       0.88      0.95      0.91     17153
         VBZ       0.89      0.68      0.77      1013
          JJ       0.73      0.66      0.69      5436
          NN       0.80      0.83      0.81     10957
          TO       1.00      0.99      1.00      1724
          VB       0.82      0.77      0.79      2052
           .       1.00      1.00      1.00      3509
          CD       0.94      0.96      0.95     10258
          DT       0.98      0.98      0.98      6320
         VBD       0.87      0.89      0.88      3929
          IN       0.95      0.97      0.96      8995
         PRP       0.99      0.91      0.95      1467
         NNS       0.91      0.88      0.89      4677
         VBP       0.81      0.61      0.69       696
          MD       0.96      0.89      0.93       568
         VBN       0.76      0.73      0.75      1859
         POS       0.96      0.98      0.97       770
         JJR       0.60    

In [None]:
test_sentence = "I will present you a present"
predicted_tags = predict_pos_tags(test_sentence, model2)
for token, label in predicted_tags:
    print(f"{token}: {label}")

I: PRP
will: MD
present: VB
you: IN
a: DT
present: NN


In [None]:
test_sentence = "The wind is too strong to wind the sail"
predicted_tags = predict_pos_tags(test_sentence, model2)
for token, label in predicted_tags:
    print(f"{token}: {label}")

The: DT
wind: NN
is: VBZ
too: RB
strong: JJ
to: TO
wind: VB
the: DT
sail: NN


In [None]:
test_sentence = "I want to book a book"
predicted_tags = predict_pos_tags(test_sentence, model2)
for token, label in predicted_tags:
    print(f"{token}: {label}")


I: PRP
want: VB
to: TO
book: VB
a: DT
book: NN


## Матрица метрик модели 3

In [None]:
y_pred3_flat = [label for sublist in y_pred3 for label in sublist]

print(classification_report(y_test_flat, y_pred3_flat, labels=labels3))

              precision    recall  f1-score   support

         NNP       0.87      0.96      0.92     17153
         VBZ       0.93      0.69      0.79      1013
          JJ       0.74      0.67      0.70      5436
          NN       0.81      0.84      0.82     10957
          TO       1.00      0.99      1.00      1724
          VB       0.84      0.79      0.82      2052
           .       1.00      1.00      1.00      3509
          CD       0.94      0.95      0.95     10258
          DT       0.98      0.98      0.98      6320
         VBD       0.86      0.91      0.88      3929
          IN       0.95      0.97      0.96      8995
         PRP       0.99      0.91      0.95      1467
         NNS       0.91      0.89      0.90      4677
         VBP       0.83      0.66      0.73       696
          MD       0.98      0.92      0.95       568
         VBN       0.79      0.73      0.76      1859
         POS       0.94      0.98      0.96       770
         JJR       0.62    

In [None]:
test_sentence = "The wind is too strong to wind the sail"
predicted_tags = predict_pos_tags(test_sentence, model3)
for token, label in predicted_tags:
    print(f"{token}: {label}")

The: DT
wind: NN
is: VBZ
too: CD
strong: NN
to: TO
wind: VB
the: DT
sail: NN


## Выводы:

У последней model3, где больше всего параметров, лучший ф1 скор (0.8977554747476696)

Омонимия снимается (проверила на предложениях to wind – wind, to book – a book, to present – a present), но есть ошибки из-за того, что модели обучались на небольшой выборке из датасета (1000 предложений)