In [1]:
# importing required libraries
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
import ru_core_news_md
from tqdm import tqdm
import sklearn_crfsuite
from sklearn_crfsuite import metrics


In [3]:
!python -m spacy download ru_core_news_md

Collecting ru-core-news-md==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_md-3.7.0/ru_core_news_md-3.7.0-py3-none-any.whl (41.9 MB)
     ---------------------------------------- 0.0/41.9 MB ? eta -:--:--
     --------------------------------------- 0.0/41.9 MB 991.0 kB/s eta 0:00:43
     --------------------------------------- 0.1/41.9 MB 919.0 kB/s eta 0:00:46
     ---------------------------------------- 0.2/41.9 MB 1.1 MB/s eta 0:00:37
     ---------------------------------------- 0.3/41.9 MB 1.8 MB/s eta 0:00:24
      --------------------------------------- 0.6/41.9 MB 2.5 MB/s eta 0:00:17
      --------------------------------------- 1.0/41.9 MB 3.7 MB/s eta 0:00:12
     - -------------------------------------- 1.3/41.9 MB 4.0 MB/s eta 0:00:11
     - -------------------------------------- 1.7/41.9 MB 4.5 MB/s eta 0:00:10
     -- ------------------------------------- 2.7/41.9 MB 6.3 MB/s eta 0:00:07
     --- ---------------------------

In [6]:
# Loading the dataset
jsonObj = pd.read_json(path_or_buf='train.jsonl', lines=True)
data = jsonObj

In [7]:
# Load the pre-trained Russian spaCy model
nlp = ru_core_news_md.load()

# Function to tokenize text and extract features
def sent2features(sent):
    doc = nlp(sent)
    return [token2features(doc, i) for i in range(len(doc))]

# Feature extraction
def token2features(doc, i):
    token = doc[i]
    features = {
        "bias": 1.0,
        "word.lower()": token.text.lower(),
        "word[-3:]": token.text[-3:],
        "word[-2:]": token.text[-2:],
        "word.isupper()": token.text.isupper(),
        "word.istitle()": token.text.istitle(),
        "word.isdigit()": token.text.isdigit(),
        "postag": token.pos_,
        "postag[:2]": token.pos_[:2],
    }
    # adding features for the previous token (if exists)
    if i > 0:
        prev_token = doc[i - 1]
        features.update({
            "-1:word.lower()": prev_token.text.lower(),
            "-1:word.istitle()": prev_token.text.istitle(),
            "-1:word.isupper()": prev_token.text.isupper(),
            "-1:postag": prev_token.pos_,
            "-1:postag[:2]": prev_token.pos_[:2],
        })
    else:
        features["BOS"] = True # indicator of the beginning of sentence

    # adding features for the next token (if exists)
    if i < len(doc) - 1:
        next_token = doc[i + 1]
        features.update({
            "+1:word.lower()": next_token.text.lower(),
            "+1:word.istitle()": next_token.text.istitle(),
            "+1:word.isupper()": next_token.text.isupper(),
            "+1:postag": next_token.pos_,
            "+1:postag[:2]": next_token.pos_[:2],
        })
    else:
        features["EOS"] = True # indicator of the ending of sentence

    return features

In [9]:
# Generating features and labels
labels_ = []
features = []
for idx, row in tqdm(data.iterrows(),total=data.shape[0]):
  sentence = row.sentences
  entities = row.ners
  doc = nlp(sentence)
  tokens = []
  labels = ['O'] * len(doc) # default label
  for start, end, label in entities:
      for token in doc:
          if token.idx == start:
              labels[token.i] = 'B-' + label # beginning of entity
          elif start < token.idx < end:
              labels[token.i] = 'I-' + label # inside of entity
  features.append(sent2features(sentence))
  labels_.append(labels)

100%|████████████████████████████████████████████████████████████████████████████████| 519/519 [02:11<00:00,  3.96it/s]


In [10]:
# Create a df for features and labels
dataset = pd.DataFrame({'features': features, 'labels': labels_})
X = dataset['features']
y = dataset['labels']

In [12]:
# Splitting df into training and testing sets
split = int(0.8 * len(X))
X_train = X[:split]
y_train = y[:split]
X_test = X[split:]
y_test = y[split:]

In [29]:
# configuration of CRF model
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

# Evaluate of the performance
y_pred = crf.predict(X_test)
print(metrics.flat_f1_score(y_test, y_pred, average='weighted'))

0.8431432565708364


In [30]:
# Re-train the model on the whole dataset
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X, y)

In [38]:
# prediction on new test data (for submission)
testObj = pd.read_json(path_or_buf='test_x.jsonl', lines=True)
features_test = []
for idx, row in tqdm(testObj.iterrows(),total=testObj.shape[0]):
  sentence = row.senences
  features_test.append(sent2features(sentence))
X_pred = pd.DataFrame({'features': features_test})['features']
y_pred = crf.predict(X_pred)

100%|██████████████████████████████████████████████████████████████████████████████████| 65/65 [00:12<00:00,  5.11it/s]


In [41]:
# constructing the final submission
answers= []
for i in range(len(y_pred)):
  answer = []
  for token, entity in zip(nlp(testObj.iloc[i].senences),y_pred[i]):
    if entity[0]=='B':
      i = token.idx
      j = i + len(token.text) -1
      answer.append([i,j,entity[2:]])
    elif entity[0]=='I':
      if len(answer)!=0:
        j += len(token.text)+1
        temp = answer[-1]
        temp[1] = j
        answer[-1] = temp
      else:
        i = token.idx
        j = i + len(token.text)-1
        answer.append([i,j,entity[2:]])
  answers.append(answer)

In [42]:
# Saving the submission
submission = pd.DataFrame({'ners':answers, 'id':testObj.id})
submission.to_json('test.jsonl',orient='records',lines=True)