In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

  from .autonotebook import tqdm as notebook_tqdm


## загрузка данных

In [2]:
df_train = pd.read_csv("dataset_train.tsv", 
                       sep='\t', 
                       header=None,  
                       names=["text", "label"])
df_test = pd.read_csv("dataset_test.tsv", 
                       sep='\t', 
                       header=None,  
                       names=["text", "label"])

print(df_train.head())
print(df_train.info())

                         text              label
0           мне нужна справка  statement_general
1            оформить справку  statement_general
2               взять справку  statement_general
3        справку как получить  statement_general
4  справку ммф где получаться  statement_general
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13230 entries, 0 to 13229
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    13230 non-null  object
 1   label   13230 non-null  object
dtypes: object(2)
memory usage: 206.8+ KB
None


In [3]:
print(df_train['label'].value_counts())

label
sched_teacher                          1110
sched_for_group                         405
sched_for_group_day                     402
wifi                                    287
status_free                             271
loc_dean_mmf                            264
loc_shop                                237
stat_numb_of_students                   219
nsunet                                  211
student_union_event_guests              203
sched_exam                              195
campus_map                              189
socscholarship                          183
highscholarship                         182
conform                                 182
loc_atm                                 169
loc_pass_office                         168
finassist                               166
site_student_cab                        164
nsunet_cost                             154
travelfinaid_get                        151
enter_university_without_pass           151
dorm_guests               

## выбор модели 

In [5]:
model_name = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine

## генерация эмбеддингов предложений

In [7]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [8]:
def get_sentence_embedding(text, tokenizer, model, device, max_length=64):
    encoded_input = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    
    with torch.no_grad():
        model_output = model(**encoded_input)
    
    embedding = mean_pooling(model_output, encoded_input["attention_mask"])
    return embedding.cpu().numpy()[0]

In [9]:
train_embeddings = []
for text in tqdm(df_train["text"], desc="Train embeddings"):
    emb = get_sentence_embedding(text, tokenizer, model, device)
    train_embeddings.append(emb)

train_embeddings = np.array(train_embeddings)
print("Форма эмбеддингов для train:", train_embeddings.shape)

Train embeddings: 100%|██████████| 13230/13230 [01:01<00:00, 215.48it/s]

Форма эмбеддингов для train: (13230, 768)





In [10]:
test_embeddings = []
for text in tqdm(df_test["text"], desc="Test embeddings"):
    emb = get_sentence_embedding(text, tokenizer, model, device)
    test_embeddings.append(emb)

test_embeddings = np.array(test_embeddings)
print("Форма эмбеддингов для test:", test_embeddings.shape)

Test embeddings: 100%|██████████| 883/883 [00:04<00:00, 214.91it/s]

Форма эмбеддингов для test: (883, 768)





## KNN

In [11]:
knn = KNeighborsClassifier(n_neighbors=5, metric="cosine")
knn.fit(train_embeddings, df_train["label"])

In [12]:
y_pred = []
for emb in tqdm(test_embeddings, desc="Predicting test labels"):
    label = knn.predict([emb])[0]
    y_pred.append(label)

y_test = df_test["label"].values
y_pred = np.array(y_pred)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Predicting test labels: 100%|██████████| 883/883 [00:21<00:00, 40.65it/s]


Accuracy: 0.8720

Classification Report:
                                     precision    recall  f1-score   support

                    advice_learning       1.00      1.00      1.00         2
                        advice_okno       1.00      1.00      1.00         6
                         campus_map       0.79      0.92      0.85        12
                            conform       0.75      1.00      0.86        12
                        cvvr_define       0.50      1.00      0.67         2
                    diploma_red_def       1.00      1.00      1.00         3
            dorm_enter_without_pass       0.75      1.00      0.86         3
                        dorm_guests       1.00      0.89      0.94         9
                        dorm_living       1.00      1.00      1.00         4
                          dorm_pets       1.00      1.00      1.00         6
                         dorm_price       1.00      1.00      1.00         3
                      dormcard_lo


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
