In [1]:
import os

os.environ['MLFLOW_EXPERIMENT_NAME'] = "LingBizkit_experiment"
os.environ['MLFLOW_TRACKING_URI'] = "http://79.137.194.156:5000/"

In [2]:
from transformers import set_seed

set_seed(228)



In [3]:
import pandas as pd

dataset_df = pd.read_csv('preproc_dataset.csv')
#dataset_df

In [4]:
dataset_df.text = dataset_df.text.map(lambda x: x[:768])

In [5]:
id2label = {k: v for k, v in enumerate(dataset_df.theme.unique())}
label2id = {v: k for k, v in id2label.items()}

In [6]:
from datasets import Dataset

temp_dataset_df = dataset_df.rename({'theme': 'labels'}, axis=1)
temp_dataset_df.labels = temp_dataset_df.labels.map(lambda x: label2id[x])
train_dataset = Dataset.from_pandas(temp_dataset_df[temp_dataset_df.sample_type == 'train'][['text', 'labels']])
test_dataset = Dataset.from_pandas(temp_dataset_df[temp_dataset_df.sample_type == 'test'][['text', 'labels']])

In [7]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

### Batch inference

In [8]:
import torch
from tqdm.notebook import tqdm

BATCH_SIZE = 8

def batch_iter(iterable, n=1):
    l = len(iterable)
    it = iter(iterable)
    temp = []

    for ndx in range(0, l):
        temp.append(next(it))
        if len(temp) == n:
            yield temp.copy()
            temp.clear()

    if temp:
        yield temp

def get_logits(model_path):
    train_logits = []
    test_logits = []
    all_trues = []
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.to('cuda')

    model.eval()
    with torch.no_grad():
        for batch in tqdm(batch_iter(train_dataset, BATCH_SIZE), total=len(train_dataset) // BATCH_SIZE):
            texts = [b['text'] for b in batch]
            tokenized = tokenizer(texts, return_tensors='pt', truncation=True, padding=True).to('cuda')
            batch_logits = model(**tokenized).logits.cpu()
            # batch_pred_ids = batch_logits.argmax(axis=1).numpy()
            # pred_labels = [model.config.id2label[label_id] for label_id in batch_pred_ids]
            train_logits.extend(batch_logits.numpy())
            # all_trues.extend([id2label[b['labels']] for b in batch])

        for batch in tqdm(batch_iter(test_dataset, BATCH_SIZE), total=len(test_dataset) // BATCH_SIZE):
            texts = [b['text'] for b in batch]
            tokenized = tokenizer(texts, return_tensors='pt', truncation=True, padding=True).to('cuda')
            batch_logits = model(**tokenized).logits.cpu()
            # batch_pred_ids = batch_logits.argmax(axis=1).numpy()
            # pred_labels = [model.config.id2label[label_id] for label_id in batch_pred_ids]
            test_logits.extend(batch_logits.numpy())
            #all_trues.extend([id2label[b['labels']] for b in batch])
    return train_logits, test_logits

In [9]:
y_train = [id2label[lab] for lab in train_dataset['labels']]
y_test = [id2label[lab] for lab in test_dataset['labels']]

In [10]:
train_logits_LaBSE, test_logits_LaBSE = get_logits('LaBSE-en-ru_checkpoint-6795_group_theme')

  0%|          | 0/2264 [00:00<?, ?it/s]

  0%|          | 0/564 [00:00<?, ?it/s]

In [11]:
train_logits_LaBSE_theme, test_logits_LaBSE_theme = get_logits('LaBSE-er-ru_checkpoint-5500_theme')

  0%|          | 0/2264 [00:00<?, ?it/s]

  0%|          | 0/564 [00:00<?, ?it/s]

In [12]:
train_logits_LaBSE_executor, test_logits_LaBSE_executor = get_logits('cointegrated/LaBSE-en-ru_executor/checkpoint-1500')

  0%|          | 0/2264 [00:00<?, ?it/s]

  0%|          | 0/564 [00:00<?, ?it/s]

In [13]:
X_train_1 = pd.DataFrame({
    'text': train_dataset['text'],
    'logits': train_logits_LaBSE
})

X_test_1 = pd.DataFrame({
    'text': test_dataset['text'],
    'logits': test_logits_LaBSE
})

X_train_2 = pd.DataFrame({
    'text': train_dataset['text'],
    'logits': train_logits_LaBSE_theme
})

X_test_2 = pd.DataFrame({
    'text': test_dataset['text'],
    'logits': test_logits_LaBSE_theme
})


X_train_3 = pd.DataFrame({
    'text': train_dataset['text'],
    'logits': train_logits_LaBSE_executor
})

X_test_3 = pd.DataFrame({
    'text': test_dataset['text'],
    'logits': test_logits_LaBSE_executor
})


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from scipy.sparse import hstack as sparse_hstack
from torch.nn.functional import softmax

tfidf = TfidfVectorizer()
tfidf_features = tfidf.fit_transform(X_train_1.text)

combined_features = sparse_hstack([tfidf_features, 
                                   softmax(torch.tensor(X_train_1.logits.to_list()), dim=1).numpy(),
                                   softmax(torch.tensor(X_train_2.logits.to_list()), dim=1).numpy(),
                                   #softmax(torch.tensor(X_train_3.logits.to_list()), dim=1).numpy()
                                  ])

classifier = SGDClassifier(loss='modified_huber', random_state=42)
classifier.fit(combined_features, y_train)

import pickle
with open('final_tf_idf.pkl', 'wb') as fout:
    pickle.dump(tfidf, fout)
    
with open('final_model.pkl', 'wb') as fout:
    pickle.dump(classifier, fout)

  softmax(torch.tensor(X_train_1.logits.to_list()), dim=1).numpy(),


In [15]:
tfidf_test = tfidf.transform(X_test_1.text)

test_features = sparse_hstack([tfidf_test, 
                               softmax(torch.tensor(X_test_1.logits.to_list()), dim=1).numpy(),
                               softmax(torch.tensor(X_test_2.logits.to_list()), dim=1).numpy(),
                               #softmax(torch.tensor(X_test_3.logits.to_list()), dim=1).numpy(),
                              ])
y_pred = classifier.predict(test_features)


In [16]:
from sklearn.metrics import f1_score

f1_score(y_test, y_pred, average='weighted')

0.5457579947389618

In [17]:
from sklearn.metrics import f1_score

f1_score(y_test, y_pred, average='weighted')

0.5457579947389618