In [None]:
pip install transformers

In [2]:
import numpy as np
import pandas as pd
import torch

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import transformers as ppb

In [4]:
df = pd.DataFrame(columns = ['sentence','class'])
df

Unnamed: 0,sentence,class


In [5]:
with open('/content/drive/MyDrive/data_chatbot.txt', 'r') as f:
    lst = []
    num=0
    for line in f:
        if line=='\n':
            lst = list(set(lst))
            df1 = pd.DataFrame({'sentence':lst,'class':[num]*len(lst)})
            df = pd.concat([df, df1]).reset_index(drop = True)
            lst=[]
            num+=1
        else:
            line = line.lower()
            lst.append(line[:-1])
df1 = pd.DataFrame({'sentence':lst,'class':[num]*len(lst)})
df = pd.concat([df, df1]).reset_index(drop = True)
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,sentence,class
0,какие условия по кредитам на покупку машины?,3
1,что есть в наличии?,4
2,какое количество функций ты в себя включаешь?,1
3,какие машины предлагаете?,4
4,каков функционал вашей компании?,2


In [6]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'DeepPavlov/rubert-base-cased')

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [7]:
tokenized = df['sentence'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
tokenized.head()

0    [101, 19201, 16132, 1516, 52189, 1469, 32170, ...
1              [101, 1997, 6818, 845, 25097, 166, 102]
2    [101, 25206, 9595, 15535, 4609, 845, 6678, 568...
3            [101, 19201, 14798, 19462, 842, 166, 102]
4    [101, 2739, 1388, 117086, 90754, 852, 6597, 16...
Name: sentence, dtype: object

In [8]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
np.array(padded).shape

(262, 16)

In [9]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(262, 16)

In [10]:
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [11]:
features = last_hidden_states[0][:,0,:].numpy()
labels = df['class']
labels=labels.astype('int')

In [23]:
x_train, x_test, y_train, y_test = train_test_split(features, labels,test_size=0.2)
lr_clf = LogisticRegression(C=1,multi_class='ovr',max_iter=300,random_state=41)
lr_clf.fit(x_train, y_train)
lr_clf.score(x_test, y_test)

0.9622641509433962

In [None]:
from xgboost import XGBClassifier

params = {'n_estimators': 500,
 'reg_alpha': 0.05,
 'reg_lambda': 9,
 #'colsample_bytree': 0.3,
 'eval_metric':'merror',
 'subsample': 0.8,
 'learning_rate': 0.1,
 'max_depth': 5,
 'num_leaves': 20,
 'random_state': 42,
 'min_child_samples': 47,
 'min_data_per_groups': 96,
 'tree_method': "gpu_hist",
 "gpu_id": 0}

model_xgb = XGBClassifier(**params)
model_xgb.fit(x_train,y_train, eval_set=[(x_test,y_test)], verbose=True,early_stopping_rounds=200)
y_pred = model_xgb.predict(x_test)
print(accuracy_score(y_test, y_pred))

In [30]:
y_pred = model_xgb.predict(x_test)
print(accuracy_score(y_test, y_pred))

0.9245283018867925


## Try bert-only classification

In [14]:
import torch
from transformers import BertModel, BertTokenizerFast
from transformers import Trainer, TrainingArguments, AdamW

In [15]:
class CustomBERTModel(torch.nn.Module):
  def __init__(self):
    super(CustomBERTModel, self).__init__()

    self.num_labels = 5
    self.weights = None
    self.bert = BertModel.from_pretrained('DeepPavlov/rubert-base-cased')
    self.dropout = torch.nn.Dropout(0.1)
    self.out = torch.nn.Linear(768, self.num_labels)
    # self.softmax = torch.nn.Softmax(19)

  def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
    res = self.bert(
          input_ids, 
          attention_mask=attention_mask,
          token_type_ids=token_type_ids
    )

    sequence_output = self.dropout(res['pooler_output'])

    logits = self.out(sequence_output)

    outputs = (logits, )

    if labels is not None:
      loss_fct = torch.nn.CrossEntropyLoss(weight=self.weights)
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
      outputs = (loss,) + outputs

    return outputs

In [16]:
tokenizer = BertTokenizerFast.from_pretrained('DeepPavlov/rubert-base-cased')

In [17]:
x_train, x_test, y_train, y_test = train_test_split(df.sentence.tolist(), df['class'].tolist(),test_size=0.2)

train_encodings = tokenizer(x_train, truncation=True, padding=True)
val_encodings = tokenizer(x_test, truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [18]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, y_train)
val_dataset = Dataset(val_encodings, y_test)

In [19]:
model = CustomBERTModel()

for param in model.bert.parameters():
  param.requires_grad = False

In [None]:
training_args = TrainingArguments(
    num_train_epochs=20,
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    output_dir=f'tmp/out',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir=f'tmp/logging',
    logging_steps=100,
    # save_steps=100,
    metric_for_best_model="eval_loss", 
    lr_scheduler_type='cosine_with_restarts',
    learning_rate=1e-2
)

model.train()
model.to('cuda:0')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

In [21]:
model.eval()

trainer = Trainer(
    model=model
)

y_pred = trainer.predict(val_dataset).predictions.argmax(axis=1).tolist()
print(accuracy_score(y_test, y_pred))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 53
  Batch size = 8


0.8867924528301887
