In [None]:
!pip install transformers datasets evaluate

In [None]:
!pip install transformers[torch]

In [4]:
import pandas as pd
import numpy as np
from ast import literal_eval
import re
import string

In [5]:
df = pd.read_excel('/kaggle/input/dialogs/dialogs_last.xlsx')

In [6]:
#label_name - название основной метки, replics_column_name - доп метки с разметкой по репликам
label_name = 'is_negative'
replics_column_name = 'negative_replics'

In [None]:
df[label_name].value_counts()

In [8]:
df = df[df[label_name].isin([0, 1])].reset_index(drop=True)

In [9]:
df[label_name] = df[label_name].astype('int')

Разбить текст диалога на реплики клиента

In [11]:
def get_client_replics_list(dialog_text):
    replics = dialog_text.replace('Customer:', '').split('\n\t')
    replics = [replic for replic in replics if not 'Agent:' in replic]
    replics = [replic.strip() for replic in replics]
    return replics

In [12]:
def get_label_replics_list(label_replics_str):
    if type(label_replics_str) != str:
        return []
    label_replics_list = [replic.strip() for replic in label_replics_str.split('\n\t;')]
    return label_replics_list

In [13]:
def make_replics_df(df, replics_column_name, label_name):
    replics_df = df[['text', replics_column_name]]
    replics_df[replics_column_name] = replics_df[replics_column_name].\
        map(get_label_replics_list)
    replics_df['text'] = replics_df['text'].map(get_client_replics_list)
    neg_replics = replics_df[replics_column_name].reset_index().explode(replics_column_name).dropna()
    replics_df = replics_df.explode('text').reset_index().rename({'index': 'dialog_id'}, axis=1)
    replics_df[label_name] = replics_df.apply(lambda x: x['text'] in x[replics_column_name], axis=1)
    neg_replics['is_bad_replic'] = neg_replics[replics_column_name].apply(lambda x: x not in replics_df['text'].values and x != '')
    bad_indices = neg_replics[neg_replics.groupby('index')['is_bad_replic'].sum() > 0]['index'].unique()
    replics_df = replics_df[~replics_df.index.isin(bad_indices)][['dialog_id', 'text', label_name]].reset_index(drop=True)
    replics_df[label_name] = replics_df[label_name].astype('int')
    return replics_df

### Предобработка

In [16]:
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def remove_nicknames(text):
    clean = re.compile('@\w+')
    return re.sub(clean, '', text)

def remove_urls(text):
    clean = re.compile(r'http\S+|www\S+')
    return re.sub(clean, '', text)

def clean_text(text):
    clean = re.compile(r'[-,:]')
    return re.sub(clean, '', text)

In [17]:
train_replics_df['text'] = train_replics_df['text'].apply(remove_html_tags).apply(remove_nicknames).apply(remove_urls)
eval_replics_df['text'] = eval_replics_df['text'].apply(remove_html_tags).apply(remove_nicknames).apply(remove_urls)
test_replics_df['text'] = test_replics_df['text'].apply(remove_html_tags).apply(remove_nicknames).apply(remove_urls)

In [18]:
train_replics_df

Unnamed: 0,dialog_id,text,is_negative
0,0,for some reason in the blackjacks gun game mo...,0
1,0,It was wired actually. I double checked every...,0
2,0,I’ll try hopping in a game right now and see!...,0
3,0,My last match went flawlessly. Thanks again!,0
4,1,What is going on with the website .... litera...,1
...,...,...,...
4223,765,u guys all suck turned my laptop off last n...,1
4224,765,Because of ur shit update so now I'm suppos...,1
4225,765,Oh almost didn't mention I've got auto upda...,0
4226,765,Yeh but it was fine when I turned it off last...,0


### Модель

In [43]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer

tokenizer_name = 'bert-base-uncased'
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [44]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [45]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
import torch
torch.manual_seed(42)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [47]:
class ClassificationTorchDataset(Dataset):

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        model_inputs = tokenizer(
            self.df.iloc[idx, :]['text'], max_length=512,
            truncation=True
        )
        model_inputs['label'] = self.df.iloc[idx, :][label_name]
        return model_inputs

In [48]:
train_replics_dataset = ClassificationTorchDataset(train_replics_df)
eval_replics_dataset = ClassificationTorchDataset(eval_replics_df)

In [49]:
import evaluate

f1 = evaluate.load("f1")

In [50]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels)

In [None]:
training_args = TrainingArguments(
    output_dir="model_dir",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=20,
    metric_for_best_model='f1',
    greater_is_better=True,
    push_to_hub=False,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_replics_dataset,
    eval_dataset=eval_replics_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    return np.array(logits.argmax(axis=1).cpu())

In [None]:
test_replics_df['text']

0       So neither my iPhone nor my Apple Watch are re...
1                                  please read the above.
2         My iPhone is on 11.1.2, and my watch is on 4.1.
3        I’ve restarted both, also un-paired then re-p...
4        Yes, everything seems fine, it’s just Health ...
                              ...                        
1193     may have been giving out the pre-order copies...
1194     but I would just like what I was sold and wha...
1195     correctly. I was told by staff in store that ...
1196     supply them to me that way? It would be nice ...
1197                            what is fair and correct.
Name: text, Length: 1198, dtype: object

### Cross-validation:

In [53]:
training_args = TrainingArguments(
    output_dir="model_dir",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=15,
    metric_for_best_model='f1',
    greater_is_better=True,
    push_to_hub=False,
    load_best_model_at_end=True,
    report_to='none'
)

In [58]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
import evaluate

n = 5
skf = StratifiedKFold(n_splits=n, random_state=42, shuffle=True)
f1_score_results = []

for train_index, test_index in skf.split(df, df[label_name]):

        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
        model.to(device)

        f1 = evaluate.load("f1")

        test_df = df.iloc[test_index]
        train_df, eval_df = train_test_split(df.iloc[train_index], test_size=0.25, random_state=42)

        train_df = train_df.reset_index(drop=True)
        eval_df = eval_df.reset_index(drop=True)
        test_df = test_df.reset_index(drop=True)

        train_replics_df = make_replics_df(train_df, replics_column_name=replics_column_name, label_name=label_name)
        eval_replics_df = make_replics_df(eval_df, replics_column_name=replics_column_name, label_name=label_name)
        test_replics_df = make_replics_df(test_df, replics_column_name=replics_column_name, label_name=label_name)

        train_replics_df['text'] = train_replics_df['text'].apply(remove_html_tags).apply(remove_nicknames).apply(remove_urls)
        eval_replics_df['text'] = eval_replics_df['text'].apply(remove_html_tags).apply(remove_nicknames).apply(remove_urls)
        test_replics_df['text'] = test_replics_df['text'].apply(remove_html_tags).apply(remove_nicknames).apply(remove_urls)

        train_replics_dataset = ClassificationTorchDataset(train_replics_df)
        eval_replics_dataset = ClassificationTorchDataset(eval_replics_df)

        trainer = Trainer(
          model=model,
          args=training_args,
          train_dataset=train_replics_dataset,
          eval_dataset=eval_replics_dataset,
          tokenizer=tokenizer,
          data_collator=data_collator,
          compute_metrics=compute_metrics
          )

        trainer.train()

        !rm -r model_dir

        labels = test_replics_df[label_name].values.astype('int')
        predictions = predict(test_replics_df['text'].values.tolist())
        
        test_replics_df['prediction'] = predictions
        
        dialog_predictions = test_replics_df.groupby('dialog_id')['prediction'].max()
        dialog_labels = test_replics_df.groupby('dialog_id')[label_name].max()

        precision_only, recall_only, f1_only, support_only = precision_recall_fscore_support(dialog_labels, dialog_predictions)

        f1_score_results.append(f1_only[1])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  replics_df[replics_column_name] = replics_df[replics_column_name].\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  replics_df['text'] = replics_df['text'].map(get_client_replics_list)
  bad_indices = neg_replics[neg_replics.

Epoch,Training Loss,Validation Loss,F1
1,No log,0.353802,0.0
2,No log,0.322633,0.0
3,No log,0.293494,0.0
4,No log,0.26647,0.0
5,0.337400,0.24709,0.0
6,0.337400,0.23164,0.073529
7,0.337400,0.22141,0.314465
8,0.337400,0.21406,0.450549
9,0.224700,0.208492,0.567164
10,0.224700,0.2039,0.59633


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  replics_df[replics_column_name] = replics_df[replics_column_name].\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

Epoch,Training Loss,Validation Loss,F1
1,No log,0.348653,0.0
2,No log,0.318738,0.0
3,No log,0.290153,0.0
4,No log,0.264617,0.0
5,0.334700,0.241955,0.0
6,0.334700,0.225488,0.0
7,0.334700,0.213947,0.106061
8,0.334700,0.20565,0.302632
9,0.334700,0.19876,0.453488
10,0.222100,0.193557,0.554974


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  replics_df[replics_column_name] = replics_df[replics_column_name].\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

Epoch,Training Loss,Validation Loss,F1
1,No log,0.349551,0.0
2,No log,0.318643,0.0
3,No log,0.288261,0.0
4,No log,0.256674,0.0
5,0.338800,0.232686,0.0
6,0.338800,0.214976,0.088235
7,0.338800,0.202919,0.303797
8,0.338800,0.193444,0.464088
9,0.226600,0.187169,0.517766
10,0.226600,0.182709,0.565854


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  replics_df[replics_column_name] = replics_df[replics_column_name].\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

Epoch,Training Loss,Validation Loss,F1
1,No log,0.353456,0.0
2,No log,0.321315,0.0
3,No log,0.290974,0.0
4,No log,0.263004,0.0
5,0.327000,0.241906,0.0
6,0.327000,0.227228,0.015625
7,0.327000,0.21742,0.33121
8,0.327000,0.209917,0.540541
9,0.327000,0.206675,0.591837
10,0.213300,0.202446,0.599034


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  replics_df[replics_column_name] = replics_df[replics_column_name].\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

Epoch,Training Loss,Validation Loss,F1
1,No log,0.381377,0.0
2,No log,0.351461,0.0
3,No log,0.319986,0.0
4,No log,0.286078,0.0
5,0.329800,0.261255,0.0
6,0.329800,0.244315,0.039735
7,0.329800,0.233733,0.275862
8,0.329800,0.223841,0.424242
9,0.214400,0.222294,0.488038
10,0.214400,0.216024,0.566372


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [56]:
!rm -r model_dir

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [59]:
f1_score_results

[0.676923076923077,
 0.7559055118110236,
 0.7299270072992701,
 0.7647058823529411,
 0.6865671641791046]

### Итоговый результат

In [60]:
sum(f1_score_results) / len(f1_score_results)

0.7228057285130833