In [12]:
import evaluate 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datasets
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import DataCollatorForTokenClassification
from datasets import load_dataset
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import pipeline

# Token Classification for Pozitif Yorum - Negatif Yorum

## Read Data

In [13]:
data = pd.read_json('Data/neg-pos-yorum.jsonl', orient='records', lines=True)

In [14]:
len(data)

664

# Model Path and Tokenizer

In [15]:
model_path = "dbmdz/bert-base-turkish-128k-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_path, max_len=512)

# Data Pre-Processing

In [16]:
def getTokenizedLabels(data):
    labels_list = []
    for text, doccano_label_list in zip(data['text'], data['label']):
        encoding = tokenizer(text, return_offsets_mapping=True)
        labels = [None] * len(encoding["offset_mapping"])
        doccano_label_index = 0
        for index, token_tupple in enumerate(encoding["offset_mapping"]):
            
            doccano_label = doccano_label_list[doccano_label_index]
            if(token_tupple == (0,0)): continue

            if(token_tupple[0] > doccano_label[1] and ((doccano_label_index+1) != len(doccano_label_list))):
                doccano_label_index += 1
                doccano_label = doccano_label_list[doccano_label_index]

            if(doccano_label[0] <= token_tupple[0] <= doccano_label[1]):
                labels[index] = doccano_label[2]
            else:
                labels[index] = 'O'
        labels_list.append(labels)
    return pd.Series(labels_list)

In [17]:
def enumLabels(bert_labels):
    label2id = {None:-100, 'O':0, 'Pozitif Yorum':1, 'Negatif Yorum':2}
    id2label = {v:k for k,v in label2id.items()}
    bert_labels_id = bert_labels.map(lambda x: [label2id[y] for y in x])
    return bert_labels_id

In [18]:
bert_labels = getTokenizedLabels(data)

In [19]:
bert_labels_id = enumLabels(bert_labels)

In [20]:
data['tokens'] = data.text.apply(lambda x: tokenizer(x).tokens())
data['input_ids'] = data.text.apply(lambda x: tokenizer(x).input_ids)
data['attention_mask'] = data.text.apply(lambda x: tokenizer(x).attention_mask)
data['token_type_ids'] = data.text.apply(lambda x: tokenizer(x).token_type_ids)
data['bert_labels']= bert_labels
data['bert_label_ids'] = bert_labels_id
df = data.copy()

In [21]:
data.drop(['id', 'label', 'text', 'date', 'user', 'rt', 'fav', 'followers', 'verified', 'tokens', 'bert_labels'], axis=1, inplace=True)
data.rename(columns={'bert_label_ids':'labels'}, inplace=True)
data.head()

Unnamed: 0,input_ids,attention_mask,token_type_ids,labels
0,"[2, 7, 62873, 98360, 23, 18, 5244, 7, 2158, 18...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[2, 7, 62873, 98360, 74996, 1017, 24, 16, 1056...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[2, 7904, 80300, 17468, 16, 22, 18, 84248, 11,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[2, 9725, 11487, 17468, 47268, 1045, 43459, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[2, 7, 2158, 18551, 1013, 28, 18, 8410, 29, 18...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [22]:
train, test = train_test_split(data, test_size=0.1, random_state=42)
train, val = train_test_split(train, test_size=0.11, random_state=42)
len(train), len(val), len(test)

(531, 66, 67)

In [23]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
val.reset_index(drop=True, inplace=True)
train_dataset = datasets.Dataset.from_pandas(train)
test_dataset = datasets.Dataset.from_pandas(test)
val_dataset = datasets.Dataset.from_pandas(val)

In [24]:
train_dataset 

Dataset({
    features: ['input_ids', 'attention_mask', 'token_type_ids', 'labels'],
    num_rows: 531
})

# Label Names

In [25]:
label_names = ["O", " Pozitif Yorum", " Negatif Yorum"]

In [26]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        'Pozitif_Yorum' : all_metrics['Pozitif Yorum'],
        'Negatif_Yorum' : all_metrics['Negatif Yorum'],
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Data Collation

In [27]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Model

In [28]:
label2id = {' O':0, ' Pozitif Yorum':1, ' Negatif Yorum':2}
id2label = {v:k for k,v in label2id.items()}

In [29]:
metric = evaluate.load("seqeval")

In [30]:
model = AutoModelForTokenClassification.from_pretrained(
    model_path,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not i

In [31]:
model.config.num_labels

3

In [52]:
args = TrainingArguments(
    "bert-finetuned-ner2-bist30",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [33]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [34]:
trainer.train()

***** Running training *****
  Num examples = 531
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 335


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 20%|██        | 67/335 [07:17<23:24,  5.24s/it]***** Running Evaluation *****
  Num examples = 67
  Batch size = 8
Trainer is attempting to log a value of "{'precision': 0.0546875, 'recall': 0.10294117647058823, 'f1': 0.07142857142857142, 'number': 68}" of type <class 'dict'> for key "eval/Pozitif_Yorum" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 8}" of type <class 'dict'> for key "eval/Negatif_Yorum" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
                                                
 20%|██        | 67/335 [07:33<23:24,  5.24s/it]Saving model checkpoint to bert-finetuned-ner2-bist30/checkpoint-67
Configuration saved in bert-finetuned-ner2-bist30/checkpoint-67/config.json


{'eval_loss': 0.3910437524318695, 'eval_Pozitif_Yorum': {'precision': 0.0546875, 'recall': 0.10294117647058823, 'f1': 0.07142857142857142, 'number': 68}, 'eval_Negatif_Yorum': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 8}, 'eval_precision': 0.05384615384615385, 'eval_recall': 0.09210526315789473, 'eval_f1': 0.0679611650485437, 'eval_accuracy': 0.8520078629598428, 'eval_runtime': 15.6516, 'eval_samples_per_second': 4.281, 'eval_steps_per_second': 0.575, 'epoch': 1.0}


Model weights saved in bert-finetuned-ner2-bist30/checkpoint-67/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner2-bist30/checkpoint-67/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner2-bist30/checkpoint-67/special_tokens_map.json
 40%|████      | 134/335 [14:52<18:03,  5.39s/it] ***** Running Evaluation *****
  Num examples = 67
  Batch size = 8
Trainer is attempting to log a value of "{'precision': 0.10833333333333334, 'recall': 0.19117647058823528, 'f1': 0.13829787234042554, 'number': 68}" of type <class 'dict'> for key "eval/Pozitif_Yorum" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.15384615384615385, 'recall': 0.25, 'f1': 0.1904761904761905, 'number': 8}" of type <class 'dict'> for key "eval/Negatif_Yorum" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
         

{'eval_loss': 0.3016418218612671, 'eval_Pozitif_Yorum': {'precision': 0.10833333333333334, 'recall': 0.19117647058823528, 'f1': 0.13829787234042554, 'number': 68}, 'eval_Negatif_Yorum': {'precision': 0.15384615384615385, 'recall': 0.25, 'f1': 0.1904761904761905, 'number': 8}, 'eval_precision': 0.11278195488721804, 'eval_recall': 0.19736842105263158, 'eval_f1': 0.14354066985645933, 'eval_accuracy': 0.883459702330806, 'eval_runtime': 15.5614, 'eval_samples_per_second': 4.306, 'eval_steps_per_second': 0.578, 'epoch': 2.0}


Model weights saved in bert-finetuned-ner2-bist30/checkpoint-134/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner2-bist30/checkpoint-134/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner2-bist30/checkpoint-134/special_tokens_map.json
 60%|██████    | 201/335 [22:27<11:13,  5.03s/it]***** Running Evaluation *****
  Num examples = 67
  Batch size = 8
Trainer is attempting to log a value of "{'precision': 0.14563106796116504, 'recall': 0.22058823529411764, 'f1': 0.17543859649122806, 'number': 68}" of type <class 'dict'> for key "eval/Pozitif_Yorum" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.16666666666666666, 'recall': 0.375, 'f1': 0.23076923076923078, 'number': 8}" of type <class 'dict'> for key "eval/Negatif_Yorum" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
     

{'eval_loss': 0.2868387699127197, 'eval_Pozitif_Yorum': {'precision': 0.14563106796116504, 'recall': 0.22058823529411764, 'f1': 0.17543859649122806, 'number': 68}, 'eval_Negatif_Yorum': {'precision': 0.16666666666666666, 'recall': 0.375, 'f1': 0.23076923076923078, 'number': 8}, 'eval_precision': 0.1487603305785124, 'eval_recall': 0.23684210526315788, 'eval_f1': 0.182741116751269, 'eval_accuracy': 0.8994664420106712, 'eval_runtime': 15.5823, 'eval_samples_per_second': 4.3, 'eval_steps_per_second': 0.578, 'epoch': 3.0}


Model weights saved in bert-finetuned-ner2-bist30/checkpoint-201/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner2-bist30/checkpoint-201/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner2-bist30/checkpoint-201/special_tokens_map.json
 80%|████████  | 268/335 [30:19<06:49,  6.11s/it]***** Running Evaluation *****
  Num examples = 67
  Batch size = 8
Trainer is attempting to log a value of "{'precision': 0.1702127659574468, 'recall': 0.23529411764705882, 'f1': 0.19753086419753088, 'number': 68}" of type <class 'dict'> for key "eval/Pozitif_Yorum" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.17647058823529413, 'recall': 0.375, 'f1': 0.24, 'number': 8}" of type <class 'dict'> for key "eval/Negatif_Yorum" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
                     

{'eval_loss': 0.30167296528816223, 'eval_Pozitif_Yorum': {'precision': 0.1702127659574468, 'recall': 0.23529411764705882, 'f1': 0.19753086419753088, 'number': 68}, 'eval_Negatif_Yorum': {'precision': 0.17647058823529413, 'recall': 0.375, 'f1': 0.24, 'number': 8}, 'eval_precision': 0.17117117117117117, 'eval_recall': 0.25, 'eval_f1': 0.2032085561497326, 'eval_accuracy': 0.8944116821117664, 'eval_runtime': 16.4405, 'eval_samples_per_second': 4.075, 'eval_steps_per_second': 0.547, 'epoch': 4.0}


Model weights saved in bert-finetuned-ner2-bist30/checkpoint-268/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner2-bist30/checkpoint-268/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner2-bist30/checkpoint-268/special_tokens_map.json
100%|██████████| 335/335 [38:18<00:00,  5.27s/it]***** Running Evaluation *****
  Num examples = 67
  Batch size = 8
Trainer is attempting to log a value of "{'precision': 0.2, 'recall': 0.27941176470588236, 'f1': 0.23312883435582823, 'number': 68}" of type <class 'dict'> for key "eval/Pozitif_Yorum" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.16666666666666666, 'recall': 0.375, 'f1': 0.23076923076923078, 'number': 8}" of type <class 'dict'> for key "eval/Negatif_Yorum" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
                     

{'eval_loss': 0.3065752387046814, 'eval_Pozitif_Yorum': {'precision': 0.2, 'recall': 0.27941176470588236, 'f1': 0.23312883435582823, 'number': 68}, 'eval_Negatif_Yorum': {'precision': 0.16666666666666666, 'recall': 0.375, 'f1': 0.23076923076923078, 'number': 8}, 'eval_precision': 0.19469026548672566, 'eval_recall': 0.2894736842105263, 'eval_f1': 0.23280423280423282, 'eval_accuracy': 0.8946925021061499, 'eval_runtime': 15.8911, 'eval_samples_per_second': 4.216, 'eval_steps_per_second': 0.566, 'epoch': 5.0}


Model weights saved in bert-finetuned-ner2-bist30/checkpoint-335/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner2-bist30/checkpoint-335/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner2-bist30/checkpoint-335/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 335/335 [38:47<00:00,  6.95s/it]

{'train_runtime': 2327.5608, 'train_samples_per_second': 1.141, 'train_steps_per_second': 0.144, 'train_loss': 0.29856952268685866, 'epoch': 5.0}





TrainOutput(global_step=335, training_loss=0.29856952268685866, metrics={'train_runtime': 2327.5608, 'train_samples_per_second': 1.141, 'train_steps_per_second': 0.144, 'train_loss': 0.29856952268685866, 'epoch': 5.0})

In [35]:
model.save_pretrained("model/bert-finetuned-ner2-bist30")
tokenizer.save_pretrained("model/tokenizer2")

Configuration saved in model/bert-finetuned-ner2-bist30/config.json
Model weights saved in model/bert-finetuned-ner2-bist30/pytorch_model.bin
tokenizer config file saved in model/tokenizer2/tokenizer_config.json
Special tokens file saved in model/tokenizer2/special_tokens_map.json


('model/tokenizer2/tokenizer_config.json',
 'model/tokenizer2/special_tokens_map.json',
 'model/tokenizer2/vocab.txt',
 'model/tokenizer2/added_tokens.json',
 'model/tokenizer2/tokenizer.json')

In [37]:
token_classifier = pipeline(
    "token-classification", model="model/bert-finetuned-ner2-bist30", aggregation_strategy="simple", tokenizer="model/tokenizer2"
)

loading configuration file model/bert-finetuned-ner2-bist30/config.json
Model config BertConfig {
  "_name_or_path": "model/bert-finetuned-ner2-bist30",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": " O",
    "1": " Pozitif Yorum",
    "2": " Negatif Yorum"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    " Negatif Yorum": 2,
    " O": 0,
    " Pozitif Yorum": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 128000
}

loading configuration file model/bert-finetuned-ner2-bist30/config.json
Mod

In [48]:
ex="#ASELS geri çekilme bekliyorum !"
token_classifier(ex)

[{'entity_group': ' O',
  'score': 0.8734143,
  'word': '# asels',
  'start': 0,
  'end': 6},
 {'entity_group': ' Negatif Yorum',
  'score': 0.4932291,
  'word': 'geri ceki',
  'start': 7,
  'end': 16},
 {'entity_group': ' Pozitif Yorum',
  'score': 0.6130686,
  'word': '##lme bekliyorum!',
  'start': 16,
  'end': 32}]

In [43]:
preds_output = trainer.predict(test_dataset)
y_preds = np.argmax(preds_output.predictions, axis=1)

***** Running Prediction *****
  Num examples = 67
  Batch size = 8


In [44]:
def align_predictions(predictions, label_ids): 
    preds = np.argmax(predictions, axis=2) 
    batch_size, seq_len = preds.shape 
    labels_list, preds_list = [], []
    for batch_idx in range(batch_size): 
        example_labels, example_preds = [], [] 
        for seq_idx in range(seq_len):
                    # Ignore label IDs = -100
            if label_ids[batch_idx, seq_idx] != -100: 
                example_labels.append(id2label[label_ids[batch_idx][seq_idx]]) 
                example_preds.append(id2label[preds[batch_idx][seq_idx]])
                labels_list.append(example_labels)
                preds_list.append(example_preds)
    return preds_list, labels_list

In [45]:
y_prd, y_tr = align_predictions(preds_output.predictions,preds_output.label_ids) 

In [47]:
from seqeval.metrics import classification_report
print(classification_report(y_tr, y_prd))



               precision    recall  f1-score   support

Negatif Yorum       0.20      0.36      0.26       527
            O       0.39      0.49      0.44      6452
Pozitif Yorum       0.19      0.25      0.21      3628

    micro avg       0.31      0.40      0.35     10607
    macro avg       0.26      0.37      0.30     10607
 weighted avg       0.31      0.40      0.35     10607

