In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from torch import cuda
from datasets import load_dataset
import datasets
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report

In [81]:
df = pd.read_csv('damla/dataset.csv')

In [83]:
df.head()

Unnamed: 0,id,text,date,user,rt,fav,followers,verified,label
0,221222,Bloomberg Businessweek’in 3 Temmuz tarihli öze...,2022-07-10 10:16:06+00:00,ibrahim___ethem,20,343,136873,False,Gerçek
1,221223,"en UCUZ #BIST 100 #hisse'leri\n#SAHOL 18,68&gt...",2022-07-09 20:53:12+00:00,ASIM_YALCINKAYA,107,934,138759,False,Gerçek
2,221225,📍İstanbul Havalimanı dün tarihinin en yüksek u...,2022-07-09 07:01:24+00:00,ibrahim___ethem,23,379,136873,False,Gerçek
3,221226,📍Fortune 500 listesinde 2020 ve 2021’de yer al...,2022-07-08 13:02:50+00:00,ibrahim___ethem,62,665,136873,False,Gerçek
4,221227,Bi tatile gidip geldik 4-5 gün neler olmuş öyl...,2022-07-08 07:14:06+00:00,____PASA____,79,1647,111568,False,Gerçek


In [86]:
len(df[df.label == 'Gerçek']), len(df[df.label == 'Yargı'])

(1222, 1136)

In [88]:
df2 = pd.concat([df.drop(['label'], axis=1), df['label'].str.get_dummies()], axis=1)

In [89]:
df2.head()

Unnamed: 0,id,text,date,user,rt,fav,followers,verified,Gerçek,Yargı
0,221222,Bloomberg Businessweek’in 3 Temmuz tarihli öze...,2022-07-10 10:16:06+00:00,ibrahim___ethem,20,343,136873,False,1,0
1,221223,"en UCUZ #BIST 100 #hisse'leri\n#SAHOL 18,68&gt...",2022-07-09 20:53:12+00:00,ASIM_YALCINKAYA,107,934,138759,False,1,0
2,221225,📍İstanbul Havalimanı dün tarihinin en yüksek u...,2022-07-09 07:01:24+00:00,ibrahim___ethem,23,379,136873,False,1,0
3,221226,📍Fortune 500 listesinde 2020 ve 2021’de yer al...,2022-07-08 13:02:50+00:00,ibrahim___ethem,62,665,136873,False,1,0
4,221227,Bi tatile gidip geldik 4-5 gün neler olmuş öyl...,2022-07-08 07:14:06+00:00,____PASA____,79,1647,111568,False,1,0


In [90]:
df2.drop(['Yargı'], axis=1, inplace=True)
# 1 for Gerçek 0 for Yargı
df2.head()

Unnamed: 0,id,text,date,user,rt,fav,followers,verified,Gerçek
0,221222,Bloomberg Businessweek’in 3 Temmuz tarihli öze...,2022-07-10 10:16:06+00:00,ibrahim___ethem,20,343,136873,False,1
1,221223,"en UCUZ #BIST 100 #hisse'leri\n#SAHOL 18,68&gt...",2022-07-09 20:53:12+00:00,ASIM_YALCINKAYA,107,934,138759,False,1
2,221225,📍İstanbul Havalimanı dün tarihinin en yüksek u...,2022-07-09 07:01:24+00:00,ibrahim___ethem,23,379,136873,False,1
3,221226,📍Fortune 500 listesinde 2020 ve 2021’de yer al...,2022-07-08 13:02:50+00:00,ibrahim___ethem,62,665,136873,False,1
4,221227,Bi tatile gidip geldik 4-5 gün neler olmuş öyl...,2022-07-08 07:14:06+00:00,____PASA____,79,1647,111568,False,1


In [92]:
df2.rename(columns={'Gerçek': 'label'}, inplace=True)
df2.head()

Unnamed: 0,id,text,date,user,rt,fav,followers,verified,label
0,221222,Bloomberg Businessweek’in 3 Temmuz tarihli öze...,2022-07-10 10:16:06+00:00,ibrahim___ethem,20,343,136873,False,1
1,221223,"en UCUZ #BIST 100 #hisse'leri\n#SAHOL 18,68&gt...",2022-07-09 20:53:12+00:00,ASIM_YALCINKAYA,107,934,138759,False,1
2,221225,📍İstanbul Havalimanı dün tarihinin en yüksek u...,2022-07-09 07:01:24+00:00,ibrahim___ethem,23,379,136873,False,1
3,221226,📍Fortune 500 listesinde 2020 ve 2021’de yer al...,2022-07-08 13:02:50+00:00,ibrahim___ethem,62,665,136873,False,1
4,221227,Bi tatile gidip geldik 4-5 gün neler olmuş öyl...,2022-07-08 07:14:06+00:00,____PASA____,79,1647,111568,False,1


In [94]:
len(df2[df2.label == 1]), len(df2[df2.label == 0])

(1222, 1136)

# Train - Test - Validation Split

In [97]:
train_df = df2.sample(frac=0.8, random_state=42)
test_df = df2.drop(train_df.index)
val_df = test_df.sample(frac=0.5, random_state=42)
test_df = test_df.drop(val_df.index)
print("Train :", len(train_df), "Test :", len(test_df), "Val :", len(val_df))

Train : 1886 Test : 236 Val : 236


In [99]:
tr_df = train_df.drop(['id', 'rt', 'fav', 'date', 'user', 'followers', 'verified'], axis=1, inplace=False)
tst_df = test_df.drop(['id', 'rt', 'fav', 'date', 'user', 'followers', 'verified'], axis=1, inplace=False)
vl_df = val_df.drop(['id', 'rt', 'fav', 'date', 'user', 'followers', 'verified'], axis=1, inplace=False)

In [102]:
tr_df.reset_index(drop=True, inplace=True)
tst_df.reset_index(drop=True, inplace=True)
vl_df.reset_index(drop=True, inplace=True)

In [105]:
train_dataset = datasets.Dataset.from_pandas(tr_df)
test_dataset = datasets.Dataset.from_pandas(tst_df)
val_dataset = datasets.Dataset.from_pandas(vl_df)

In [107]:
train_dataset[0]

{'text': 'Garanti Yatırım, İş Bankası için tavsiyesini "endekse paralel getiri" olarak korudu, yeni hedef fiyatı 12,50 TL  #ISCTR',
 'label': 1}

# Model

In [108]:
model_path = "dbmdz/bert-base-turkish-128k-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_path, max_len=512)

loading file https://huggingface.co/dbmdz/bert-base-turkish-128k-uncased/resolve/main/vocab.txt from cache at /Users/damlakonur/.cache/huggingface/transformers/96f3819e738b477201836364517d5979cdbdc6db98ff824a9d1d1918b4ea4cdf.e973deaebec490bbf506dc57141eecbe7c606e78ad5b55fbe9b2c9162abad092
loading file https://huggingface.co/dbmdz/bert-base-turkish-128k-uncased/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/dbmdz/bert-base-turkish-128k-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/dbmdz/bert-base-turkish-128k-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/dbmdz/bert-base-turkish-128k-uncased/resolve/main/tokenizer_config.json from cache at /Users/damlakonur/.cache/huggingface/transformers/1884d6c50b125149343c930759fdcd2bac71b3a4f7181f79d0ba0ac81c927dae.1234e3020e8b22f6151b88ea98a593213c8b28579933530baa777c65097a4e37
loading configuration file https://h

In [109]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [110]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

100%|██████████| 2/2 [00:03<00:00,  1.61s/ba]
100%|██████████| 1/1 [00:00<00:00, 35.85ba/s]
100%|██████████| 1/1 [00:00<00:00, 39.15ba/s]


In [113]:
tokenized_train

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1886
})

# Data Collator

Use DataCollatorWithPadding to create a batch of examples. It will also dynamically pad your text to the length of the longest element in its batch, so they are a uniform length. While it is possible to pad your text in the tokenizer function by setting padding=True, dynamic padding is more efficient.

In [115]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [117]:
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

loading configuration file https://huggingface.co/dbmdz/bert-base-turkish-128k-uncased/resolve/main/config.json from cache at /Users/damlakonur/.cache/huggingface/transformers/120e27321f5f101e4616b430bb300523eb0c464006badb271fc4a80ecb3f4551.453a629e781b4c858049daeb69936fc02d2ee7e3314c6c65fa5f432c13470419
Model config BertConfig {
  "_name_or_path": "dbmdz/bert-base-turkish-128k-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 128000
}

loading weights file https://huggingface.co/dbmdz/bert-base-turkish-128k-uncased/resolve/main/py

# Training 

In [120]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds - pred.predictions.argmax(-1)
    Precision, Recall, f1, _ = Precision_recall_fscore_support(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': Precision,
        'Recall': Recall
    }

In [121]:
training_args = TrainingArguments(
    output_dir="bc-results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [122]:
trainer.train()

***** Running training *****
  Num examples = 1886
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 590
  0%|          | 0/590 [00:00<?, ?it/s]The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
 20%|██        | 118/590 [29:10<2:07:54, 16.26s/it]***** Running Evaluation *****
  Num examples = 236
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


NameError: name 'preds' is not defined