# BERTurk

## Importing required libraries

In [None]:
!pip install transformers

In [None]:
!pip install --upgrade transformers
!pip install transformers accelerate

In [None]:
!pip install datasets

In [None]:
!pip install pyarrow 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
from nltk.corpus import stopwords
import torch.nn as nn
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.utils.data import TensorDataset, RandomSampler, DataLoader, SequentialSampler
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import typing
from typing import Dict
import pyarrow as pa
from datasets import Dataset
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

## DataFrame (clean_news.csv) processing steps for training the BERTurk model

In [None]:
# Load the cleaned news corpus. Malformed rows are dropped with a warning, which
# is what the deprecated `error_bad_lines=False` flag did with its defaults;
# `on_bad_lines` is its replacement (pandas >= 1.3).
data_path = 'clean_news.csv'
df = pd.read_csv(data_path, on_bad_lines='warn')



  df = pd.read_csv(data_path, error_bad_lines=False)


In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Body,Label
0,karadeniz tiyatro festivali zamanı devlet tiya...,gerçek
1,sözde torbacının haklı gururu dün gece suların...,yalan
2,buseyi aramak bin liralık cihazını alıp geldi ...,gerçek
3,west ham southampton maç özet premier ligin ha...,gerçek
4,ak partili yavuz duyurdu yskya ek dilekçe vere...,gerçek


In [None]:
# Report (rows, columns) of the loaded frame.
# NOTE(review): the saved output of this cell lists 2 columns, while the saved
# head() output also shows an "Unnamed: 0" column — the recorded outputs may come
# from different kernel states; confirm with Restart & Run All.
df.shape

(4455, 2)

In [None]:
# Print the non-null count of every column, via a second DataFrame built from df.
df1 = pd.DataFrame(df)
print(df1.count())

Body     4455
Label    4455
dtype: int64


In [None]:
# Load the BERTurk (cased) tokenizer that process_data relies on.
# AutoTokenizer is already imported in the notebook's import cell.
# The bare AutoModel previously loaded here was never used: `model` is bound to
# AutoModelForSequenceClassification before training, so the extra ~445 MB
# checkpoint download and initialisation is dropped.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-turkish-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import re 
def process_data(row, max_length=128):
    """Turn one news record into a tokenized training example.

    Args:
        row: Mapping-like record (a dict or a DataFrame row) exposing a
            'Body' text field and a 'Label' field.
        max_length: Fixed sequence length; the text is padded or truncated to
            exactly this many tokens. Defaults to 128.

    Returns:
        The encoding returned by the module-level ``tokenizer`` for the text,
        with two extra entries: 'label' (1 when Label equals 'gerçek',
        otherwise 0) and 'text' (the whitespace-normalised body).
    """
    body = row['Body']
    # A missing body (None / NaN) would otherwise be stringified to the literal
    # word "nan" and tokenized as if it were real content; use empty text instead.
    if body is None or (isinstance(body, float) and body != body):
        body = ''
    # Collapse every run of whitespace (including newlines) into a single space.
    text = ' '.join(str(body).split())

    # Pad/truncate to a fixed length so every example has the same shape.
    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=max_length)

    # Binary target: 1 for 'gerçek', 0 for every other label value.
    encodings['label'] = int(row['Label'] == 'gerçek')
    encodings['text'] = text

    return encodings

In [None]:
# Smoke-test process_data on a hand-written record and print the raw encoding.
print(process_data({'Body': 'this is a body text of news.', 'Label': 'gerçek'}))

{'input_ids': [2, 19792, 2605, 69, 11833, 7542, 29238, 3833, 21938, 1022, 18, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
# Tokenize every row of the corpus. Iterating over the frame itself (instead of
# indexing a hard-coded 4455-row slice) keeps this correct if the CSV changes size.
processed_data = [process_data(row) for _, row in df.iterrows()]

In [None]:
# Gather the per-row encodings into a single frame, then hold out 20% of the
# rows for validation. The fixed random_state keeps the split repeatable.
new_df = pd.DataFrame(processed_data)
train_df, valid_df = train_test_split(new_df, test_size=0.2, random_state=2022)

In [None]:
# Wrap the pandas splits as Hugging Face datasets for the Trainer.
# Dataset.from_pandas is the public factory for this conversion (rather than
# calling the Dataset constructor on a hand-built Arrow table).
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra column, which the model does not consume.
train_hg = Dataset.from_pandas(train_df, preserve_index=False)
valid_hg = Dataset.from_pandas(valid_df, preserve_index=False)

## Training stage for BERTurk

In [None]:
def compute_metrics(p):
    """Compute binary classification metrics for one evaluation pass.

    Args:
        p: Pair-like object that unpacks into (predictions, labels). Predictions
            are per-class scores; they are reduced to class ids with an argmax
            over axis 1 before scoring.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1", each computed by
        the corresponding scikit-learn metric function.
    """
    scores, labels = p
    # Most likely class per example.
    pred = np.argmax(scores, axis=1)

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=pred),
        "precision": precision_score(y_true=labels, y_pred=pred),
        "recall": recall_score(y_true=labels, y_pred=pred),
        "f1": f1_score(y_true=labels, y_pred=pred),
    }

In [None]:
from transformers import AutoModelForSequenceClassification

# Build the classifier to fine-tune: the pretrained Turkish BERT checkpoint
# loaded through the sequence-classification class with two output labels.
model = AutoModelForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=2)

Some weights of the model checkpoint at dbmdz/bert-base-turkish-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were 

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: output_dir is ./result, evaluation runs once per
# epoch, and training lasts 5 epochs. Everything else uses library defaults.
training_args = TrainingArguments(
    output_dir="./result",
    evaluation_strategy="epoch",
    num_train_epochs=5.0,
)

# Bind model, data splits, tokenizer and metric function into one Trainer.
trainer = Trainer(model=model, args=training_args, train_dataset=train_hg, eval_dataset=valid_hg, tokenizer=tokenizer, compute_metrics=compute_metrics)

In [None]:
# Run fine-tuning; the returned training summary is displayed as the cell output.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.09502,0.986532,0.987474,0.987474,0.987474
2,0.165300,0.135654,0.98092,0.987342,0.977035,0.982162
3,0.032900,0.145933,0.982043,0.983299,0.983299,0.983299
4,0.004800,0.128295,0.984287,0.983368,0.987474,0.985417
5,0.000000,0.132765,0.983165,0.983333,0.985386,0.984359


<class 'transformers.trainer_utils.EvalPrediction'>
<class 'transformers.trainer_utils.EvalPrediction'>
<class 'transformers.trainer_utils.EvalPrediction'>
<class 'transformers.trainer_utils.EvalPrediction'>
<class 'transformers.trainer_utils.EvalPrediction'>


TrainOutput(global_step=2230, training_loss=0.045526804603978725, metrics={'train_runtime': 497.8565, 'train_samples_per_second': 35.793, 'train_steps_per_second': 4.479, 'total_flos': 1172159751628800.0, 'train_loss': 0.045526804603978725, 'epoch': 5.0})

In [None]:
# Score the trainer's eval dataset with the current model weights; the metrics
# dict (including the compute_metrics values) is displayed as the cell output.
trainer.evaluate()

<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.13276545703411102,
 'eval_accuracy': 0.9831649831649831,
 'eval_precision': 0.9833333333333333,
 'eval_recall': 0.9853862212943633,
 'eval_f1': 0.9843587069864442,
 'eval_runtime': 6.483,
 'eval_samples_per_second': 137.437,
 'eval_steps_per_second': 17.276,
 'epoch': 5.0}