In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'arbicfakenews:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4928076%2F8321284%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240505%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240505T211635Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D5aed4b6909594010ae26463ea18d325cdebe84ce0eb87d70d95bb05c4f4ff35229803e6f3d87d711b974457f9ee731cad62963cad137e3e6615b23370f744db9cd95f3f6ae9c89e0ef093d78580011e22ef1b0f6ac1386c6931ff239386561d64b9390796b6e09925c504503f9a24d6157fbb71b1afc64749ea9790ac05244a4038e041386f3ce421417676011ac6863731bb0c01eedff69d8186d04391f93bf2f1404cae5fed2ca7164a6dd8762ef757ff4229d990965f557e20b24da0d63334e69b78256de3706b52e314e6021730ca43923e77656b6f54620feb240e96af3c0730f0f75ebf1163154561b884274bf7af34e220ca01d16e43c9bb3d946d466'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading arbicfakenews, 247202498 bytes compressed
Downloaded and uncompressed: arbicfakenews
Data source import complete.


In [None]:
!pip install transformers



In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch

In [None]:
data = pd.read_csv("/kaggle/input/arbicfakenews/final_data.csv")

In [None]:
data.head()

Unnamed: 0,title,text,label
0,"فيديو, هل لديك حساسية طعام؟المدة, 25,18",يعاني الكثير الشباب منطقة الشرق الأوسط وشمال أ...,real
1,اخر الاخبار اليوم محافظ المنيا ورئيس الجامعة ي...,الدكتور مصطفي عبد النبي رئيس جامعة المنيا والل...,fake
2,مدبولي يتابع الموقف التنفيذي لمشروع تطوير وتنم...,وأكد رئيس الوزراء المشروع القومي الكبير سيتم إ...,real
3,تسرب بسببها فصل بالكامل.. فاطمة رشدى ضربت الطا...,شكرا لقرائتكم خبر تسرب بسببها فصل بالكامل فاطم...,fake
4,سقوط تشكيل عصابي للاتجار بالمخدرات وحيازة الأس...,سقوط تشكيل عصابي للاتجار بالمخدرات وحيازة الأس...,real


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366000 entries, 0 to 365999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   title   365999 non-null  object
 1   text    365923 non-null  object
 2   label   365924 non-null  object
dtypes: object(3)
memory usage: 8.4+ MB


In [None]:
data.describe().T

Unnamed: 0,count,unique,top,freq
title,365999,357533,نصاف بن علية تتلقّى الجرعة الأولى من لقاح فايزر,6
text,365923,365923,يعاني الكثير الشباب منطقة الشرق الأوسط وشمال أ...,1
label,365924,2,real,203772


In [None]:
data.isnull().sum()

title     1
text     77
label    76
dtype: int64

In [None]:
data = data.dropna()
data = data.dropna(subset=['text', 'label'])

In [None]:
data.isnull().sum()

title    0
text     0
label    0
dtype: int64

In [None]:
data.shape

(365922, 3)

In [None]:
data['text_length'] = data['text'].apply(lambda x: len(str(x)))

In [None]:
data['text_length'].describe
min_length = data['text_length'].min()
min_length

6

In [None]:
num_rows_before = len(data)
data = data[data['text_length'] >= 20]
num_rows_after = len(data)
num_rows_deleted = num_rows_before - num_rows_after
print("Number of rows deleted:", num_rows_deleted)

Number of rows deleted: 15


In [None]:
data['text_length'].describe
min_length = data['text_length'].min()
min_length

20

In [None]:
data['label'].value_counts()

label
real    203766
fake    162141
Name: count, dtype: int64

In [None]:
# Calculate the total number of samples
total_samples = data['label'].count()

# Calculate the percentage for each label
data['label_percentage'] = (data.groupby('label')['label'].transform('count') / total_samples) * 100

# Display the result
print(data[['label', 'label_percentage']].drop_duplicates())

  label  label_percentage
0  real         55.687921
1  fake         44.312079


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label_percentage'] = (data.groupby('label')['label'].transform('count') / total_samples) * 100


In [None]:
data['label'] = data['label'].replace({'fake': 1, 'real': 0})

In [None]:
data = data.drop(['label_percentage'], axis=1)
data.head()

Unnamed: 0,title,text,label,text_length
0,"فيديو, هل لديك حساسية طعام؟المدة, 25,18",يعاني الكثير الشباب منطقة الشرق الأوسط وشمال أ...,0,1566
1,اخر الاخبار اليوم محافظ المنيا ورئيس الجامعة ي...,الدكتور مصطفي عبد النبي رئيس جامعة المنيا والل...,1,981
2,مدبولي يتابع الموقف التنفيذي لمشروع تطوير وتنم...,وأكد رئيس الوزراء المشروع القومي الكبير سيتم إ...,0,2784
3,تسرب بسببها فصل بالكامل.. فاطمة رشدى ضربت الطا...,شكرا لقرائتكم خبر تسرب بسببها فصل بالكامل فاطم...,1,1222
4,سقوط تشكيل عصابي للاتجار بالمخدرات وحيازة الأس...,سقوط تشكيل عصابي للاتجار بالمخدرات وحيازة الأس...,0,853


In [None]:
data['label'].unique()

array([0, 1])

In [None]:
data['text'] = data.apply(lambda x: x['text'][:200] if x['text_length'] > 200 else x['text'], axis=1)

In [None]:
data['text_length'] = data['text'].apply(lambda x: len(str(x)))

In [None]:
data.head()

Unnamed: 0,title,text,label,text_length
0,"فيديو, هل لديك حساسية طعام؟المدة, 25,18",يعاني الكثير الشباب منطقة الشرق الأوسط وشمال أ...,0,200
1,اخر الاخبار اليوم محافظ المنيا ورئيس الجامعة ي...,الدكتور مصطفي عبد النبي رئيس جامعة المنيا والل...,1,200
2,مدبولي يتابع الموقف التنفيذي لمشروع تطوير وتنم...,وأكد رئيس الوزراء المشروع القومي الكبير سيتم إ...,0,200
3,تسرب بسببها فصل بالكامل.. فاطمة رشدى ضربت الطا...,شكرا لقرائتكم خبر تسرب بسببها فصل بالكامل فاطم...,1,200
4,سقوط تشكيل عصابي للاتجار بالمخدرات وحيازة الأس...,سقوط تشكيل عصابي للاتجار بالمخدرات وحيازة الأس...,0,200


In [None]:
num_rows_before = len(data)
data = data.drop_duplicates(subset=['text'])
num_rows_after = len(data)
num_rows_deleted = num_rows_before - num_rows_after
print("Number of rows deleted:", num_rows_deleted)

Number of rows deleted: 6439


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 359468 entries, 0 to 365999
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        359468 non-null  object
 1   text         359468 non-null  object
 2   label        359468 non-null  int64 
 3   text_length  359468 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 13.7+ MB


In [None]:
total_samples = data['label'].count()
data['label_percentage'] = (data.groupby('label')['label'].transform('count') / total_samples) * 100
print(data[['label', 'label_percentage']].drop_duplicates())

   label  label_percentage
0      0         55.948235
1      1         44.051765


In [None]:
df = data.sample(n=22
                 000, random_state=42)

In [None]:
df.info()
#df=data

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 317724 to 160440
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             20000 non-null  object 
 1   text              20000 non-null  object 
 2   label             20000 non-null  int64  
 3   text_length       20000 non-null  int64  
 4   label_percentage  20000 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 937.5+ KB


In [None]:
total_samples = df['label'].count()
df['label_percentage'] = (df.groupby('label')['label'].transform('count') / total_samples) * 100
print(df[['label', 'label_percentage']].drop_duplicates())

        label  label_percentage
317724      1              44.1
151427      0              55.9


=====================================================================================================

In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['label'], test_size=0.2)

In [None]:
train_texts.shape

(16000,)

In [None]:
test_texts.shape

(4000,)

In [None]:
train_labels.shape

(16000,)

In [None]:
test_labels.shape

(4000,)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Load the Arabic BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

# Tokenize training texts
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=200, return_tensors="pt")

# Tokenize test texts
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=200, return_tensors="pt")


In [None]:
train_encodings

{'input_ids': tensor([[   33, 32232, 37684,  ...,    31,    31,    31],
        [   33,  1039,  7214,  ...,    31,    31,    31],
        [   33,  9339,  1258,  ...,    31,    31,    31],
        ...,
        [   33,  1386,   210,  ...,    31,    31,    31],
        [   33, 11601, 28783,  ...,    31,    31,    31],
        [   33, 11601, 28783,  ...,    31,    31,    31]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
test_encodings

{'input_ids': tensor([[  33, 1528, 2057,  ...,   31,   31,   31],
        [  33, 2164, 4297,  ...,   31,   31,   31],
        [  33, 2072, 7214,  ...,   31,   31,   31],
        ...,
        [  33, 6284, 9684,  ...,   31,   31,   31],
        [  33, 1774, 7214,  ...,   31,   31,   31],
        [  33, 1039,  210,  ...,   31,   31,   31]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels.tolist()))
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(test_labels.tolist()))


In [None]:
train_dataset

<torch.utils.data.dataset.TensorDataset at 0x7b80d903bbe0>

In [None]:
test_dataset

<torch.utils.data.dataset.TensorDataset at 0x7b7fd48adae0>

In [None]:
# DataLoader
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
test_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=16)


In [None]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7b7fd48ac190>

In [None]:
test_loader

<torch.utils.data.dataloader.DataLoader at 0x7b7fd48ae290>

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Load the Arabic BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")

# Load the Arabic BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabertv2")

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup

epochs = 16
total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
"""for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_attention_mask, b_labels = batch
        optimizer.zero_grad()

        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    print(f"Epoch {epoch+1}/{epochs} Loss: {total_loss/len(train_loader)}")

model.eval()
predictions, true_labels = [], []"""



'for epoch in range(epochs):\n    model.train()\n    total_loss = 0\n    for batch in train_loader:\n        batch = tuple(t.to(device) for t in batch)\n        b_input_ids, b_attention_mask, b_labels = batch\n        optimizer.zero_grad()\n\n        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)\n\n        loss = outputs.loss\n        total_loss += loss.item()\n        loss.backward()\n        optimizer.step()\n        scheduler.step()\n    print(f"Epoch {epoch+1}/{epochs} Loss: {total_loss/len(train_loader)}")\n\nmodel.eval()\npredictions, true_labels = [], []'

In [None]:
"""for batch in test_loader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_attention_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_attention_mask)

    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)
pred_flat = np.argmax(predictions, axis=1).flatten()
labels_flat = true_labels.flatten()

accuracy = accuracy_score(labels_flat, pred_flat)
print(f"Test Accuracy: {accuracy}")"""

'for batch in test_loader:\n    batch = tuple(t.to(device) for t in batch)\n    b_input_ids, b_attention_mask, b_labels = batch\n\n    with torch.no_grad():\n        outputs = model(b_input_ids, attention_mask=b_attention_mask)\n\n    logits = outputs[0]\n    logits = logits.detach().cpu().numpy()\n    label_ids = b_labels.to(\'cpu\').numpy()\n\n    predictions.append(logits)\n    true_labels.append(label_ids)\n\npredictions = np.concatenate(predictions, axis=0)\ntrue_labels = np.concatenate(true_labels, axis=0)\npred_flat = np.argmax(predictions, axis=1).flatten()\nlabels_flat = true_labels.flatten()\n\naccuracy = accuracy_score(labels_flat, pred_flat)\nprint(f"Test Accuracy: {accuracy}")'

mmm


In [None]:
from sklearn.metrics import accuracy_score

for epoch in range(epochs):
    model.train()
    total_loss = 0
    predictions, true_labels = [], []

    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_attention_mask, b_labels = batch
        optimizer.zero_grad()

        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)

        loss = outputs.loss
        total_loss += loss.item()
        logits = outputs[1]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions.extend(np.argmax(logits, axis=1).flatten())
        true_labels.extend(label_ids.flatten())

        loss.backward()
        optimizer.step()
        scheduler.step()

    epoch_loss = total_loss / len(train_loader)
    epoch_accuracy = accuracy_score(true_labels, predictions)
    print(f"Epoch {epoch+1}/{epochs} Loss: {epoch_loss:.4f} Accuracy: {epoch_accuracy:.4f}")

Epoch 1/16 Loss: 0.5311 Accuracy: 0.6897
Epoch 2/16 Loss: 0.4069 Accuracy: 0.7751
Epoch 3/16 Loss: 0.3672 Accuracy: 0.8030
Epoch 4/16 Loss: 0.3138 Accuracy: 0.8381
Epoch 5/16 Loss: 0.2656 Accuracy: 0.8725
Epoch 6/16 Loss: 0.2042 Accuracy: 0.9086
Epoch 7/16 Loss: 0.1455 Accuracy: 0.9373
Epoch 8/16 Loss: 0.1018 Accuracy: 0.9592
Epoch 9/16 Loss: 0.0786 Accuracy: 0.9688
Epoch 10/16 Loss: 0.0519 Accuracy: 0.9811
Epoch 11/16 Loss: 0.0344 Accuracy: 0.9872
Epoch 12/16 Loss: 0.0262 Accuracy: 0.9909
Epoch 13/16 Loss: 0.0209 Accuracy: 0.9934
Epoch 14/16 Loss: 0.0154 Accuracy: 0.9951
Epoch 15/16 Loss: 0.0112 Accuracy: 0.9960
Epoch 16/16 Loss: 0.0093 Accuracy: 0.9968


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

model.eval()
predictions, true_labels = [], []

for batch in test_loader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_attention_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_attention_mask)

    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.extend(np.argmax(logits, axis=1).flatten())
    true_labels.extend(label_ids.flatten())

# Calculate evaluation metrics
print(classification_report(true_labels, predictions))
print("Confusion Matrix:")
print(confusion_matrix(true_labels, predictions))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83      2193
           1       0.80      0.77      0.78      1807

    accuracy                           0.81      4000
   macro avg       0.81      0.81      0.81      4000
weighted avg       0.81      0.81      0.81      4000

Confusion Matrix:
[[1836  357]
 [ 410 1397]]


In [1]:
from sklearn.metrics import precision_recall_fscore_support

# Calculate precision, recall, f1-score, and support
precision, recall, f1_score, support = precision_recall_fscore_support(labels_flat, pred_flat)

# Print precision, recall, f1-score, and support
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)
print("Support:", support)

NameError: name 'labels_flat' is not defined

In [None]:
from sklearn.metrics import confusion_matrix


# Calculate confusion matrix
conf_matrix = confusion_matrix(labels_flat, pred_flat)
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
import joblib
# Save the model
model_path = "bert_sequence_classification_model.joblib"
joblib.dump(model, model_path)
print(f"Model saved to {model_path}")