In [None]:
pip install datasets -U

In [None]:
#this code contains how to directly use load_dataset function and convert the data to a form which suitable for transformers

In [None]:
import pandas as pd
from datasets import load_dataset

In [None]:
from sklearn.model_selection import train_test_split
data_csv=pd.read_csv("/content/spam_ham_dataset.csv")
#data_csv=data_csv.head(100)
train_data,test_data=train_test_split(data_csv,test_size=0.2,random_state=42)
train_data.to_csv("/content/train_data.csv")
test_data.to_csv("/content/test_data.csv")

In [None]:
data_files={"train":"/content/train_data.csv", "test":"/content/test_data.csv"}
data=load_dataset("csv",data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
data

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'label', 'text', 'label_num'],
        num_rows: 4136
    })
    test: Dataset({
        features: ['Unnamed: 0.1', 'Unnamed: 0', 'label', 'text', 'label_num'],
        num_rows: 1035
    })
})

In [None]:
data.keys()

dict_keys(['train', 'test'])

In [None]:
for split in data.keys():
  assert len(data[split])==len(data[split].unique("Unnamed: 0"))

In [None]:
data=data.rename_column(
    original_column_name="Unnamed: 0", new_column_name="id"
)
data

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'id', 'label', 'text', 'label_num'],
        num_rows: 4136
    })
    test: Dataset({
        features: ['Unnamed: 0.1', 'id', 'label', 'text', 'label_num'],
        num_rows: 1035
    })
})

In [None]:
def change_lower(df):
  return{"text":df["text"].lower()}




In [None]:
data=data.map(change_lower)

Map:   0%|          | 0/4136 [00:00<?, ? examples/s]

Map:   0%|          | 0/1035 [00:00<?, ? examples/s]

In [None]:
import html
data=data.map(lambda x: {"text_norm": html.unescape(x["text"])})

Map:   0%|          | 0/4136 [00:00<?, ? examples/s]

Map:   0%|          | 0/1035 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer
tokenizer=AutoTokenizer.from_pretrained("bert-base-cased")


def tokenizer_fun(examples):
  return tokenizer(examples["text_norm"],  truncation=True, padding="max_length",max_length=512)



In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
tokenized_data=data.map(tokenizer_fun, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/4136 [00:00<?, ? examples/s]

Map:   0%|          | 0/1035 [00:00<?, ? examples/s]

In [None]:
tokenized_data=tokenized_data.rename_column("label_num","labels")
tokenized_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0.1', 'id', 'label', 'text', 'labels', 'text_norm', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4136
    })
    test: Dataset({
        features: ['Unnamed: 0.1', 'id', 'label', 'text', 'labels', 'text_norm', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1035
    })
})

In [None]:
from torch.utils.data import DataLoader
train_dataloader=DataLoader(tokenized_data["train"], batch_size=16, shuffle=True)
test_dataloader=DataLoader(tokenized_data["test"], batch_size=16,shuffle=True)

In [None]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 512]),
 'attention_mask': torch.Size([16, 512])}

In [None]:
from transformers import BertForSequenceClassification
model=BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
device='cuda' if torch.cuda.is_available() else 'cpu'
device
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

optimizer=AdamW(model.parameters(), lr=5e-5)
num_epochs=3
num_training_steps=num_epochs*len(train_dataloader)

scheduler=get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [None]:
#train the model
model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    batch={k: v.to(device) for k, v in batch.items()}
    outputs=model(**batch)
    loss=outputs.loss
    loss.backward()

    optimizer.step()
    scheduler.step()

    optimizer.zero_grad()

    print(f"epoch:{epoch} , Loss: {loss.item()}")


epoch:0 , Loss: 0.6559979319572449
epoch:0 , Loss: 0.5907232761383057
epoch:0 , Loss: 0.5703838467597961
epoch:0 , Loss: 0.5963366627693176
epoch:0 , Loss: 0.5859978795051575
epoch:0 , Loss: 0.48742225766181946
epoch:0 , Loss: 0.5177549719810486
epoch:0 , Loss: 0.3244902491569519
epoch:0 , Loss: 0.48511406779289246
epoch:0 , Loss: 0.29679834842681885
epoch:0 , Loss: 0.4659445583820343
epoch:0 , Loss: 0.28222954273223877
epoch:0 , Loss: 0.3103661835193634
epoch:0 , Loss: 0.12992607057094574
epoch:0 , Loss: 0.2330240160226822
epoch:0 , Loss: 0.42614519596099854
epoch:0 , Loss: 0.22462481260299683
epoch:0 , Loss: 0.2657441794872284
epoch:0 , Loss: 0.1569427251815796
epoch:0 , Loss: 0.1369248926639557
epoch:0 , Loss: 0.12233452498912811
epoch:0 , Loss: 0.13852274417877197
epoch:0 , Loss: 0.17416058480739594
epoch:0 , Loss: 0.404460608959198
epoch:0 , Loss: 0.23251332342624664
epoch:0 , Loss: 0.2736930549144745
epoch:0 , Loss: 0.15979447960853577
epoch:0 , Loss: 0.393920361995697
epoch:0 , 

In [None]:
model.eval()
total_eval_acc=0
total_eval_loss=0


for batch in test_dataloader:
  batch={k:v.to(device) for k,v in batch.items()}

  with torch.no_grad():
    outputs=model(**batch)

  loss=outputs.loss
  total_eval_loss +=loss.item()

  logits=outputs.logits
  predictions=torch.argmax(logits,dim=-1)
  total_eval_acc +=(predictions == batch['labels']).float().mean()

avg_test_loss = total_eval_loss / len(test_dataloader)
avg_test_accuracy = total_eval_acc / len(test_dataloader)
print(f"Test Loss: {avg_test_loss}, Test Accuracy: {avg_test_accuracy}")

Test Loss: 0.02905968741352598, Test Accuracy: 0.9903846383094788
