In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import torch
import peft


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.chdir(r"C:\Users\ethan\Desktop\scam_detection")

In [3]:
data_df = pd.read_csv('spam.csv', encoding='latin1')
data_df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
data_df = data_df.rename(columns={'v1': 'label', 'v2': 'text'})

data_df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
from sklearn import model_selection
train_df , val_df = model_selection.train_test_split(data_df, test_size=0.3, random_state=42)

In [5]:
import datasets
train_ds = datasets.Dataset.from_pandas(train_df)
val_ds = datasets.Dataset.from_pandas(val_df)

dataset_dict = datasets.DatasetDict({'train': train_ds, 'val': val_ds})

for split in dataset_dict.keys():
    dataset_dict[split] = dataset_dict[split].remove_columns("__index_level_0__")

dataset_dict


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 3900
    })
    val: Dataset({
        features: ['label', 'text'],
        num_rows: 1672
    })
})

In [6]:

def label_to_int(label):
    if label == 'spam':
        return 1
    elif label == 'ham':
        return 0
def convert_labels(example):
    example['label'] = label_to_int(example['label'])
    return example

dataset_dict = dataset_dict.map(convert_labels)

print(dataset_dict['train']['label'][:5])

Map: 100%|██████████| 3900/3900 [00:00<00:00, 57222.06 examples/s]
Map: 100%|██████████| 1672/1672 [00:00<00:00, 56203.73 examples/s]

[1, 0, 0, 0, 0]





In [7]:
np.array(dataset_dict['train']['label']).sum()/len(dataset_dict['train']['label'])

0.13538461538461538

In [8]:
import transformers
model_checkpoint = "distilbert-base-uncased"
id2label = {0: "ham", 1: "spam"}
label2id = {"ham": 0, "spam": 1}

config = transformers.AutoConfig.from_pretrained(
    model_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

model = transformers.AutoModelForSequenceClassification.from_pretrained(model_checkpoint, config=config)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [10]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=False)

def tokenize_function(examples):
    return tokenizer(examples["text"], return_tensors="pt", padding="max_length", truncation=True, max_length=512)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
print(type(dataset_dict))

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
tokenized_datasets


<class 'datasets.dataset_dict.DatasetDict'>


Map: 100%|██████████| 3900/3900 [00:00<00:00, 5926.80 examples/s]
Map: 100%|██████████| 1672/1672 [00:00<00:00, 6949.16 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 3900
    })
    val: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1672
    })
})

In [11]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions,references=labels)}

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<?, ?B/s]


In [13]:
example_list = ["Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...", 
                "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 
                "Nah I don't think he goes to usf, he lives around here though",
                "Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030"]

print("Untrained Model Predictions:")
for text in example_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    print(text + " - " + id2label[predictions.tolist()])
    

Untrained Model Predictions:
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat... - ham
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's - ham
Nah I don't think he goes to usf, he lives around here though - ham
Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030 - ham


In [14]:
model.train()
model.gradient_checkpointing_enable()
model = peft.prepare_model_for_kbit_training(model)


In [15]:
config = peft.LoraConfig(
    r=4,
    lora_alpha=32,
    target_modules=["q_lin"],
    lora_dropout=0.05,
    bias = "none",
    task_type="SEQ_CLS"
)

model = peft.get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [16]:
lr = 1e-3
batch_size = 4
num_epochs = 10

training_args = transformers.TrainingArguments(
    output_dir = model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)




In [17]:
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  5%|▌         | 503/9750 [00:30<09:08, 16.85it/s]

{'loss': 0.0832, 'grad_norm': 0.00033634560531936586, 'learning_rate': 0.0009487179487179487, 'epoch': 0.51}


                                                  
 10%|█         | 975/9750 [01:06<08:46, 16.68it/s]

{'eval_loss': 0.052755072712898254, 'eval_accuracy': {'accuracy': 0.9916267942583732}, 'eval_runtime': 7.6091, 'eval_samples_per_second': 219.738, 'eval_steps_per_second': 54.934, 'epoch': 1.0}


 10%|█         | 1003/9750 [01:08<10:37, 13.71it/s] 

{'loss': 0.0753, 'grad_norm': 0.0001024946614052169, 'learning_rate': 0.0008974358974358974, 'epoch': 1.03}


 15%|█▌        | 1503/9750 [01:38<08:09, 16.86it/s]

{'loss': 0.0493, 'grad_norm': 0.006646420806646347, 'learning_rate': 0.0008461538461538462, 'epoch': 1.54}


                                                   
 20%|██        | 1950/9750 [02:12<08:00, 16.23it/s]

{'eval_loss': 0.03994657099246979, 'eval_accuracy': {'accuracy': 0.993421052631579}, 'eval_runtime': 7.5922, 'eval_samples_per_second': 220.225, 'eval_steps_per_second': 55.056, 'epoch': 2.0}


 21%|██        | 2003/9750 [02:16<07:53, 16.35it/s]  

{'loss': 0.034, 'grad_norm': 0.0003854525275528431, 'learning_rate': 0.0007948717948717948, 'epoch': 2.05}


 26%|██▌       | 2503/9750 [02:46<07:12, 16.77it/s]

{'loss': 0.0295, 'grad_norm': 0.00012184635124867782, 'learning_rate': 0.0007435897435897436, 'epoch': 2.56}


                                                   
 30%|███       | 2925/9750 [03:19<06:49, 16.66it/s]

{'eval_loss': 0.05799700692296028, 'eval_accuracy': {'accuracy': 0.992822966507177}, 'eval_runtime': 7.5117, 'eval_samples_per_second': 222.586, 'eval_steps_per_second': 55.646, 'epoch': 3.0}


 31%|███       | 3003/9750 [03:24<06:52, 16.34it/s]  

{'loss': 0.0225, 'grad_norm': 1.1678788780500327e-07, 'learning_rate': 0.0006923076923076923, 'epoch': 3.08}


 36%|███▌      | 3503/9750 [03:53<06:08, 16.93it/s]

{'loss': 0.0135, 'grad_norm': 0.019177259877324104, 'learning_rate': 0.0006410256410256411, 'epoch': 3.59}


                                                   
 40%|████      | 3900/9750 [04:26<06:02, 16.13it/s]

{'eval_loss': 0.09125298261642456, 'eval_accuracy': {'accuracy': 0.9922248803827751}, 'eval_runtime': 7.8745, 'eval_samples_per_second': 212.332, 'eval_steps_per_second': 53.083, 'epoch': 4.0}


 41%|████      | 4003/9750 [04:33<05:57, 16.06it/s]  

{'loss': 0.0214, 'grad_norm': 5.72961994294019e-07, 'learning_rate': 0.0005897435897435898, 'epoch': 4.1}


 46%|████▌     | 4503/9750 [05:03<05:17, 16.54it/s]

{'loss': 0.0137, 'grad_norm': 0.0002023340784944594, 'learning_rate': 0.0005384615384615384, 'epoch': 4.62}


                                                   
 50%|█████     | 4875/9750 [05:34<05:01, 16.16it/s]

{'eval_loss': 0.06120887026190758, 'eval_accuracy': {'accuracy': 0.992822966507177}, 'eval_runtime': 7.8697, 'eval_samples_per_second': 212.462, 'eval_steps_per_second': 53.115, 'epoch': 5.0}


 51%|█████▏    | 5003/9750 [05:43<04:57, 15.98it/s]  

{'loss': 0.0133, 'grad_norm': 8.894604497378111e-10, 'learning_rate': 0.0004871794871794872, 'epoch': 5.13}


 56%|█████▋    | 5503/9750 [06:13<04:19, 16.38it/s]

{'loss': 0.0016, 'grad_norm': 0.00042926750029437244, 'learning_rate': 0.0004358974358974359, 'epoch': 5.64}


                                                   
 60%|██████    | 5850/9750 [06:43<04:00, 16.22it/s]

{'eval_loss': 0.09116573631763458, 'eval_accuracy': {'accuracy': 0.993421052631579}, 'eval_runtime': 7.7625, 'eval_samples_per_second': 215.394, 'eval_steps_per_second': 53.848, 'epoch': 6.0}


 62%|██████▏   | 6003/9750 [06:52<03:50, 16.25it/s]  

{'loss': 0.0095, 'grad_norm': 1.5902503758269404e-09, 'learning_rate': 0.00038461538461538467, 'epoch': 6.15}


 67%|██████▋   | 6503/9750 [07:23<03:21, 16.12it/s]

{'loss': 0.0015, 'grad_norm': 0.0019626314751803875, 'learning_rate': 0.0003333333333333333, 'epoch': 6.67}


                                                   
 70%|███████   | 6825/9750 [07:50<02:55, 16.69it/s]

{'eval_loss': 0.10798438638448715, 'eval_accuracy': {'accuracy': 0.9922248803827751}, 'eval_runtime': 7.5606, 'eval_samples_per_second': 221.146, 'eval_steps_per_second': 55.287, 'epoch': 7.0}


 72%|███████▏  | 7003/9750 [08:01<02:44, 16.66it/s]

{'loss': 0.004, 'grad_norm': 5.033796157682957e-10, 'learning_rate': 0.00028205128205128203, 'epoch': 7.18}


 77%|███████▋  | 7503/9750 [08:31<02:17, 16.34it/s]

{'loss': 0.0, 'grad_norm': 0.0010034663137048483, 'learning_rate': 0.0002307692307692308, 'epoch': 7.69}


                                                   
 80%|████████  | 7800/9750 [08:56<01:57, 16.65it/s]

{'eval_loss': 0.10188361257314682, 'eval_accuracy': {'accuracy': 0.9922248803827751}, 'eval_runtime': 7.5384, 'eval_samples_per_second': 221.797, 'eval_steps_per_second': 55.449, 'epoch': 8.0}


 82%|████████▏ | 8003/9750 [09:09<01:45, 16.63it/s]

{'loss': 0.0, 'grad_norm': 6.909541383492979e-09, 'learning_rate': 0.0001794871794871795, 'epoch': 8.21}


 87%|████████▋ | 8503/9750 [09:39<01:14, 16.65it/s]

{'loss': 0.0025, 'grad_norm': 8.607633513292925e-14, 'learning_rate': 0.0001282051282051282, 'epoch': 8.72}


                                                   
 90%|█████████ | 8775/9750 [10:03<00:58, 16.62it/s]

{'eval_loss': 0.11799599975347519, 'eval_accuracy': {'accuracy': 0.9922248803827751}, 'eval_runtime': 7.7561, 'eval_samples_per_second': 215.573, 'eval_steps_per_second': 53.893, 'epoch': 9.0}


 92%|█████████▏| 9003/9750 [10:17<00:45, 16.48it/s]

{'loss': 0.0001, 'grad_norm': 5.904910588272472e-11, 'learning_rate': 7.692307692307693e-05, 'epoch': 9.23}


 97%|█████████▋| 9503/9750 [10:48<00:15, 16.38it/s]

{'loss': 0.0, 'grad_norm': 1.1691355661369585e-09, 'learning_rate': 2.564102564102564e-05, 'epoch': 9.74}


                                                   
100%|██████████| 9750/9750 [11:11<00:00, 16.34it/s]

{'eval_loss': 0.11897236108779907, 'eval_accuracy': {'accuracy': 0.9922248803827751}, 'eval_runtime': 7.809, 'eval_samples_per_second': 214.112, 'eval_steps_per_second': 53.528, 'epoch': 10.0}


100%|██████████| 9750/9750 [11:11<00:00, 14.52it/s]

{'train_runtime': 671.4197, 'train_samples_per_second': 58.086, 'train_steps_per_second': 14.521, 'train_loss': 0.019373437700040925, 'epoch': 10.0}





TrainOutput(global_step=9750, training_loss=0.019373437700040925, metrics={'train_runtime': 671.4197, 'train_samples_per_second': 58.086, 'train_steps_per_second': 14.521, 'total_flos': 5241587060736000.0, 'train_loss': 0.019373437700040925, 'epoch': 10.0})

In [18]:
model.to("cuda:0")

print("Trained Model Predictions:")
for text in example_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cuda:0")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    print(text + " - " + id2label[predictions.tolist()])


Trained Model Predictions:
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat... - ham
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's - spam
Nah I don't think he goes to usf, he lives around here though - ham
Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030 - spam
