In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import torch
import peft

In [2]:
os.chdir(r"/Users/ethanhuang/Desktop/scam_detection")

In [3]:
data_df = pd.read_csv('spam.csv', encoding='latin1')
data_df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
data_df = data_df.rename(columns={'v1': 'label', 'v2': 'text'})

data_df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
from sklearn import model_selection
train_df , val_df = model_selection.train_test_split(data_df, test_size=0.3, random_state=42)

In [50]:
import datasets
train_ds = datasets.Dataset.from_pandas(train_df)
val_ds = datasets.Dataset.from_pandas(val_df)

dataset_dict = datasets.DatasetDict({'train': train_ds, 'val': val_ds})

for split in dataset_dict.keys():
    dataset_dict[split] = dataset_dict[split].remove_columns("__index_level_0__")

dataset_dict


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 3900
    })
    val: Dataset({
        features: ['label', 'text'],
        num_rows: 1672
    })
})

In [56]:

def label_to_int(label):
    if label == 'spam':
        return 1
    elif label == 'ham':
        return 0
def convert_labels(example):
    example['label'] = label_to_int(example['label'])
    return example

dataset_dict = dataset_dict.map(convert_labels)

print(dataset_dict['train']['label'][:5])


[A
[A
[A
[A
[A

[1, 0, 0, 0, 0]


In [57]:
np.array(dataset_dict['train']['label']).sum()/len(dataset_dict['train']['label'])

0.13538461538461538

In [58]:
import transformers
model_checkpoint = "distilbert-base-uncased"
id2label = {0: "ham", 1: "spam"}
label2id = {"ham": 0, "spam": 1}

config = transformers.AutoConfig.from_pretrained(
    model_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

model = transformers.AutoModelForSequenceClassification.from_pretrained(model_checkpoint, config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [60]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=False)

def tokenize_function(examples):
    return tokenizer(examples["text"], return_tensors="pt", padding="max_length", truncation=True, max_length=512)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
print(type(dataset_dict))

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
tokenized_datasets


<class 'datasets.dataset_dict.DatasetDict'>



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 3900
    })
    val: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1672
    })
})

In [61]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [62]:
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions,references=labels)}

In [63]:
example_list = ["Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...", 
                "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 
                "Nah I don't think he goes to usf, he lives around here though",
                "Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030"]

print("Untrained Model Predictions:")
for text in example_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    print(text + " - " + id2label[predictions.tolist()])
    

Untrained Model Predictions:
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat... - ham
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's - ham
Nah I don't think he goes to usf, he lives around here though - ham
Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030 - ham


In [64]:
model.train()
model.gradient_checkpointing_enable()
model = peft.prepare_model_for_kbit_training(model)


In [65]:
config = peft.LoraConfig(
    r=4,
    lora_alpha=32,
    target_modules=["q_lin"],
    lora_dropout=0.05,
    bias = "none",
    task_type="SEQ_CLS"
)

model = peft.get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [66]:
lr = 1e-3
batch_size = 4
num_epochs = 10

training_args = transformers.TrainingArguments(
    output_dir = model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)




In [67]:
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
  0%|          | 0/9750 [10:48<?, ?it/s]
  0%|          | 0/9750 [10:14<?, ?it/s]
  0%|          | 20/9750 [01:14<9:45:07,  3.61s/it] 

KeyboardInterrupt: 