In [None]:
import os
import re
import jsonlines
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset, load_metric
import numpy as np
import torch


In [None]:


def read_and_clean_data(file_path, encoding='latin-1'):
    """Read a text file and return its cleaned, lower-cased sentences.

    The raw text is normalised by a fixed sequence of regex substitutions
    (numbering and section markers, ellipses, quotes, colons, digits), then
    broken into one sentence per line after '.', '?' or '!'.

    Args:
        file_path: Path of the text file to read.
        encoding: Encoding used to decode the file (default 'latin-1').

    Returns:
        A list of non-empty, stripped, lower-cased sentence strings.
    """
    with open(file_path, encoding=encoding) as f:
        text = f.read()
    # Drop "1.2"-style numbering at the start of a line and "§ 3." section
    # markers. The section sign is written as an escape so the pattern does not
    # depend on how this source file itself is encoded (the previous literal
    # was mis-encoded and could never match a real section sign).
    text = re.sub(r'^\d+\.\d+\s*|\u00a7 \d+\.\s*', '', text, flags=re.M)
    # Collapse an ellipsis into a single period.
    text = re.sub(r'\.\.\.', '.', text)
    # Remove straight and curly double quotes (curly ones given as escapes for
    # the same reason as the section sign above).
    text = re.sub(r'["\u201c\u201d]', '', text)
    # Remove colons.
    text = re.sub(r':', '', text)
    # Remove every remaining run of digits.
    text = re.sub(r'[0-9]+', '', text)
    # Put a newline after each sentence terminator that is followed by whitespace.
    text = re.sub(r'([.?!])\s+', r'\1\n', text)
    # Remove a period at the very start of the text ('^' is start-of-text here,
    # no re.M) and any period+whitespace followed by whitespace or end of text.
    text = re.sub(r'^[\.]|\.\s(?=$|\s)', '', text)
    # Spell out ampersands.
    text = re.sub(r'&', 'and', text)
    # Collapse any run of two or more whitespace characters into one space.
    text = re.sub(r'\s{2,}', ' ', text)
    text = text.lower()
    sentences = re.split(r'\n', text)
    # Keep only lines that still contain something after stripping.
    cleaned_sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return cleaned_sentences

In [None]:
# Directory holding the two raw corpora; each file is cleaned into a list of sentences.
folder_path = '/home/nlplab/coffee/NLP_final/data/'
buddhantao_path = os.path.join(folder_path, 'buddhantao.txt')
notbuddhantao_path = os.path.join(folder_path, 'notbuddhantao.txt')
buddhantao_text = read_and_clean_data(buddhantao_path)
notbuddhantao_text = read_and_clean_data(notbuddhantao_path)


In [None]:

# Label 0 = sentences from buddhantao.txt, label 1 = sentences from notbuddhantao.txt.
# Records keep the original order: all label-0 rows first, then all label-1 rows.
labelled_corpora = ((0, buddhantao_text), (1, notbuddhantao_text))
data_list = [
    {'label': label, 'text': sentence}
    for label, corpus in labelled_corpora
    for sentence in corpus
]



In [None]:

# Dump every labelled record as one JSON object per line.
with jsonlines.open(os.path.join(folder_path, 'data3.jsonlines'), mode='w') as writer:
    writer.write_all(data_list)


In [None]:


# Load the JSON-lines file written above; the records are read from its "train" split below.
jsonl_path = os.path.join(folder_path, 'data3.jsonlines')
dataset = load_dataset("json", data_files=jsonl_path)


Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Hold out 1000 examples for evaluation.
# Both splits are written back so the held-out rows are removed from the
# training set. Assigning only "test" would leave dataset["train"] as the full
# corpus, so the model would be evaluated on sentences it was trained on.
# The fixed seed keeps the split reproducible across kernel restarts.
# NOTE: re-running this cell without re-running the load cell splits the
# already reduced train set again.
dataset_train_test = dataset["train"].train_test_split(test_size=1000, seed=42)
dataset["train"] = dataset_train_test["train"]
dataset["test"] = dataset_train_test["test"]


In [None]:


# Tokenizer and classifier come from the same checkpoint; num_labels=2 matches
# the two labels (0 and 1) used when building data_list.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:


def preprocess_function(examples):
    """Tokenise the "text" field of a batch of examples.

    Sequences are truncated by the tokenizer; no padding is applied here.

    Args:
        examples: Mapping with a "text" entry, as passed by `dataset.map`.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded



In [None]:
# Padding is left to the collator, since preprocess_function only truncates.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Tokenise every split in batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/9365 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Accuracy metric object; compute_metrics calls its .compute() method.
# NOTE(review): the cell output echoes this line, which looks like a
# deprecation warning for load_metric — confirm against the installed
# `datasets` version before upgrading it.
accuracy = load_metric("accuracy")


  accuracy = load_metric("accuracy")


In [None]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into an accuracy score.

    Args:
        eval_pred: A (predictions, labels) pair; the predictions are reduced
            with argmax along axis 1 to obtain one class id per example.

    Returns:
        The result of the module-level `accuracy` metric's `compute` call.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=predicted_classes, references=references)
    return result


In [None]:

# Training settings collected in one mapping so they can be inspected or
# tweaked before the TrainingArguments object is built.
# Evaluation and checkpointing both happen once per epoch, and the best
# checkpoint is reloaded when training finishes.
training_config = {
    "output_dir": "my_good_model",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
}
training_args = TrainingArguments(**training_config)


In [None]:

# Training
# Wire the model, tokenised splits, collator and metric function into a
# Trainer, then run fine-tuning. trainer.train() is the last expression so
# its return value is displayed as the cell output.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.020689,0.992
2,0.034400,0.001624,0.999
3,0.034400,0.000257,1.0




TrainOutput(global_step=879, training_loss=0.024907504874827242, metrics={'train_runtime': 138.3909, 'train_samples_per_second': 203.012, 'train_steps_per_second': 6.352, 'total_flos': 510934000927056.0, 'train_loss': 0.024907504874827242, 'epoch': 3.0})

In [None]:

# Save the fine-tuned model and its tokenizer into the same directory so they
# can be reloaded together from a single path.
saved_model_dir = "/home/nlplab/coffee/NLP_final/data/my_good_model"
model.save_pretrained(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)

('/home/nlplab/coffee/NLP_final/data/my_good_model/tokenizer_config.json',
 '/home/nlplab/coffee/NLP_final/data/my_good_model/special_tokens_map.json',
 '/home/nlplab/coffee/NLP_final/data/my_good_model/vocab.txt',
 '/home/nlplab/coffee/NLP_final/data/my_good_model/added_tokens.json',
 '/home/nlplab/coffee/NLP_final/data/my_good_model/tokenizer.json')

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:

# Reload the saved artefacts from disk; this rebinds `model` and `tokenizer`
# to the saved copies used for the inference cells below.
model_path = "/home/nlplab/coffee/NLP_final/data/my_good_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)


In [None]:

# Example sentence to classify.
text = "it would be a skillful bodily act with happy consequences, happy results, then any bodily act of that sort is fit for you to do."


# Tokenise into PyTorch tensors. truncation=True mirrors preprocess_function,
# so an over-long input is clipped to the model's limit instead of failing in
# the forward pass; short inputs such as this one are unaffected.
encoded_input = tokenizer(text, truncation=True, return_tensors="pt")


In [None]:

# Inference only: run the forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**encoded_input)
# Raw class scores for the input.
logits = outputs.logits


In [None]:

# Index of the highest-scoring class along the last axis, then unwrapped from
# the single-element tensor into a plain Python number.
predictions = logits.argmax(dim=-1)
predicted_label = predictions.item()


In [None]:

# Human-readable names for the two class ids; the ids follow the labels used
# when building data_list (0 = buddhantao.txt, 1 = notbuddhantao.txt).
id2label = {0: "It's Buddha & Tao!", 1: "It's NOT Buddha & Tao"}
predicted_label_str = id2label[predicted_label]

# Report the prediction for the example sentence.
print(f"Predicted label: {predicted_label_str}")

Predicted label: It's Buddha & Tao!
