In [1]:
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import pathlib
import os


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
dataset_root_path = 'data/literature/literotica/'

In [3]:
labels = os.listdir(dataset_root_path)
ids = range(len(labels))

label2id = dict(zip(labels, ids))
id2label = dict(zip(ids, labels))

print(label2id)
print(id2label)

{'gay': 0, 'bdsm': 1, 'straight': 2, 'anal': 3, 'group': 4, 'sci_fi': 5, 'mature': 6, 'non_human': 7, 'non_consent': 8, 'fetish': 9, 'lesbian': 10, 'romance': 11}
{0: 'gay', 1: 'bdsm', 2: 'straight', 3: 'anal', 4: 'group', 5: 'sci_fi', 6: 'mature', 7: 'non_human', 8: 'non_consent', 9: 'fetish', 10: 'lesbian', 11: 'romance'}


In [23]:
def get_all_stories_paths(dataset_root_path):
    dataset_folder = pathlib.Path(dataset_root_path)
    return dataset_folder.rglob("*.txt")

In [24]:
def create_json_dataset():
    all_stories = []
    
    all_txt_paths = get_all_stories_paths(dataset_root_path)
    
    for story_path in tqdm(all_txt_paths):
        with open(story_path, 'r') as file:
            data = file.read()
            length = len(data.split())
            if length < 2500:
                all_stories.append(
                    {
                        "text": data,
                        "label": label2id[str(story_path).split('/')[3]]
                    }
                )
    
    df = pd.DataFrame.from_records(all_stories)
    
    df_train, df_test = train_test_split(df, test_size=0.20, random_state=42)
    
    df_train.to_json(f"literotica-stories-train.jsonl", orient="records", lines=True)
    df_test.to_json(f"literotica-stories-test.jsonl", orient="records", lines=True)

In [25]:
create_json_dataset()

0it [00:00, ?it/s]

In [4]:
from datasets import load_dataset


data_files = {
    "train": "literotica-stories-train.jsonl", 
    "test": "literotica-stories-test.jsonl"
}

stories_dataset = load_dataset("json", data_files=data_files)
stories_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 98552
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 24638
    })
})

### Prepare data

In [5]:
from transformers import AutoTokenizer

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [7]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [8]:
tokenized_stories = stories_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/24638 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Load metric

In [61]:
import evaluate

precision_metric = evaluate.load("precision", average="weighted")
recall_metric = evaluate.load("recall", average="weighted")
accuracy_metric = evaluate.load("accuracy")
    


# clf_metrics = evaluate.combine(["accuracy", "precision", "recall"])

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    results = {}
    results.update(precision_metric.compute(predictions=predictions, references = labels, average="weighted"))
    results.update(recall_metric.compute(predictions=predictions, references = labels, average="weighted"))
    results.update(accuracy_metric.compute(predictions=predictions, references = labels))
    return results

### Train model

In [67]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=12, id2label=id2label, label2id=label2id
).to("cuda")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
training_args = TrainingArguments(
    output_dir="genre_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    dataloader_num_workers=12,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_stories["train"],
    eval_dataset=tokenized_stories["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [69]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,Accuracy
1,0.9762,0.956019,0.662885,0.662351,0.662351
2,0.8583,0.909576,0.679079,0.681914,0.681914
3,0.722,0.919096,0.684295,0.683375,0.683375
4,0.6412,0.945616,0.67873,0.682604,0.682604


TrainOutput(global_step=12320, training_loss=0.8325064572420987, metrics={'train_runtime': 2756.7712, 'train_samples_per_second': 142.996, 'train_steps_per_second': 4.469, 'total_flos': 5.222902093302989e+16, 'train_loss': 0.8325064572420987, 'epoch': 4.0})