In [34]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import json

In [35]:
def jsonl_to_dataframe(jsonl_file):
    data = []

    # Open the JSONL file and load line by line
    with open(jsonl_file, 'r') as file:
        for line in file:
            data.append(json.loads(line))

    # Convert list of dictionaries to a Pandas DataFrame
    df = pd.DataFrame(data)
    return df

df = jsonl_to_dataframe("/content/jutsus.jsonl")
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,Adamantine Sealing Chains: Spiral Formation,"Hiden, Ninjutsu, Fūinjutsu, Barrier Ninjutsu, ...",Kushina uses her chains to form a barrier whil...
1,Adamantine Power: Acala,"Kekkei Genkai, Ninjutsu, Taijutsu",Hashirama kicks the opponent away and raises s...
2,Adamantine Prison Wall,"Ninjutsu, Clone Techniques, Bukijutsu","After using Transformation: Adamantine Staff, ..."
3,Adamantine Seal: Monkey Yang Suppression,"Ninjutsu, Fūinjutsu, Cooperation Ninjutsu","After placing fūinjutsu tags in an area, the u..."
4,Acrobat,"Taijutsu, Kenjutsu","The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ..."


In [36]:
from bs4 import BeautifulSoup
class Cleaner():
    def __init__(self):
        pass

    def put_line_breaks(self, text):
        return text.replace("<\p>", "<\p>\n")

    def remove_html_tags(self, text):
        clean_text = BeautifulSoup(text, "lxml").text
        return clean_text

    def clean(self, text):
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()
        return text

In [37]:
def get_super_classification(jutsu):
  if 'Genjutsu' in jutsu:
    return 'Genjutsu'
  elif 'Taijutsu' in jutsu:
    return 'Taijutsu'
  else:
    return 'Ninjutsu'

df['Super Classification'] = df['jutsu_type'].apply(lambda x: get_super_classification(x))
df = df[['jutsu_description','Super Classification']]

In [38]:
cleaner = Cleaner()
df['text_cleaned'] = df['jutsu_description'].apply(cleaner.clean)

  clean_text = BeautifulSoup(text, "lxml").text


In [39]:
df = df[['text_cleaned','Super Classification']]`

In [111]:
df[df['Super Classification'] == 'Genjutsu']['text']

Unnamed: 0,text
13,The user uses a genjutsu where the whole place...
28,"Using a nearby cactus' pollen, the user places..."
35,This genjutsu is performed with a flute. Becau...
91,This article is about the genjutsu used by Ita...
94,A variation of the Tsukuyomi used by Nanashi U...
...,...
2707,Ino creates an illusion in which countless but...
2719,The user places a hallucinatory darkness on a ...
2777,The user traps their target inside a genjutsu ...
2802,"Itachi traps the opponent in a Tsukuyomi, caus..."


In [112]:
df['text'].loc[2719]

"The user places a hallucinatory darkness on a target's eyesight, causing them to see nothing but black; Tō no Sho likens the sensation to being at the bottom of a deep hole. Because the target cannot see, they are very vulnerable to attack. Although this handicap is dangerous even to the likes of the Third Hokage,[1] it is not insurmountable, as the Third is able to sense attacks to try to defend himself and smell his attackers in order to stage a counterattack. When the Third finally captures the user and begins removing their soul, the darkness disperses, something that Orochimaru, an onlooker, immediately notices.[2]"

In [44]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['Jutsu Labels'] = le.fit_transform(df['Super Classification'])
df.head()

df.shape

(2920, 4)

In [42]:
df.rename(columns={'Jutsu Labels':'labels','text_cleaned':'text'}, inplace=True)

In [None]:
## Since the data is highly skewed so we have to assign weights to the labels
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Jutsu Labels'])



In [59]:
df['labels'].value_counts()

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
1,2193
2,626
0,101


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

model = 'distilbert/distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model)

def preprocess_function(tokenizer,examples):


  # Flattening if necessary
  # flat_text_cleaned = [item for sublist in examples['text_cleaned'] for item in sublist] if isinstance(examples['text_cleaned'], list) else examples['text_cleaned']

  return tokenizer(examples['text_cleaned'],truncation=True, padding = True,max_length = 512 )

# def get_tokenized_data(data):
#   return tokenizer(data['jutsu_description'], padding=True, truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:

In [None]:
## Create the hugging face dataset
from datasets import Dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

train_data = train_dataset.map(lambda examples: preprocess_function(tokenizer, examples),
                                            batched=True)
test_data = test_dataset.map(lambda examples: preprocess_function(tokenizer, examples),
                                            batched=True)

## tokenize the data
# train_dataset = train_dataset.map(get_tokenized_data, batched=True)
# test_dataset = test_dataset.map(get_tokenized_data, batched=True)

Map:   0%|          | 0/2336 [00:00<?, ? examples/s]

Map:   0%|          | 0/584 [00:00<?, ? examples/s]

In [None]:
token_lengths = [len(input_ids) for input_ids in test_data['input_ids']]

# Display the lengths
for index, length in enumerate(token_lengths):
    print(f"Length of tokenized input {index + 1}: {length}")

Length of tokenized input 1: 512
Length of tokenized input 2: 512
Length of tokenized input 3: 512
Length of tokenized input 4: 512
Length of tokenized input 5: 512
Length of tokenized input 6: 512
Length of tokenized input 7: 512
Length of tokenized input 8: 512
Length of tokenized input 9: 512
Length of tokenized input 10: 512
Length of tokenized input 11: 512
Length of tokenized input 12: 512
Length of tokenized input 13: 512
Length of tokenized input 14: 512
Length of tokenized input 15: 512
Length of tokenized input 16: 512
Length of tokenized input 17: 512
Length of tokenized input 18: 512
Length of tokenized input 19: 512
Length of tokenized input 20: 512
Length of tokenized input 21: 512
Length of tokenized input 22: 512
Length of tokenized input 23: 512
Length of tokenized input 24: 512
Length of tokenized input 25: 512
Length of tokenized input 26: 512
Length of tokenized input 27: 512
Length of tokenized input 28: 512
Length of tokenized input 29: 512
Length of tokenized inp

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [None]:
class MultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        # Tokenize the text
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'  # Return PyTorch tensors
        )

        labels = torch.FloatTensor(label)

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': labels  # Convert labels to float for multi-label classification
        }

# Step 5: Create datasets and dataloaders
max_length = 512
train_dataset = MultiLabelDataset(
    texts=train_dataset['text_cleaned'],
    labels=train_dataset['Jutsu Labels'],
    tokenizer=tokenizer,
    max_length=max_length
)

test_dataset = MultiLabelDataset(
    texts=test_dataset['text_cleaned'],
    labels=test_dataset['Jutsu Labels'],
    tokenizer=tokenizer,
    max_length=max_length
)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2)


In [None]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7ac061418850>

In [18]:
import torch
import huggingface_hub

## login to huggingface hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
#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

In [45]:
### Create the label encoder dictionary
df['labels'] = df['labels'].astype('int')
label_dict = dict(zip(df['labels'].unique(), df['Super Classification'].unique()))
label_dict

{1: 'Ninjutsu', 2: 'Taijutsu', 0: 'Genjutsu'}

In [None]:
le.inverse_transform([0,1,2])


array(['Genjutsu', 'Ninjutsu', 'Taijutsu'], dtype=object)

In [None]:
## Creat a custom class weight function to deal with the imbalanced dataset
from sklearn.utils.class_weight import compute_class_weight

def get_class_weights(df):
  class_weights = compute_class_weight('balanced', classes=np.array(sorted(df['Jutsu Labels'].unique())), y= df['Jutsu Labels'].tolist())
  class_weights_2 = dict(zip(df['Jutsu Labels'].unique().tolist(), class_weights))
  return class_weights

all_data = pd.concat([df_train,df_test])

class_weights = get_class_weights(all_data)
class_weights

array([9.6369637 , 0.44383645, 1.55484558])

In [None]:
class_weights_3 = []
class_weights_3.append(class_weights[2])
class_weights_3.append(class_weights[0])
class_weights_3.append(class_weights[1])
class_weights_3

class_weights = torch.tensor(class_weights_3, dtype=torch.float)

In [None]:
label_dict = {int(k): v for k, v in label_dict.items()}
label_dict

{1: 'Ninjutsu', 2: 'Taijutsu', 0: 'Genjutsu'}

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [66]:
# for label_list in df['labels']:
#   print(label_list)

In [14]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.metrics import f1_score



# Use MultiLabelBinarizer to convert to binary format
mlb = MultiLabelBinarizer()
df['labels'] = df['labels'].apply(lambda x: str(x))  # Convert to string
mlb.fit(df['labels'])  # Fit on the labels
num_classes = len(mlb.classes_)  # Get the number of classes

# Compute class weights
class_counts = np.zeros(num_classes)
for label_list in df['labels']:
    for label in label_list:
        class_counts[int(label) - 1] += 1

class_weights = class_counts.sum() / (num_classes * class_counts)
class_weights = torch.FloatTensor(class_weights).to(device)  # Convert to FloatTensor

# Step 2: Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Step 3: Initialize the AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Step 4: Create a custom dataset class
class MultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length, num_classes):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_classes = num_classes  # Store the number of classes

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        # Tokenize the text
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Create a binary vector for the labels
        labels = [0] * self.num_classes  # Initialize a binary vector of zeros
        for l in label:  # Assuming label is a list of integers
            labels[int(l[0]) - 1] = 1  # Set the appropriate indices to 1

        labels = torch.FloatTensor(labels)  # Convert to FloatTensor

        return {
            'input_ids': encoding['input_ids'].flatten().to(device),
            'attention_mask': encoding['attention_mask'].flatten().to(device),
            'labels': labels.to(device)  # Return labels as a 1D tensor
        }

# Step 5: Create datasets and dataloaders
max_length = 512
train_dataset = MultiLabelDataset(
    texts=train_df['text'].tolist(),
    labels=train_df['labels'].tolist(),
    tokenizer=tokenizer,
    max_length=max_length,
    num_classes=3  # Pass the number of classes
)

test_dataset = MultiLabelDataset(
    texts=test_df['text'].tolist(),
    labels=test_df['labels'].tolist(),
    tokenizer=tokenizer,
    max_length=max_length,
    num_classes=3  # Pass the number of classes
)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2)

# Step 6: Initialize the model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_classes)
model = model.to(device)

# Step 7: Set up the optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Step 8: Define a custom training loop
def custom_loss_function(outputs, labels, class_weights):
    # Use binary cross-entropy loss with logits
    bce_loss = F.binary_cross_entropy_with_logits(outputs, labels, pos_weight=class_weights)
    return bce_loss

# Step 9: Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask).logits
        loss = custom_loss_function(outputs, labels, class_weights)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

# Step 10: Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask).logits
        preds = torch.sigmoid(outputs).cpu().numpy()  # Sigmoid for multi-label
        all_preds.extend(preds)
        all_labels.extend(batch['labels'].cpu().numpy())

# Step 11: Calculate metrics
preds = (np.array(all_preds) > 0.5).astype(int)  # Binarize predictions
accuracy = (preds == all_labels).mean()  # Accuracy calculation
f1 = f1_score(all_labels, preds, average='weighted')  # F1 score calculation

print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/3: 100%|██████████| 1168/1168 [02:09<00:00,  9.03it/s]


Epoch 1/3, Loss: 0.4364


Training Epoch 2/3: 100%|██████████| 1168/1168 [02:08<00:00,  9.09it/s]


Epoch 2/3, Loss: 0.2054


Training Epoch 3/3: 100%|██████████| 1168/1168 [02:08<00:00,  9.11it/s]


Epoch 3/3, Loss: 0.1169


NameError: name 'f1_score' is not defined

In [20]:
from transformers import AutoTokenizer

# Save the trained model
model.save_pretrained("/content/roberta_multi_text_classifier_model")  # Replace with your model directory
tokenizer.save_pretrained("/content/roberta_multi_text_classifier_tokenizer")  # Save tokenizer

# Upload to Hugging Face Hub
from huggingface_hub import HfApi

# Specify your model repo name, replace with your desired model name
repo_name = "devSubho51347/roberta_multi_label_classifier"  # e.g., "your_username/multi-label-model"

# Create a repo on the hub
api = HfApi()
# api.create_repo(repo_id=repo_name, private=False)  # Set private=True if you want a private repo

# Push the model
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

print(f"Model uploaded to the Hugging Face Hub at: {repo_name}")


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Model uploaded to the Hugging Face Hub at: devSubho51347/roberta_multi_label_classifier


In [113]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the trained model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("devSubho51347/roberta_multi_label_classifier", num_labels=3)
# model.load_state_dict(torch.load("/content/roberta_multi_text_classifier_model/model.safetensors"))  # Load your trained model weights
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


class_mapping = {0: 1, 1: 2, 2: 3}

converted_labels = []

# Prediction function
def predict(texts):
    model.eval()  # Set model to evaluation mode
    predictions = []

    with torch.no_grad():
        for text in texts:
            # Tokenize the input text
            encoding = tokenizer(
                text,
                add_special_tokens=True,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            input_ids = encoding['input_ids'].to(device)  # Move to the same device
            attention_mask = encoding['attention_mask'].to(device)  # Move to the same device

            # Get model predictions
            outputs = model(input_ids, attention_mask=attention_mask).logits
            preds = torch.sigmoid(outputs).cpu().numpy()  # Move to CPU for numpy conversion
            print(preds)
            # Binarize predictions
            binary_preds = (preds > 0.5).astype(int)
            predictions.append(binary_preds[0])  # Append the first item for each prediction

    return predictions

# Example usage
texts_to_predict = [
    "The user places a hallucinatory darkness on a target's eyesight, causing them to see nothing but black; Tō no Sho likens the sensation to being at the bottom of a deep hole. Because the target cannot see, they are very vulnerable to attack. Although this handicap is dangerous even to the likes of the Third Hokage,[1] it is not insurmountable, as the Third is able to sense attacks to try to defend himself and smell his attackers in order to stage a counterattack. When the Third finally captures the user and begins removing their soul, the darkness disperses, something that Orochimaru, an onlooker, immediately notices.[2]"
]

predicted_labels = predict(texts_to_predict)

# Display predicted outputs
for text, pred in zip(texts_to_predict, predicted_labels):
    print(f"Text: {text}\nPredicted labels: {pred}\n")


[[0.10411568 0.06082911 0.9850311 ]]
Text: The user places a hallucinatory darkness on a target's eyesight, causing them to see nothing but black; Tō no Sho likens the sensation to being at the bottom of a deep hole. Because the target cannot see, they are very vulnerable to attack. Although this handicap is dangerous even to the likes of the Third Hokage,[1] it is not insurmountable, as the Third is able to sense attacks to try to defend himself and smell his attackers in order to stage a counterattack. When the Third finally captures the user and begins removing their soul, the darkness disperses, something that Orochimaru, an onlooker, immediately notices.[2]
Predicted labels: [0 0 1]



In [114]:
preds.shape

(584, 3)

In [115]:
class_mapping = {0: 'Ninjutsu', 1: 'Taijutsu', 2: 'Genjutsu'}
for ele in predicted_labels:
  label_indices = np.where(pred == 1)[0]
  original_labels = [class_mapping[idx] for idx in label_indices]
  print(original_labels)

# label_dict[original_labels[0]]

['Genjutsu']


In [57]:
label_dict

{1: 'Ninjutsu', 2: 'Taijutsu', 0: 'Genjutsu'}

In [100]:
ff = le.transform(['Taijutsu'])
ff

array([2])

In [104]:
dd = mlb.transform(['0'])
dd

array([[1, 0, 0]])

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import numpy as np
import evaluate

metric = evaluate.load('accuracy')
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification
from torch import nn
import gc
# model = 'distilbert/distilbert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(model,
                                                                   num_labels= 3,
                                                                   id2label= label_dict,
                                                                   )
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    push_to_hub=True,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset = train_data,
    eval_dataset = test_data,
    tokenizer = tokenizer,
    data_collator=data_collator,
    compute_metrics= compute_metrics
)

trainer.set_device(device)
trainer.set_class_weights(class_weights)

trainer.train()

# Flush Memory
del trainer,model
gc.collect()

if device == 'cuda':
    torch.cuda.empty_cache()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  loss_fct = nn.CrossEntropyLoss(weight = torch.tensor(self.class_weights, dtype=torch.float).to(device=self.device))


AttributeError: 'NoneType' object has no attribute 'view'

In [None]:
## Define and Train the model
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification
from torch import nn

model = 'distilbert/distilbert-base-uncased'
final_model = AutoModelForSequenceClassification.from_pretrained(model, num_labels=3, id2label=label_dict)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# class CustomDataCollatorWithPadding(DataCollatorWithPadding):
#     def __call__(self, features):
#         # First, pad the input sequences as usual
#         batch = super().__call__(features)

#         # Now, make sure 'labels' are in the batch and are tensors
#         if 'labels' in features[0]:
#             batch['labels'] = torch.tensor([f['labels'] for f in features])

#         return batch


# # Use the custom data collator
# data_collator = CustomDataCollatorWithPadding(tokenizer=tokenizer)


training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    remove_unused_columns=False,
    push_to_hub=True,
)

# def compute_loss(final_model, inputs, return_outputs=False):
#         # Forward pass
#         labels = inputs.get("labels")
#         outputs = final_model(**inputs)
#         logits = outputs.get("logits")

#         # Define weighted loss
#         loss_fct = nn.CrossEntropyLoss(weight= class_weights.to(logits.device))
#         loss = loss_fct(logits, labels)

#         return (loss, outputs) if return_outputs else loss

### Create Custom Trainer to incorporate the default loss weights

class CustomTrainer(Trainer):

  def compute_loss(self,model,inputs,return_outputs=False):


      labels = inputs.get("Jutsu Labels")

      # Forward Pass
      outputs = model(**inputs)
      logits = outputs.get("logits")
      logits = logits.float()

      # Compute Custom Loss
      loss_fct = nn.CrossEntropyLoss(weight = torch.tensor(self.class_weights, dtype=torch.float).to(device=self.device))
      loss = loss_fct(logits.view(-1, 3 ),labels.view(-1))
      return (loss,outputs) if return_outputs else loss

  def set_class_weights(self,class_weights):
        self.class_weights = class_weights

  def set_device(self,device):
        self.device = device


# trainer = CustomTrainer(
#     model=final_model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics= compute_metrics,
# )

# trainer.set_device(device)
# trainer.set_class_weights(class_weights_3)

# trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
data_collator

In [None]:
class_weights.values()
