In [1]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import re
from transformers import AutoTokenizer, DistilBertTokenizer, DistilBertForSequenceClassification, AutoModelForSequenceClassification, Trainer, TrainingArguments, DistilBertConfig
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
from nltk.corpus import wordnet
import random
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.nn import CrossEntropyLoss
import nltk
from transformers import DistilBertTokenizer
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("dataset/combined_GH_HF_manual.csv")

# Function to clean text columns
def clean_text(text):
    # Remove non-ASCII characters (corrupted/malformed characters)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    return text.strip()

# Apply cleaning function to relevant columns
df['message'] = df['message'].apply(clean_text)
df = df[df['message'].notna() & (df['message'] != '') & (df['message'].str.split().str.len() > 1)]
df["label"] = df["label"].str.lower()

number_of_labels = df["label"].value_counts()
number_of_labels

label
external documentation       501
model structure              195
project metadata             141
sharing                      127
preprocessing                 61
training infrastructure       55
validation infrastructure     52
input data                    35
internal documentation        35
pipeline performance          33
parameter tuning              31
add dependency                19
output data                   18
update dependency             17
remove dependency             15
Name: count, dtype: int64

In [6]:
label_encoder = LabelEncoder()
label_encoder.fit(df["label"])  # Fit on the original labels

df['encoded_labels'] = label_encoder.fit_transform(df['label'])  # Encode the labels as integers
label_mapping = {index: label for index, label in enumerate(label_encoder.classes_)}
print(label_mapping)

# Now split the dataset
data_texts = df['message'].tolist()  # Your text data
data_labels = df['encoded_labels'].tolist()  # Your encoded integer labels

# Split into Train and Validation (80% train, 20% validation)
train_texts, val_texts, train_labels, val_labels = train_test_split(data_texts, data_labels, test_size=0.2, random_state=0, shuffle=True)

# Further split Train data into Train and Test (99% train, 1% test)
train_texts, test_texts, train_labels, test_labels = train_test_split(train_texts, train_labels, test_size=0.01, random_state=0, shuffle=True)

# Check the number of items in each split
print(f"Training data: {len(train_texts)}")
print(f"Validation data: {len(val_texts)}")
print(f"Test data: {len(test_texts)}")

# Initialize tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Preprocessing function to tokenize the data
def preprocess_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, return_tensors="pt")

# Tokenize the train, validation, and test data
train_encodings = preprocess_function(train_texts)
val_encodings = preprocess_function(val_texts)
test_encodings = preprocess_function(test_texts)

# Check the lengths of the encoded data
print(len(train_encodings["input_ids"]))  # Should match len(train_texts)
print(len(val_encodings["input_ids"]))    # Should match len(val_texts)
print(len(test_encodings["input_ids"]))   # Should match len(test_texts)

# Now create the datasets with integer labels
train_dataset = [{
    "input_ids": enc, 
    "attention_mask": train_encodings["attention_mask"][i], 
    "labels": torch.tensor(train_labels[i])  # Ensure labels are a tensor (integers)
} for i, enc in enumerate(train_encodings["input_ids"])]

val_dataset = [{
    "input_ids": enc, 
    "attention_mask": val_encodings["attention_mask"][i], 
    "labels": torch.tensor(val_labels[i])  # Ensure labels are a tensor (integers)
} for i, enc in enumerate(val_encodings["input_ids"])]

test_dataset = [{
    "input_ids": enc, 
    "attention_mask": test_encodings["attention_mask"][i], 
    "labels": torch.tensor(test_labels[i])  # Ensure labels are a tensor (integers)
} for i, enc in enumerate(test_encodings["input_ids"])]

# Check the first tokenized data example from the train dataset
print(train_dataset[0])

{0: 'add dependency', 1: 'external documentation', 2: 'input data', 3: 'internal documentation', 4: 'model structure', 5: 'output data', 6: 'parameter tuning', 7: 'pipeline performance', 8: 'preprocessing', 9: 'project metadata', 10: 'remove dependency', 11: 'sharing', 12: 'training infrastructure', 13: 'update dependency', 14: 'validation infrastructure'}
Training data: 1092
Validation data: 277
Test data: 12
1092
277
12
{'input_ids': tensor([  101, 10651,  3808,  1035,  3556,  1012,  1052,  2100,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,

In [7]:
import optuna
import numpy as np
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define the compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=1
    )
    acc = accuracy_score(labels, predictions)

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Define the objective function for Optuna optimization
def objective(trial):
    # Suggest hyperparameters
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 2e-5)
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32])
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-4, 0.1)  # Now tuning weight decay

    # Reinitialize model for each trial to avoid contamination
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=15)

    # Set training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,  # Ensure eval batch size matches
        num_train_epochs=7,
        weight_decay=weight_decay,
        evaluation_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        save_strategy="epoch",  # Save best model per epoch
        load_best_model_at_end=True,  # Load the best model automatically
        metric_for_best_model="eval_loss",
        greater_is_better=False,  # We want lower validation loss
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # Stop if no improvement in 2 epochs
    )

    # Train and return validation loss
    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results['eval_loss']

# Optimize the hyperparameters
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=2)

# Output the best hyperparameters found
print("Best hyperparameters:", study.best_params)


[I 2025-03-08 23:12:57,737] A new study created in memory with name: no-name-20a03cdb-950e-4746-8db8-01bb44690acb
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 2e-5)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-4, 0.1)  # Now tuning weight decay
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.682,1.488799,0.599251,0.789984,0.599251,0.524755
2,1.407,1.272926,0.651685,0.783567,0.651685,0.587409
3,1.141,1.204599,0.64794,0.733516,0.64794,0.577464
4,1.0349,1.116121,0.681648,0.761606,0.681648,0.622491
5,0.9247,1.085882,0.696629,0.755504,0.696629,0.644554
6,1.0172,1.078118,0.692884,0.767491,0.692884,0.632926
7,0.9442,1.072241,0.689139,0.726281,0.689139,0.633593


[I 2025-03-08 23:37:40,934] Trial 0 finished with value: 1.0722405910491943 and parameters: {'learning_rate': 1.6197048683804554e-05, 'batch_size': 8, 'weight_decay': 0.020932077220795306}. Best is trial 0 with value: 1.0722405910491943.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 2e-5)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-4, 0.1)  # Now tuning weight decay
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.7539,1.582125,0.576779,0.727629,0.576779,0.482579
2,1.4552,1.317827,0.621723,0.727495,0.621723,0.54505
3,1.2238,1.215786,0.640449,0.725154,0.640449,0.574752
4,1.0745,1.141149,0.662921,0.746711,0.662921,0.59595
5,1.0128,1.11975,0.689139,0.758895,0.689139,0.640064
6,0.9981,1.103964,0.677903,0.736976,0.677903,0.616111
7,0.9428,1.09819,0.677903,0.736976,0.677903,0.616111


[I 2025-03-09 00:04:34,798] Trial 1 finished with value: 1.0981897115707397 and parameters: {'learning_rate': 1.4573014364047723e-05, 'batch_size': 8, 'weight_decay': 0.0004745658869563308}. Best is trial 0 with value: 1.0722405910491943.


Best hyperparameters: {'learning_rate': 1.6197048683804554e-05, 'batch_size': 8, 'weight_decay': 0.020932077220795306}


In [11]:
import numpy as np
import torch.nn as nn
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define the compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=1
    )
    acc = accuracy_score(labels, predictions)

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Define the model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=15)

# Set up training arguments with best hyperparameters
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["per_device_train_batch_size"],
    num_train_epochs=10,  # Now set to final training epochs
    weight_decay=best_params["weight_decay"],
    logging_dir="./logs",
    logging_steps=10
)

# Subclass Trainer to remove weighted loss function
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# Instantiate and train with best hyperparameters
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=15)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.6821,1.516419,0.588015,0.785686,0.588015,0.521193
2,1.4161,1.280272,0.640449,0.789482,0.640449,0.585507
3,1.1004,1.199767,0.651685,0.729742,0.651685,0.58337
4,1.0483,1.107124,0.681648,0.775472,0.681648,0.621044
5,0.9453,1.087859,0.692884,0.745484,0.692884,0.64273
6,0.9784,1.064542,0.696629,0.739138,0.696629,0.643155
7,0.922,1.063261,0.692884,0.735425,0.692884,0.639714


TrainOutput(global_step=931, training_loss=1.2256117642373967, metrics={'train_runtime': 1500.0455, 'train_samples_per_second': 4.933, 'train_steps_per_second': 0.621, 'total_flos': 980353511746560.0, 'train_loss': 1.2256117642373967, 'epoch': 7.0})

In [None]:
import matplotlib.pyplot as plt

train_loss = []
eval_loss = []
epochs = []  # Store unique epoch numbers

for log in trainer.state.log_history:
    if "loss" in log and "epoch" in log:
        train_loss.append(log["loss"])
    if "eval_loss" in log and "epoch" in log:
        eval_loss.append(log["eval_loss"])
        epochs.append(log["epoch"])  # Extract the actual epoch number

# Plot the losses
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(train_loss) + 1), train_loss, label="Training Loss", color="blue", linestyle="--")
plt.plot(epochs, eval_loss, label="Validation Loss", color="red", marker="o")

plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training & Validation Loss per Epoch")
plt.legend()
plt.grid()
plt.show()


In [None]:
config = DistilBertConfig.from_pretrained("distilbert-base-uncased", num_labels=15)

# Initialize the tokenizer (use the same tokenizer used for training)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Save the model, tokenizer, and config
trainer.save_model("./final_model")  # Save the model
tokenizer.save_pretrained("./final_model")  # Save the tokenizer
config.save_pretrained("./final_model")  # Save the config

In [None]:
# Use the trainer to make predictions on the test dataset
predictions = trainer.predict(test_dataset)

# Get the predicted labels by taking the argmax of the logits
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Get the true labels from the test dataset (same format as predicted_labels)
true_labels = [example['labels'].item() for example in test_dataset]  # Extract the labels as integers

# Generate and print the classification report
print(classification_report(true_labels, predicted_labels))

In [None]:
for i, (true, pred) in enumerate(zip(true_labels, predicted_labels)):
    if true != pred:
        # Retrieve the text for the misclassified example
        text = test_texts[i]
        
        print(f"Example {i}:")
        print(f"Text: {text}")
        print(f"True Label: {true}, Predicted Label: {pred}")

{0: 'add dependency', 1: 'external documentation', 2: 'input data', 3: 'internal documentation', 4: 'model structure', 5: 'output data', 6: 'parameter tuning', 7: 'pipeline performance', 8: 'preprocessing', 9: 'project metadata', 10: 'remove dependency', 11: 'sharing', 12: 'training infrastructure', 13: 'update dependency', 14: 'validation infrastructure'}


In [7]:
# Hugging Face Prediction


import torch
import pandas as pd
import re
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Detect if Apple M1/M2/M3 chip is available
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")  # Will print 'mps' if available, otherwise 'cpu'

# Load the trained model and tokenizer
model_path = "./final_model"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.to(device)  # Move model to CPU or MPS (Mac GPU)
model.eval()  # Set model to evaluation mode

# Load dataset
HF = pd.read_csv("dataset/HF_commit_986.csv")
HF['combine_message'] = HF['title'] + ' ' + HF['message'].fillna(HF['title'])

# Function to clean text
def clean_text(text):
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    return text.strip()

HF['combine_message'] = HF['combine_message'].apply(clean_text)
HF = HF[HF['combine_message'].notna() & (HF['combine_message'] != '') & (HF['combine_message'].str.split().str.len() > 1)]

# Define batch size (set lower for MacBook to avoid crashes)
BATCH_SIZE = 128  

# Tokenize dataset in batches
def tokenize_batch(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Create DataLoader for batching
def create_dataloader(texts, batch_size=BATCH_SIZE):
    inputs = tokenize_batch(texts)
    dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"])
    return DataLoader(dataset, batch_size=batch_size)

# Predict function
def predict_labels(texts):
    dataloader = create_dataloader(texts)
    all_predictions = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            all_predictions.extend(predictions)

    return all_predictions

# Run predictions in batches
HF["predicted_label"] = predict_labels(HF["combine_message"].tolist())
HF["predicted_label"] = HF["predicted_label"].map(label_mapping)


# Save results
HF.to_csv("HF_commit_986_predictions.csv", index=False)

print("✅ Predictions saved successfully!")


Using device: mps
✅ Predictions saved successfully!


In [10]:
# GitHub Prediction

import torch
import pandas as pd
import re
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Detect if Apple M1/M2/M3 chip is available
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")  # Will print 'mps' if available, otherwise 'cpu'

# Load the trained model and tokenizer
model_path = "./final_model"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model.to(device)  # Move model to CPU or MPS (Mac GPU)
model.eval()  # Set model to evaluation mode

# Load dataset
GH = pd.read_csv("dataset/GitHub_commits.csv")
#HF['combine_message'] = HF['title'] + ' ' + HF['message'].fillna(HF['title'])

# Function to clean text
def clean_text(text):
    if not isinstance(text, str):  # Ensure text is a string
        return ""
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    return text.strip()

GH['commit_message'] = GH['commit_message'].apply(clean_text)
GH = GH[GH['commit_message'].notna() & (GH['commit_message'] != '') & (GH['commit_message'].str.split().str.len() > 1)]

# Define batch size (set lower for MacBook to avoid crashes)
BATCH_SIZE = 128  

# Tokenize dataset in batches
def tokenize_batch(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Create DataLoader for batching
def create_dataloader(texts, batch_size=BATCH_SIZE):
    inputs = tokenize_batch(texts)
    dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"])
    return DataLoader(dataset, batch_size=batch_size)

# Predict function
def predict_labels(texts):
    dataloader = create_dataloader(texts)
    all_predictions = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            all_predictions.extend(predictions)

    return all_predictions

# Run predictions in batches
GH["predicted_label"] = predict_labels(GH["commit_message"].tolist())
GH["predicted_label"] = GH["predicted_label"].map(label_mapping)


# Save results
GH.to_csv("GH_commit_predictions.csv", index=False)

print("✅ Predictions saved successfully!")


Using device: mps
✅ Predictions saved successfully!


In [11]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
from huggingface_hub import create_repo

repo_name = "distilbert-base-uncased-commit_labeller"  # Change this to your preferred repository name
create_repo(repo_name)

RepoUrl('https://huggingface.co/eyinlojuoluwa/distilbert-base-uncased-commit_labeller', endpoint='https://huggingface.co', repo_type='model', repo_id='eyinlojuoluwa/distilbert-base-uncased-commit_labeller')

In [13]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments

# Replace these with your actual paths
model_dir = "./final_model"

# Push model, tokenizer, and config to Hugging Face Hub
model = DistilBertForSequenceClassification.from_pretrained(model_dir)
tokenizer = DistilBertTokenizer.from_pretrained(model_dir)

model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/eyinlojuoluwa/distilbert-base-uncased-commit_labeller/commit/3b383bd64f5eeef24b843e255c8bcd080e85cbaa', commit_message='Upload tokenizer', commit_description='', oid='3b383bd64f5eeef24b843e255c8bcd080e85cbaa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/eyinlojuoluwa/distilbert-base-uncased-commit_labeller', endpoint='https://huggingface.co', repo_type='model', repo_id='eyinlojuoluwa/distilbert-base-uncased-commit_labeller'), pr_revision=None, pr_num=None)

In [14]:
config = model.config
config.push_to_hub(repo_name)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/eyinlojuoluwa/distilbert-base-uncased-commit_labeller/commit/3b383bd64f5eeef24b843e255c8bcd080e85cbaa', commit_message='Upload config', commit_description='', oid='3b383bd64f5eeef24b843e255c8bcd080e85cbaa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/eyinlojuoluwa/distilbert-base-uncased-commit_labeller', endpoint='https://huggingface.co', repo_type='model', repo_id='eyinlojuoluwa/distilbert-base-uncased-commit_labeller'), pr_revision=None, pr_num=None)

In [15]:
label_map = {
    0: 'add dependency',
    1: 'external documentation',
    2: 'input data',
    3: 'internal documentation',
    4: 'model structure',
    5: 'output data',
    6: 'parameter tuning',
    7: 'pipeline performance',
    8: 'preprocessing',
    9: 'project metadata',
    10: 'remove dependency',
    11: 'sharing',
    12: 'training infrastructure',
    13: 'update dependency',
    14: 'validation infrastructure'
}

# Create the mappings for id2label and label2id
id2label = {i: label for i, label in label_map.items()}
label2id = {label: i for i, label in label_map.items()}

In [16]:
from transformers import DistilBertConfig

# Load the model configuration
config = DistilBertConfig.from_pretrained("distilbert-base-uncased")

# Set the label mappings
config.id2label = id2label
config.label2id = label2id

# Save the updated config file to the model directory
config.save_pretrained("./final_model")

In [17]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments

# Replace these with your actual paths
model_dir = "./final_model"

# Push model, tokenizer, and config to Hugging Face Hub
model = DistilBertForSequenceClassification.from_pretrained(model_dir)
tokenizer = DistilBertTokenizer.from_pretrained(model_dir)

model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/eyinlojuoluwa/distilbert-base-uncased-commit_labeller/commit/d86fe1d49f9303b8c11043adbca4c6ab79f53592', commit_message='Upload tokenizer', commit_description='', oid='d86fe1d49f9303b8c11043adbca4c6ab79f53592', pr_url=None, repo_url=RepoUrl('https://huggingface.co/eyinlojuoluwa/distilbert-base-uncased-commit_labeller', endpoint='https://huggingface.co', repo_type='model', repo_id='eyinlojuoluwa/distilbert-base-uncased-commit_labeller'), pr_revision=None, pr_num=None)