In [2]:
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import re
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
df = pd.read_csv("dataset/combined_GH_HF_manual.csv")

# Function to clean text columns
def clean_text(text):
    # Remove non-ASCII characters (corrupted/malformed characters)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    return text.strip()

# Apply cleaning function to relevant columns
df['message'] = df['message'].apply(clean_text)
df = df[df['message'].notna() & (df['message'] != '') & (df['message'].str.split().str.len() > 1)]
df["label"] = df["label"].str.lower()

number_of_labels = df["label"].value_counts()
number_of_labels

label
external documentation       501
model structure              195
project metadata             141
sharing                      127
preprocessing                 61
training infrastructure       55
validation infrastructure     52
input data                    35
internal documentation        35
pipeline performance          33
parameter tuning              31
add dependency                19
output data                   18
update dependency             17
remove dependency             15
Name: count, dtype: int64

In [4]:
# Split the dataset into 80% training and 20% testing
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the Pandas DataFrames into Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

  if _pandas_api.is_sparse(col):


In [5]:
train_dataset

Dataset({
    features: ['owner', 'message', 'label', '__index_level_0__'],
    num_rows: 1068
})

In [6]:
test_dataset

Dataset({
    features: ['owner', 'message', 'label', '__index_level_0__'],
    num_rows: 267
})

In [5]:
model_name = "distilbert-base-uncased"  # Using distilbert instead of bert
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
def tokenize_function(examples):
    return tokenizer(examples["message"], padding="max_length", truncation=True)

# Apply tokenization to the training and test datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1068 [00:00<?, ? examples/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

In [9]:
train_dataset

Dataset({
    features: ['owner', 'message', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 1068
})

In [10]:
test_dataset

Dataset({
    features: ['owner', 'message', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 267
})

In [7]:
label_encoder = LabelEncoder()

# Fit the LabelEncoder on the training labels (only the 'label' column)
label_encoder.fit(train_dataset['label'])

# Apply the transformation to both train and test datasets
train_dataset = train_dataset.map(lambda e: {'label': label_encoder.transform([e['label']])[0]})
test_dataset = test_dataset.map(lambda e: {'label': label_encoder.transform([e['label']])[0]})

Map:   0%|          | 0/1068 [00:00<?, ? examples/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

In [8]:
print(train_dataset[0])  # Print the first example from the training set
print(test_dataset[0]) 

{'owner': 'THUDM', 'message': 'initial commit', 'label': 9, '__index_level_0__': 1326, 'input_ids': [101, 3988, 10797, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # Keep only the best model
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # Lower loss is better
    learning_rate= 1e-5,  # Lower learning rate to improve stability
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,  # Reduce epochs to prevent overfitting
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
    save_steps=1000,  # Log less frequently to reduce clutter
    report_to="none",  # Prevents reporting to WandB, TensorBoard, etc.
    fp16=False,  # Disable mixed precision to avoid MPS issue
    warmup_ratio=0.1,  # Warm-up for the first 10% of training
    lr_scheduler_type="linear",
    gradient_accumulation_steps=2,  # Helps stabilize training
)



In [10]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,  # ✅ Now it is properly defined
    loss_fn=torch.nn.CrossEntropyLoss(weight=class_weights)  # Apply class weights
)
trainer.train()
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")

NameError: name 'class_weights' is not defined

In [None]:
trainer.evaluate()

In [None]:
# Make predictions on the evaluation dataset
predictions = trainer.predict(test_dataset)

# Convert predictions to labels
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Calculate accuracy
accuracy = accuracy_score(test_dataset['label'], predicted_labels)
print(f"Test accuracy: {accuracy}")

# Print the classification report for a more detailed evaluation
print(classification_report(test_dataset['label'], predicted_labels, target_names=label_encoder.classes_))

In [1]:
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

df = pd.read_csv("dataset/combined_GH_HF_manual.csv")

# Function to clean text columns
def clean_text(text):
    # Remove non-ASCII characters (corrupted/malformed characters)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    return text.strip()

# Apply cleaning function to relevant columns
df['message'] = df['message'].apply(clean_text)
df = df[df['message'].notna() & (df['message'] != '') & (df['message'].str.split().str.len() > 1)]

number_of_labels = df["label"].value_counts()

# Split the dataset into 80% training and 20% testing
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the Pandas DataFrames into Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

model_name = "distilbert-base-uncased"  # Using distilbert instead of bert
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["message"], padding="max_length", truncation=True)

# Apply tokenization to the training and test datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

label_encoder = LabelEncoder()

# Fit the LabelEncoder on the training labels (only the 'label' column)
label_encoder.fit(train_dataset['label'])

# Apply the transformation to both train and test datasets
train_dataset = train_dataset.map(lambda e: {'label': label_encoder.transform([e['label']])[0]})
test_dataset = test_dataset.map(lambda e: {'label': label_encoder.transform([e['label']])[0]})

training_args = TrainingArguments(
    output_dir="./results",          # output directory
    evaluation_strategy="epoch",     # evaluate after each epoch
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)
trainer.train()

trainer.evaluate()

# Make predictions on the evaluation dataset
predictions = trainer.predict(test_dataset)

# Convert predictions to labels
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Calculate accuracy
accuracy = accuracy_score(test_dataset['label'], predicted_labels)
print(f"Test accuracy: {accuracy}")

# Print the classification report for a more detailed evaluation
print(classification_report(test_dataset['label'], predicted_labels, target_names=label_encoder.classes_))

  if _pandas_api.is_sparse(col):
python(98843) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Map:   0%|          | 0/1068 [00:00<?, ? examples/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

Map:   0%|          | 0/1068 [00:00<?, ? examples/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,1.599397


KeyboardInterrupt: 

In [None]:
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import re
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

df = pd.read_csv("dataset/combined_GH_HF_manual.csv")

# Function to clean text columns
def clean_text(text):
    # Remove non-ASCII characters (corrupted/malformed characters)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    return text.strip()

# Apply cleaning function to relevant columns
df['message'] = df['message'].apply(clean_text)
df = df[df['message'].notna() & (df['message'] != '') & (df['message'].str.split().str.len() > 1)]
df["label"] = df["label"].str.lower()

number_of_labels = df["label"].value_counts()
number_of_labels

# Split the dataset into 80% training and 20% testing
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the Pandas DataFrames into Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

model_name = "distilbert-base-uncased"  # Using distilbert instead of bert
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["message"], padding="max_length", truncation=True)

# Apply tokenization to the training and test datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

label_encoder = LabelEncoder()

# Fit the LabelEncoder on the training labels (only the 'label' column)
label_encoder.fit(train_dataset['label'])

# Apply the transformation to both train and test datasets
train_dataset = train_dataset.map(lambda e: {'label': label_encoder.transform([e['label']])[0]})
test_dataset = test_dataset.map(lambda e: {'label': label_encoder.transform([e['label']])[0]})

training_args = TrainingArguments(
    output_dir="./results",          # output directory
    evaluation_strategy="epoch",     # evaluate after each epoch
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,  # ✅ Now it is properly defined
    loss_fn=torch.nn.CrossEntropyLoss(weight=class_weights)  # Apply class weights
)
trainer.train()
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")

trainer.evaluate()

# Make predictions on the evaluation dataset
predictions = trainer.predict(test_dataset)

# Convert predictions to labels
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Calculate accuracy
accuracy = accuracy_score(test_dataset['label'], predicted_labels)
print(f"Test accuracy: {accuracy}")

# Print the classification report for a more detailed evaluation
print(classification_report(test_dataset['label'], predicted_labels, target_names=label_encoder.classes_))

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt

df = pd.read_csv("dataset/combined_GH_HF_manual.csv")

# Function to clean text columns
def clean_text(text):
    # Remove non-ASCII characters (corrupted/malformed characters)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    return text.strip()

# Apply cleaning function to relevant columns
df['message'] = df['message'].apply(clean_text)
df = df[df['message'].notna() & (df['message'] != '') & (df['message'].str.split().str.len() > 1)]
df["label"] = df["label"].str.lower()

number_of_labels = df["label"].value_counts()
number_of_labels

label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label"])

# Compute class weights
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(df["label"]), 
    y=df["label"]
)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Check class weights
print("Class Weights:", class_weights)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["message"], padding="max_length", truncation=True)

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.map(lambda e: {'label': int(e['label'])})
test_dataset = test_dataset.map(lambda e: {'label': int(e['label'])})

class DistilBERTWithWeightedLoss(nn.Module):
    def __init__(self, model_name, num_labels, class_weights):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights)  # Apply class weights

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
    
num_labels = len(label_encoder.classes_)  # Count unique labels
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DistilBERTWithWeightedLoss(model_name, num_labels, class_weights)
model.to(device)  # Move model to GPU if available

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=9,
    weight_decay=0.01,
    logging_dir="./logs",
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()



In [None]:
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label"])

# Compute class weights
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(df["label"]), 
    y=df["label"]
)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Check class weights
print("Class Weights:", class_weights)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["message"], padding="max_length", truncation=True)

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

class DistilBERTWithWeightedLoss(nn.Module):
    def __init__(self, model_name, num_labels, class_weights, dropout=0.4):
        super().__init__()
        
        # Create a custom configuration with modified dropout
        config = DistilBertConfig.from_pretrained(
            model_name, 
            num_labels=num_labels, 
            hidden_dropout_prob=dropout,  # Dropout for feedforward layers
            attention_probs_dropout_prob=dropout  # Dropout for attention layers
        )

        # Load the model with the custom config
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights)  # Apply class weights

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}
    
device = torch.device("mps" if torch.has_mps else "cpu")
print(f"Using device: {device}")

# Initialize model
num_labels = len(label_encoder.classes_)  # Count unique labels
model = DistilBERTWithWeightedLoss(model_name, num_labels, class_weights, dropout=0.2)
model.to(device)



# Initialize model
num_labels = len(label_encoder.classes_)  # Count unique labels
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DistilBERTWithWeightedLoss(model_name, num_labels, class_weights, dropout=0.4)
model.to(device)  # Move model to GPU if available

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # Keep only the best model
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # Lower loss is better
    learning_rate=2e-5,  # Lower learning rate to improve stability
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,  # Reduce epochs to prevent overfitting
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
    save_steps=1000,  # Log less frequently to reduce clutter
    report_to="none",  # Prevents reporting to WandB, TensorBoard, etc.
    fp16=False,  # Disable mixed precision to avoid MPS issue
    warmup_ratio=0.06,  # Gradually warm up the learning rate
    gradient_accumulation_steps=2,  # Helps stabilize training
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=1
    )
    acc = accuracy_score(labels, predictions)

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()