In [1]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import re
from transformers import AutoTokenizer, DistilBertTokenizer, DistilBertForSequenceClassification, AutoModelForSequenceClassification, Trainer, TrainingArguments, DistilBertConfig
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
from nltk.corpus import wordnet
import random
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.nn import CrossEntropyLoss
import nltk
from transformers import DistilBertTokenizer
from sklearn.metrics import classification_report

In [2]:
# Load the dataset CSV; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="dataset/combined_GH_HF_manual.csv")

# Function to clean text columns
def clean_text(text):
    """Replace runs of non-ASCII characters with a single space and trim the result.

    Parameters
    ----------
    text : str or any
        Raw cell value. Anything that is not a ``str`` (for example ``None`` or
        the float ``NaN`` pandas uses for an empty CSV cell) is treated as
        empty text instead of raising.

    Returns
    -------
    str
        The text with each run of non-ASCII characters collapsed to one space
        and leading/trailing whitespace removed; ``''`` for non-string input.
    """
    # re.sub raises TypeError on non-strings, so guard here; returning '' lets
    # a later "drop empty messages" filter discard such rows.
    if not isinstance(text, str):
        return ''
    # Each run of characters outside the ASCII range becomes a single space.
    return re.sub(r'[^\x00-\x7F]+', ' ', text).strip()

# Apply cleaning function to relevant columns
# Drop rows with a missing message *before* cleaning: clean_text runs a regex
# over each value, and a NaN cell would otherwise raise instead of being filtered.
# .copy() gives an independent frame, so the column assignments below do not
# trigger SettingWithCopyWarning.
df = df[df['message'].notna()].copy()
df['message'] = df['message'].astype(str).apply(clean_text)

# Keep only non-empty messages with more than one whitespace-separated token.
has_multiple_words = df['message'].str.split().str.len() > 1
df = df[(df['message'] != '') & has_multiple_words].copy()

# Normalise label casing so differently-cased spellings count as one class.
df["label"] = df["label"].str.lower()

# Per-class sample counts (displayed as the cell output).
number_of_labels = df["label"].value_counts()
number_of_labels

label
external documentation       501
model structure              195
project metadata             141
sharing                      127
preprocessing                 61
training infrastructure       55
validation infrastructure     52
input data                    35
internal documentation        35
pipeline performance          33
parameter tuning              31
add dependency                19
output data                   18
update dependency             17
remove dependency             15
Name: count, dtype: int64

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Step 1: Encode labels
# fit_transform learns the classes and encodes the column in one pass; a
# separate fit() beforehand would just be repeated work.
label_encoder = LabelEncoder()
df['encoded_labels'] = label_encoder.fit_transform(df['label'])
# Integer id -> original label string, for decoding predictions later.
label_mapping = dict(enumerate(label_encoder.classes_))
print(label_mapping)

# Step 2: Prepare texts and labels
data_texts = df['message'].tolist()
data_labels = df['encoded_labels'].tolist()

# Step 3: Train/Val/Test Split
# First hold out 20% of all rows for validation, then carve 1% of the remaining
# training rows off as a test set. Both splits are seeded for reproducibility.
# NOTE(review): the 1% test split is very small and neither split is stratified;
# with a skewed label distribution some classes may be absent from val/test.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_texts, data_labels, test_size=0.2, random_state=0, shuffle=True
)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts, train_labels, test_size=0.01, random_state=0, shuffle=True
)

print(f"Training data: {len(train_texts)}")
print(f"Validation data: {len(val_texts)}")
print(f"Test data: {len(test_texts)}")

# Step 4: Initialize tokenizer
model_name = "sentence-transformers/paraphrase-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 5: Tokenization function
def preprocess_function(texts, max_length=64):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Parameters
    ----------
    texts : list of str
        The texts to tokenize, passed straight through to ``tokenizer``.
    max_length : int, optional
        Length every sequence is padded or truncated to. Defaults to 64
        (optimized for short text).

    Returns
    -------
    The object returned by ``tokenizer(...)``; it is requested with
    ``return_tensors="pt"`` (PyTorch tensors), ``padding="max_length"`` and
    ``truncation=True``.
    """
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Step 6: Tokenize datasets
train_encodings = preprocess_function(train_texts)
val_encodings = preprocess_function(val_texts)
test_encodings = preprocess_function(test_texts)


# Step 7: Wrap into dataset dicts
def _build_dataset(encodings, labels):
    """Pair each tokenized example with its label.

    Parameters
    ----------
    encodings : mapping
        Tokenizer output exposing ``"input_ids"`` and ``"attention_mask"``,
        each indexable per example.
    labels : sequence
        One label per example, in the same order as ``encodings``.

    Returns
    -------
    list of dict
        One dict per example with keys ``"input_ids"``, ``"attention_mask"``
        and ``"labels"`` (the label wrapped in ``torch.tensor``).

    Raises
    ------
    ValueError
        If the number of encoded examples differs from the number of labels.
    """
    input_ids = encodings["input_ids"]
    attention_mask = encodings["attention_mask"]
    # Fail loudly on a mismatch rather than truncating or hitting an IndexError.
    if len(input_ids) != len(labels):
        raise ValueError(
            f"Got {len(input_ids)} encoded examples but {len(labels)} labels"
        )
    return [
        {
            "input_ids": ids,
            "attention_mask": mask,
            "labels": torch.tensor(label),
        }
        for ids, mask, label in zip(input_ids, attention_mask, labels)
    ]


train_dataset = _build_dataset(train_encodings, train_labels)
val_dataset = _build_dataset(val_encodings, val_labels)
test_dataset = _build_dataset(test_encodings, test_labels)

# Step 8: Check sample
print(train_dataset[0])
