In [None]:
# Step 1: Import libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
import tensorflow as tf
import torch

In [None]:
# Step 2: Load the data
# header=None makes pandas treat the first row as data rather than as column
# names, so the two columns are named explicitly here.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# not run on another machine without editing it.
file_path = "C:\\Users\\dilaw\\OneDrive\\Desktop\\multiclass SUBCAT.csv"
df = pd.read_csv(
    file_path,
    names=['class', 'description'],
    header=None,
)

In [None]:
# Step 3: Remove NaN values and duplicates
# Both calls mutate `df` in place. The keyword arguments spell out the pandas
# defaults: drop a row if any of its columns is missing, and keep the first
# occurrence of each set of identical rows.
df.dropna(axis=0, how='any', inplace=True)
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Step 4: Data Cleaning
# Normalise the description text to lower case.
df = df.assign(description=df['description'].str.lower())


In [None]:
# Exploratory check: distribution of description lengths, measured in
# whitespace-separated words. This works on `df`, the frame built in the
# earlier cells (the notebook never defines a frame called `data`).
df['word_count'] = df['description'].str.split().str.len()
max_words = int(df['word_count'].max())

plt.figure(figsize=(10, 6))
# Bin edges run up to max_words + 1 so the longest descriptions get a bin of
# their own. With edges stopping at max_words, the final bin (which is closed
# on both ends) would merge counts for max_words - 1 and max_words.
plt.hist(df['word_count'], bins=range(1, max_words + 2), alpha=0.7, color='blue')
plt.title('Distribution of Number of Words')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.xticks(range(1, max_words + 1))
plt.show()

In [None]:
# Step 5: Encode the class labels
# Fit the encoder on the class column, then store the encoded value for every
# row in a new `encoded_class` column.
label_encoder = LabelEncoder()
label_encoder.fit(df['class'])
df['encoded_class'] = label_encoder.transform(df['class'])

In [None]:
# Preview the first rows of the frame to check the columns built so far.
df.head()

In [None]:
# Step 6: Prepare the data
# Pull the text column and the encoded-label column out as plain Python lists.
data_texts, data_labels = (
    df[column].tolist() for column in ('description', 'encoded_class')
)

In [None]:
# Step 7: Splitting data
# First split: hold out 20% of all rows as the validation set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_texts,
    data_labels,
    test_size=0.2,
    random_state=0,
)
# Second split: carve 1% of the remaining training rows off as a test set.
# train_texts / train_labels are rebound to the reduced training portion.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.01,
    random_state=0,
)

In [None]:
# Step 8: Tokenization
# Load the pretrained tokenizer, then encode the training and validation
# texts (in that order) with identical padding/truncation settings.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings, val_encodings = (
    tokenizer(texts, padding=True, truncation=True)
    for texts in (train_texts, val_texts)
)


In [None]:
# Step 9: Convert to TensorFlow dataset
# Each dataset is sliced from a (features, labels) pair, where features is
# the dict form of the tokenizer output.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), train_labels)
)
val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(val_encodings), val_labels)
)


In [None]:
# Step 10: Model and Training Setup
training_args = TFTrainingArguments(
    # Where results and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Training schedule and regularisation.
    num_train_epochs=7,
    warmup_steps=500,
    weight_decay=1e-5,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluation interval, in steps.
    eval_steps=100,
)

In [None]:
# Build the classifier inside the distribution strategy scope supplied by the
# training arguments.
with training_args.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        # Size the classification head from the fitted encoder instead of a
        # hard-coded 207, so it always matches the number of distinct labels
        # actually present in `encoded_class` after cleaning.
        num_labels=len(label_encoder.classes_),
    )

# NOTE(review): TFTrainer is reportedly deprecated in newer transformers
# releases; confirm against the installed version.
trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Step 11: Train and Evaluate the Model
# Runs training using the arguments and datasets the trainer was built with;
# evaluation happens in the next cell.
trainer.train()

In [None]:
# Evaluate the model on the dataset the trainer received as eval_dataset
# (the validation split).
trainer.evaluate()

In [None]:
# Step 12: Save the Model and Tokenizer
# Both artifacts are written to the same directory so they can be reloaded
# together from one path.
# NOTE(review): this is an absolute, Colab-style path while the input CSV is
# read from a Windows user folder; confirm the intended environment.
save_directory = "/content/saved_models"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

In [None]:
# Step 13: Load the fine-tuned model and tokenizer for inference
# Take the first held-out test text as the sample, then reload the tokenizer
# and model from the directory they were saved to.
test_text = test_texts[0]
tokenizer_fine_tuned = DistilBertTokenizer.from_pretrained(save_directory)
model_fine_tuned = TFDistilBertForSequenceClassification.from_pretrained(save_directory)

In [None]:
# Step 14: Make a prediction
# Encode the single sample text and request the result as TensorFlow tensors.
predict_input = tokenizer_fine_tuned.encode(
    test_text, return_tensors='tf', padding=True, truncation=True
)

In [None]:
# Forward pass: element 0 of the model output is used as the score tensor
# (presumably the logits; confirm against the transformers version in use).
output = model_fine_tuned(predict_input)[0]
# Position of the highest score along axis 1, taken for the first row, which
# corresponds to the one text that was encoded.
prediction_value = tf.argmax(output, axis=1)[0].numpy()


In [None]:
# Report the predicted label index. The training labels came from
# `label_encoder`, so label_encoder.inverse_transform([prediction_value])
# should map it back to the class name. NOTE(review): confirm before relying on it.
print("Predicted class index:", prediction_value)