In [1]:
# 476e23d9bf3a3da88d6b8f692a445705bac2097b
# key

In [2]:
# Check if GPU is available (highly recommended for faster training)
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU. Training might be slow.")

# 1. Install necessary libraries
!pip install -qq transformers datasets accelerate scikit-learn pandas tabulate

# Imports
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import pandas as pd
import numpy as np
from tabulate import tabulate # Import tabulate for pretty printing

# Set a seed for reproducibility
import random
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

Using CPU. Training might be slow.



In [3]:
print("\n--- 1. Loading and Preparing Dataset ---")
# 2. Load Dataset
ds = load_dataset("infinite-dataset-hub/CorporateMailCategorization")

# Convert to pandas DataFrame for easier null handling
df = ds["train"].to_pandas()


--- 1. Loading and Preparing Dataset ---


README.md: 0.00B [00:00, ?B/s]

data.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

In [4]:
# 3. Drop Null Rows
initial_rows = len(df)
df_cleaned = df.dropna(subset=['label']).copy()
rows_dropped = initial_rows - len(df_cleaned)
print(f"\nDropped {rows_dropped} rows with null 'label' values.")
print(f"Remaining rows after dropping nulls: {len(df_cleaned)}")


Dropped 10 rows with null 'label' values.
Remaining rows after dropping nulls: 90


In [5]:
df_cleaned

Unnamed: 0,idx,text,label
0,0,"Dear Team, I wanted to discuss the upcoming pr...",Product Launch
1,1,"Good morning, I need the quarterly financial r...",Financial Report
2,2,I am pleased to announce that we have successf...,Merger Announcement
3,3,Please review the attached employee satisfacti...,Employee Feedback
4,4,Our new sustainability initiatives are ready f...,Sustainability Initiative
...,...,...,...
94,94,We need to prioritize eco-friendly options in ...,Sustainability Initiative
96,96,This financial report outlines our budget allo...,Budget Report
97,97,Important: We have merged with PQR Ltd. Prepar...,Merger Announcement
98,98,The training sessions on data privacy and secu...,Employee Feedback


In [6]:
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset

label_counts = df_cleaned['label'].value_counts()
label_counts
# # 2. Create the pie chart
# plt.figure(figsize=(8, 8)) # Set the figure size for better visualization
# plt.pie(
#     label_counts,
#     labels=label_counts.index, # Use the category names as labels
#     autopct='%1.1f%%',         # Display percentages on the slices
#     startangle=90              # Start the first slice at the top
# )
# plt.title('Distribution of Email Categories') # Add a title to the chart
# plt.axis('equal') # Ensures the pie chart is drawn as a perfect circle
# plt.show() # Display the chart

label
Employee Feedback               21
Merger Announcement             21
Sustainability Initiative       21
Financial Report                12
Product Launch                   6
Financial Projections            1
Financial Performance            1
Preliminary Financial Report     1
Audit Request                    1
Financial Analysis               1
Profit Analysis                  1
Summary Financial Report         1
Financial Health                 1
Budget Report                    1
Name: count, dtype: int64

In [7]:
# Important: Map string labels to integers for the model
# Get unique labels from the cleaned dataset
unique_labels = sorted(df_cleaned['label'].unique().tolist())
label_to_id = {label: i for i, label in enumerate(unique_labels)}
id_to_label = {i: label for i, label in enumerate(unique_labels)}

print(f"\nDetected labels and their mappings: {label_to_id}")
num_labels = len(unique_labels)
print(f"Number of unique labels: {num_labels}")

# Apply label mapping to the cleaned dataset
def map_labels_to_ids(example):
    example['label'] = label_to_id[example['label']]
    return example

# Convert cleaned DataFrame to Hugging Face Dataset
full_labeled_ds = Dataset.from_pandas(df_cleaned).map(map_labels_to_ids)


Detected labels and their mappings: {'Audit Request': 0, 'Budget Report': 1, 'Employee Feedback': 2, 'Financial Analysis': 3, 'Financial Health': 4, 'Financial Performance': 5, 'Financial Projections': 6, 'Financial Report': 7, 'Merger Announcement': 8, 'Preliminary Financial Report': 9, 'Product Launch': 10, 'Profit Analysis': 11, 'Summary Financial Report': 12, 'Sustainability Initiative': 13}
Number of unique labels: 14


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [8]:
train_val_test_split = full_labeled_ds.train_test_split(test_size=0.1, seed=42)
train_val_ds = train_val_test_split["train"]
test_ds_final = train_val_test_split["test"] # This is our final, labeled test set

train_val_split_ratio = 0.88888888888
train_val_split = train_val_ds.train_test_split(test_size=1-train_val_split_ratio, seed=42)
train_ds_split = train_val_split["train"]
eval_ds_split = train_val_split["test"] # This is our validation set

print(f"\nDataset Splits:")
print(f"  Training samples: {len(train_ds_split)}")
print(f"  Validation samples: {len(eval_ds_split)}")
print(f"  Final Test samples: {len(test_ds_final)}")

print("\nTrain Dataset Split Structure:", train_ds_split)
print("Validation Dataset Split Structure:", eval_ds_split)
print("Final Test Dataset Structure:", test_ds_final)


Dataset Splits:
  Training samples: 71
  Validation samples: 10
  Final Test samples: 9

Train Dataset Split Structure: Dataset({
    features: ['idx', 'text', 'label', '__index_level_0__'],
    num_rows: 71
})
Validation Dataset Split Structure: Dataset({
    features: ['idx', 'text', 'label', '__index_level_0__'],
    num_rows: 10
})
Final Test Dataset Structure: Dataset({
    features: ['idx', 'text', 'label', '__index_level_0__'],
    num_rows: 9
})


In [9]:
print("\n--- 2. Loading Tokenizer and Model ---")
# 5. Load Tokenizer and Model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Move model to GPU if available
model.to(device)



--- 2. Loading Tokenizer and Model ---


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [10]:
print("\n--- 3. Tokenizing Data ---")
# 6. Tokenize Data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_train_ds = train_ds_split.map(tokenize_function, batched=True)
tokenized_eval_ds = eval_ds_split.map(tokenize_function, batched=True)
tokenized_test_ds_final = test_ds_final.map(tokenize_function, batched=True) # Tokenize final test set

# Remove original text and idx columns as they are no longer needed for training/evaluation
tokenized_train_ds = tokenized_train_ds.remove_columns(["text", "idx"])
tokenized_eval_ds = tokenized_eval_ds.remove_columns(["text", "idx"])
tokenized_test_ds_final = tokenized_test_ds_final.remove_columns(["text", "idx"]) # Keep label for final test


print("\n--- 4. Defining Metrics ---")


--- 3. Tokenizing Data ---


Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]


--- 4. Defining Metrics ---


In [11]:
# 7. Define Metrics
def compute_metrics(p):
    predictions = np.argmax(p.predictions, axis=1)
    true_labels = np.array(p.label_ids).flatten()

    accuracy = accuracy_score(true_labels, predictions)
    f1_weighted = f1_score(true_labels, predictions, average='weighted')
    # You can add more metrics like precision, recall if needed
    # precision, recall, fscore, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')

    metrics = {
        'accuracy': accuracy,
        'f1_weighted': f1_weighted,
        # 'precision_weighted': precision,
        # 'recall_weighted': recall,
    }
    return metrics


print("\n--- 5. Setting up Training Arguments ---")


--- 5. Setting up Training Arguments ---


In [12]:
# 8. Training Arguments
training_args = TrainingArguments(
    output_dir="./results_mail_category", # Directory for logs and checkpoints
    num_train_epochs=5,                  # Number of training epochs
    per_device_train_batch_size=16,      # Batch size for training
    per_device_eval_batch_size=16,       # Batch size for evaluation
    warmup_steps=10,                     # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                   # Strength of weight decay
    logging_dir="./logs_mail_category",  # Directory for storing logs
    logging_strategy="epoch",            # Log metrics at the end of each epoch
    save_strategy="epoch",               # Save model at the end of each epoch
    eval_strategy="epoch",         # Evaluate at the end of each epoch
    load_best_model_at_end=True,         # Load the best model at the end of training
    metric_for_best_model="f1_weighted", # Metric to use to compare models
    report_to="none",                    # Don't report to any online services
)


print("\n--- 6. Initializing and Running Trainer ---")
# 9. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_eval_ds,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

# Start training!
trainer.train()

print("\n--- 7. Evaluation Metrics on Validation Set ---")
eval_results = trainer.evaluate()
print("\nEvaluation Metrics on Validation Set (used during training):")
eval_df = pd.DataFrame([eval_results]).transpose()
eval_df.columns = ['Value']
print(tabulate(eval_df, headers='keys', tablefmt='grid', floatfmt=".4f"))

  trainer = Trainer(



--- 6. Initializing and Running Trainer ---


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,2.6164,2.551044,0.2,0.066667
2,2.4637,2.254011,0.3,0.213333
3,2.1996,1.973029,0.6,0.5
4,2.0431,1.810151,0.7,0.651667
5,1.8645,1.746989,0.8,0.750476



--- 7. Evaluation Metrics on Validation Set ---



Evaluation Metrics on Validation Set (used during training):
+-------------------------+---------+
|                         |   Value |
| eval_loss               |  1.7470 |
+-------------------------+---------+
| eval_accuracy           |  0.8000 |
+-------------------------+---------+
| eval_f1_weighted        |  0.7505 |
+-------------------------+---------+
| eval_runtime            |  0.9747 |
+-------------------------+---------+
| eval_samples_per_second | 10.2590 |
+-------------------------+---------+
| eval_steps_per_second   |  1.0260 |
+-------------------------+---------+
| epoch                   |  5.0000 |
+-------------------------+---------+


In [13]:
print("\n--- 8. Saving the Fine-tuned Model ---")
# 10. Save the model in the specified directory
save_path = "./mail_category"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"\nModel and tokenizer saved to: {save_path}")


--- 8. Saving the Fine-tuned Model ---

Model and tokenizer saved to: ./mail_category


In [14]:
print("\n--- 9. Performing Inference and Evaluation on Final Test Set ---")
# Load the saved model to ensure we are using the fine-tuned one
loaded_tokenizer = AutoTokenizer.from_pretrained(save_path)
loaded_model = AutoModelForSequenceClassification.from_pretrained(save_path, num_labels=num_labels)
loaded_model.to(device) # Move to GPU if available

# Use the loaded model for prediction on the FINAL test set
final_test_trainer = Trainer(model=loaded_model, tokenizer=loaded_tokenizer)

# Predict on the tokenized final test dataset
predictions_final_test = final_test_trainer.predict(tokenized_test_ds_final)

# Get predicted class IDs and confidence scores
predicted_ids_final_test = np.argmax(predictions_final_test.predictions, axis=1)
confidence_scores_final_test = np.max(torch.softmax(torch.tensor(predictions_final_test.predictions), dim=1).numpy(), axis=1)

# Get actual labels from the original (non-tokenized) test_ds_final
actual_labels_final_test = [id_to_label[label_id] for label_id in test_ds_final['label']]

# Map predicted IDs back to original labels
predicted_labels_final_test = [id_to_label[id_val] for id_val in predicted_ids_final_test]

# Create a DataFrame for tabular output of actual vs. predicted
final_results_df = pd.DataFrame({
    'text': test_ds_final['text'],
    'actual_label': actual_labels_final_test,
    'predicted_label': predicted_labels_final_test,
    'confidence': confidence_scores_final_test
}).reset_index(drop=True)

print("\n--- Actual vs. Predicted Categories for Final Test Samples ---")
print(tabulate(final_results_df, headers='keys', tablefmt='grid', floatfmt=".4f"))

# Calculate and print metrics for the final test set
final_test_metrics = compute_metrics(predictions_final_test)
print("\nEvaluation Metrics on Final Test Set:")
final_test_metrics_df = pd.DataFrame([final_test_metrics]).transpose()
final_test_metrics_df.columns = ['Value']
print(tabulate(final_test_metrics_df, headers='keys', tablefmt='grid', floatfmt=".4f"))


print("\n--- Fine-tuning process complete! ---")


--- 9. Performing Inference and Evaluation on Final Test Set ---


  final_test_trainer = Trainer(model=loaded_model, tokenizer=loaded_tokenizer)



--- Actual vs. Predicted Categories for Final Test Samples ---
+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------+---------------------------+--------------+
|    | text                                                                                                                                                                         | actual_label              | predicted_label           |   confidence |
|  0 | Important: All department heads must review and implement the new sustainability guidelines. Our planet's health depends on our actions. Best, Daniel Rodriguez              | Sustainability Initiative | Sustainability Initiative |       0.2206 |
+----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------