<a href="https://colab.research.google.com/github/fachiny17/machine_learning/blob/main/dsn_inhouse_hackathon/dsn_inhouse_hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 2025 DSN AI Bootcamp In-House Hackathon

Visit the [kaggle link](https://www.kaggle.com/competitions/dsn-bootcamp-in-house-hackathon/overview) to view more about the contest.

In [1]:
# Install all required packages
!pip install transformers datasets sentencepiece accelerate evaluate rouge-score bert-score torchview nltk sacrebleu
!pip install --upgrade transformers datasets



In [2]:
!pip install tdqm



In [3]:
from google.colab import drive
import pandas as pd
import numpy as np
import os
import torch
import random

In [4]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

from datasets import Dataset, load_dataset
import evaluate
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [5]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
drive_path = '/content/drive/MyDrive/dsn-inhouse-hackathon-files/'

## Prepare and Load your Data

In [7]:
print("Files in the folder:")
print(os.listdir(drive_path))

Files in the folder:
['train.xlsx', 'test.xlsx', 'Submission_template.csv', 'nllb-finetuned']


In [8]:
# Load the datasets
train_df = pd.read_excel(drive_path + 'train.xlsx')
test_df = pd.read_excel(drive_path + 'test.xlsx')
sample_df = pd.read_csv(drive_path + 'Submission_template.csv')

In [9]:
train_df.head(5)

Unnamed: 0,Output,input,Language
0,"So, I find myself, over and over again, thinki...",оооооооооооооооооооооооооооооооооооооооооооооо...,Hausa
1,Especially in things where the connection to G...,"Karịsịa na ihe ebe na njikọ aka Chineke otuto,...",Igbo
2,"12 , 13 . ( a ) What is hyperbole ?\n","12 , 13 . ( a ) Kí ni àbùmọ́ ?\n",Yoruba
3,You and your story have helped me.\n,оооооооооооооооооооооооооооооооооооооооооооооо...,Hausa
4,CAUSE ALL PEOPLE TO BE TREATED EQUALLY,,Igbo


In [10]:
test_df.head()

Unnamed: 0,Competition_ID,Input Text,Language
0,IGB001,Onye ọ bụla tukotara ego iji fu na emeziri ihe...,Igbo
1,IGB002,Anyị bughariri ọrụ ụgwọ metara nile iji debe i...,Igbo
2,IGB003,Emeputara obere akwụkwọ ndekọ ka anyị wee nwee...,Igbo
3,IGB004,Anyị kwekọrịtara ka onye ọ bụla kwụọ ụgwọ ọnụ ...,Igbo
4,IGB005,Echetaram ha na-ntunye ụtụ imezi ihe nke ọma n...,Igbo


In [11]:
sample_df.head()

Unnamed: 0,ID,Output text
0,IGB001,
1,IGB002,
2,IGB003,
3,IGB004,
4,IGB005,


In [12]:
# Set random seeds for reprducibility
def set_seed(seed=42):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

set_seed(42)

## Step 4: Explore the Data

In [13]:
print("📊 DATA EXPLORATION")
print("=" * 50)

print("\nTraining Data Info:")
print(f"Shape: {train_df.shape}")
print(f"Columns: {list(train_df.columns)}")
print(f"\nMissing values:")
print(train_df.isnull().sum())

print(f"\nLanguage Distribution in Training:")
print(train_df['Language'].value_counts())

print(f"\nLanguage Distribution in Test:")
print(test_df['Language'].value_counts())

print("\nSample training examples:")
for i in range(2):
    lang = train_df['Language'].iloc[i]
    print(f"\n{lang.upper()}:")
    print(f"Source: {train_df['input'].iloc[i]}")
    print(f"Target: {train_df['Output'].iloc[i]}")

📊 DATA EXPLORATION

Training Data Info:
Shape: (135000, 3)
Columns: ['Output', 'input', 'Language']

Missing values:
Output      1196
input        266
Language       0
dtype: int64

Language Distribution in Training:
Language
Yoruba    45055
Igbo      45001
Hausa     44944
Name: count, dtype: int64

Language Distribution in Test:
Language
Hausa     229
Yoruba    200
Igbo      168
Name: count, dtype: int64

Sample training examples:

HAUSA:
Source: оооооооооооооооооооооооооооооооооооооооооооооооооооооооовввввввввввввввввввввввв

Target: So, I find myself, over and over again, thinking about my German mother.


IGBO:
Source: Karịsịa na ihe ebe na njikọ aka Chineke otuto, ọ dịghị ka o doo anya.
Target: Especially in things where the connection to God's glory isn't as clear.


## Step 5: Simple Data Augmentation

In [14]:
print("🔄 Applying data augmentation...")

def simple_augmentation(df, num_augments=1):
    """Simple data augmentation by creating variations"""
    augmented_rows = []

    # Drop rows with missing values in 'input' or 'Output' columns
    df_cleaned = df.dropna(subset=['input', 'Output']).copy()


    for _, row in tqdm(df_cleaned.iterrows(), total=len(df_cleaned)):
        source_text = row['input']
        target_text = row['Output']
        lang = row['Language']

        # Keep original
        augmented_rows.append({
            #'ID': row['ID'],
            'input': source_text,
            'Output': target_text,
            'Language': lang
        })

        # Create simple variations
        for aug_idx in range(num_augments):
            # Simple word shuffle for augmentation
            words_source = source_text.split()
            words_target = target_text.split()

            if len(words_source) > 3 and len(words_target) > 3:
                # Shuffle words (simple augmentation)
                np.random.shuffle(words_source)
                np.random.shuffle(words_target)

                aug_source = ' '.join(words_source)
                aug_target = ' '.join(words_target)

                augmented_rows.append({
                    #'ID': f"aug_{row['ID']}_{aug_idx}",
                    'input': aug_source,
                    'Output': aug_target,
                    'Language': lang
                })


    return pd.DataFrame(augmented_rows)

# Apply augmentation
original_size = len(train_df)
augmented_train_df = simple_augmentation(train_df, num_augments=1)
print(f"✅ Data augmentation complete!")
print(f"Original size: {original_size}")
print(f"Augmented size: {len(augmented_train_df)}")

🔄 Applying data augmentation...


100%|██████████| 133538/133538 [00:12<00:00, 11097.42it/s]


✅ Data augmentation complete!
Original size: 135000
Augmented size: 225207


## Step 6: Initialize NLLB Model

In [15]:
# Model configuration - using the distilled version for faster training
MODEL_NAME = "facebook/nllb-200-distilled-600M"

# Language mapping for NLLB
LANG_MAPPING = {
    'yoruba': 'yor_Latn',
    'igbo': 'ibo_Latn',
    'hausa': 'hau_Latn',
    'english': 'eng_Latn'
}

print(f"🚀 Loading model: {MODEL_NAME}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

print("✅ Model loaded successfully!")
print(f"Model parameters: {model.num_parameters():,}")

# Check GPU and move model to device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cuda":
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    model = model.to(device)

🚀 Loading model: facebook/nllb-200-distilled-600M
✅ Model loaded successfully!
Model parameters: 615,073,792
Using device: cuda
GPU memory: 15.8 GB


### Cleaning the data

In [16]:
# Let's clean the data before preprocessing
print("🧹 Cleaning data...")

def clean_dataframe(df):
    """Clean the dataframe by handling missing values"""
    df_clean = df.copy()

    # Fill missing values
    df_clean['input'] = df_clean['input'].fillna('')
    df_clean['Output'] = df_clean['Output'].fillna('')
    df_clean['Language'] = df_clean['Language'].fillna('yoruba')

    # Convert to string
    df_clean['input'] = df_clean['input'].astype(str)
    df_clean['Output'] = df_clean['Output'].astype(str)
    df_clean['Language'] = df_clean['Language'].astype(str)

    # Remove empty strings
    df_clean = df_clean[df_clean['input'].str.strip() != '']
    df_clean = df_clean[df_clean['Output'].str.strip() != '']

    return df_clean

# Clean the augmented data
cleaned_train_df = clean_dataframe(augmented_train_df)
print(f"✅ Data cleaned! Remaining samples: {len(cleaned_train_df)}")

# Check cleaned data
print("\n📊 CLEANED DATA INFO:")
print(f"Missing values: {cleaned_train_df.isnull().sum().sum()}")
print(f"Sample input: {cleaned_train_df['input'].iloc[0][:100]}...")
print(f"Sample Output: {cleaned_train_df['Output'].iloc[0][:100]}...")

🧹 Cleaning data...
✅ Data cleaned! Remaining samples: 223806

📊 CLEANED DATA INFO:
Missing values: 0
Sample input: оооооооооооооооооооооооооооооооооооооооооооооооооооооооовввввввввввввввввввввввв
...
Sample Output: So, I find myself, over and over again, thinking about my German mother.
...


## Step 7: Data Preprocessing

In [17]:
def preprocess_function(examples):
    """Preprocess data for the NLLB model using cleaned DataFrame."""

    # Handle batched input (examples is a dict of lists)
    inputs = examples["input"]
    targets = examples["Output"]

    # Tokenize input texts
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Tokenize target texts (English)
    labels = tokenizer(
        targets,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


print("🔄 Preprocessing cleaned data...")

# Convert cleaned data to Hugging Face dataset
train_dataset = Dataset.from_pandas(cleaned_train_df)

# ✅ Remove unnecessary columns that could confuse the tokenizer
train_dataset = train_dataset.remove_columns(
    [col for col in train_dataset.column_names if col not in ["input", "Output"]]
)

# ✅ Now map with batching
train_dataset = train_dataset.map(preprocess_function, batched=True, batch_size=1000)

print("✅ Data preprocessing complete!")
print(f"Training samples: {len(train_dataset)}")


🔄 Preprocessing cleaned data...


Map:   0%|          | 0/223806 [00:00<?, ? examples/s]

✅ Data preprocessing complete!
Training samples: 223806


## Step 8: Training Configuration

In [18]:
# Create output directory
import os
output_dir = "/content/drive/MyDrive/dsn-inhouse-hackathon-files/nllb-finetuned"
os.makedirs(output_dir, exist_ok=True)

# Training arguments optimized for NLLB
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    eval_strategy="no",  # Simple training without validation split
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    fp16=True if device == "cuda" else False,
    logging_steps=50,
    warmup_steps=100,
    report_to="none",
    dataloader_pin_memory=False,
    remove_unused_columns=False,  # Important for custom datasets
)

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

print("✅ Training configuration set up!")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Training epochs: {training_args.num_train_epochs}")
print(f"Learning rate: {training_args.learning_rate}")

✅ Training configuration set up!
Batch size: 4
Training epochs: 3
Learning rate: 3e-05


In [19]:
from datasets import Dataset

# Rebuild the dataset with correct preprocessing
train_dataset = Dataset.from_pandas(cleaned_train_df)

# Apply preprocessing function
train_dataset = train_dataset.map(preprocess_function, batched=True)

# Remove original text columns and the index column after preprocessing
columns_to_remove = ['input', 'Output', 'Language']
if '__index_level_0__' in train_dataset.column_names:
    columns_to_remove.append('__index_level_0__')

train_dataset = train_dataset.remove_columns(columns_to_remove)


print("✅ Data preprocessing complete!")
print(f"Training samples: {len(train_dataset)}")
print(f"Remaining columns: {train_dataset.column_names}")

Map:   0%|          | 0/223806 [00:00<?, ? examples/s]

✅ Data preprocessing complete!
Training samples: 223806
Remaining columns: ['input_ids', 'attention_mask', 'labels']


In [20]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

## Step 9: Model Training

In [21]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.gradient_checkpointing_enable()

In [None]:
# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("🎯 Starting model training...")
print("This will take 20-60 minutes depending on your GPU")

# Start training
training_results = trainer.train()

print("✅ Training completed!")
print(f"Final training loss: {training_results.metrics['train_loss']:.4f}")

# Save the fine-tuned model
trainer.save_model()
tokenizer.save_pretrained(output_dir)
print("💾 Model saved successfully!")

🎯 Starting model training...
This will take 20-60 minutes depending on your GPU


`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`...


Step,Training Loss
50,11.7105
100,9.6046
150,7.0026
200,5.5686
250,4.5108
300,3.6477
350,2.7046
400,1.9172
450,1.2882
500,0.8355
