In [1]:
import os
import polars as pl
input_df = pl.read_csv("../data/RTVSlo/PrometnoPorocilo2022.csv", encoding="Windows-1252")
output_df = pl.read_csv("../data/RTVSlo/Joined_rtf_files.csv", encoding="utf-8")
input_df = input_df.with_columns(pl.col("Datum").str.strptime(pl.Datetime, "%m/%d/%Y %H:%M"))
output_df = output_df.with_columns(pl.col("Datum").str.strptime(pl.Datetime, "%m/%d/%Y %H:%M"))

In [2]:
import chardet
with open("../data/RTVSlo/PrometnoPorocilo2022.csv", "rb") as f:
    print(chardet.detect(f.read(1000000)))


{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


Fine tunning with transformers

In [3]:
import polars as pl
from datetime import datetime

def get_first_input_before_time(output_df_row, input_df):

    #'04/30/2022 18:30'
    target_datetime = dict(zip(output_df.columns, output_df_row))["Datum"]

    
    filtered = input_df.filter(
        (pl.col("Datum").dt.date() == target_datetime.date()) &
        (pl.col("Datum").dt.time() < target_datetime.time())
    )

    def print_all_rows():
        for i in range(len(filtered)):
            print(filtered.row(i))

    if filtered.height > 0:
       #print_all_rows()
       return filtered.sort("Datum").row(-1)
    else:
        return None


test = get_first_input_before_time(output_df.row(16), input_df)
print("Target row datetime:", dict(zip(output_df.columns, output_df.row(16)))["Datum"])
print("\nLast row before target time:", dict(zip(input_df.columns, test))["Datum"])


Target row datetime: 2022-04-26 13:00:00

Last row before target time: 2022-04-26 12:48:00


In [4]:
from html.parser import HTMLParser

class HTMLTextExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.text_parts = []

    def handle_data(self, data):
        self.text_parts.append(data)

    def get_text(self):
        return ''.join(self.text_parts).strip()


In [5]:
def generate_taffic_IO_examples(input_df, output_df, n_examples=1):
    traffic_examples = []
    #for i in range(len(output_df)):
    for i in range(n_examples):
        output_df_row = output_df.row(i)
        input_row = get_first_input_before_time(output_df_row, input_df)

        def strip_html(html):
            parser = HTMLTextExtractor()
            parser.feed(html)
            return parser.get_text()

        def parse_input_message(input_row, columns):
            row_dict = dict(zip(columns, input_row))
            message = ""
            for column in ["A1", "B1", "C1"]:
                value = row_dict.get(column)
                if value is not None and value != "NULL":
                    clean_value = strip_html(value)
                    message += f"{column}: {clean_value} "
            #print("Input message:", message.strip())
            return message.strip()

        
        def parse_output_message(output_row, columns):
            row_dict = dict(zip(columns, output_row))
            message = ""
            for column in ["content_01", "content_02", "content_03", "content_04", "content_05"]:
                if row_dict.get(column) is not None:
                    message += f"{row_dict[column]} "
            return message.strip()


        traffic_example = {
            "messages": [
                {"role": "user", "content": parse_input_message(input_row, input_df.columns)},
                {"role": "assistant", "content": parse_output_message(output_df_row, output_df.columns)}
            ]
        }
        traffic_examples.append(traffic_example)
    return traffic_examples

traffic_examples = generate_taffic_IO_examples(input_df, output_df, n_examples=10)
#print(traffic_examples)
#print in json format
import json
print(json.dumps(traffic_examples, indent=4, ensure_ascii=False))

[
    {
        "messages": [
            {
                "role": "user",
                "content": "B1: Nesre?eCesta Rožna Dolina - Ajševica je pri Ajševici zaprta.Cesta Pesek - Oplotnica je v Oplotnici zaprta.OpozorilaNa avtocesti od Sežane proti Mariboru, do Slovenskih Konjic, pelje izredni prevoz. Ob?asno je lahko promet oviran in upo?asnjen.Mejni prehodi?akalna doba je na mejnem prehodu Obrežje.Tovorni prometZaradi praznikov bo po Sloveniji v nedeljo, 1. 5. in v ponedeljek, 2. 5. med 8. in 22. uro, veljala omejitev prometa tovornih vozil, katerih najve?ja dovoljena masa presega 7,5 t."
            },
            {
                "role": "assistant",
                "content": "Zaradi prometne nesreče je zaprta regionalna cesta Ajševica-Rožna Dolina, in to pri Ajševici. Na mejnem prehodu Obrežje vozniki na vstop v državo čakajo do dve uri, v Gruškovju pa pol ure.  \nPovečan promet pri izstopu iz države pa je na prehodu Dobovec, na katerem vozniki čakajo uro in pol, ter na Obrež

In [6]:
from datasets import Dataset
dataset = Dataset.from_list(traffic_examples)
dataset = dataset.train_test_split(test_size=0.2)

print(dataset["train"])

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['messages'],
    num_rows: 8
})


In [7]:
# First, let's check what resources we have available
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.to("cuda")
def check_resources():
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"GPU device: {torch.cuda.get_device_name(0)}")
        print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    
    import psutil
    print(f"CPU cores: {psutil.cpu_count(logical=False)} physical, {psutil.cpu_count()} logical")
    print(f"RAM: {psutil.virtual_memory().total / 1024**3:.2f} GB total")
    print(f"Available RAM: {psutil.virtual_memory().available / 1024**3:.2f} GB")

# Check resources before starting
check_resources()

CUDA available: True
GPU device: NVIDIA GeForce RTX 3060 Laptop GPU
GPU memory: 6.00 GB
CPU cores: 14 physical, 20 logical
RAM: 31.69 GB total
Available RAM: 15.99 GB


In [8]:
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling

def train_model(model_id, dataset):

    model = AutoModelForCausalLM.from_pretrained(model_id)
    model.to("cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    
    def preprocess_function(examples):
        texts = []
        for conversation in examples["messages"]:
            formatted_text = ""
            for message in conversation:
                if message["role"] == "user":
                    formatted_text += f"Uporabnik: {message['content']}\n"
                else:
                    formatted_text += f"Asistent: {message['content']}\n"
            texts.append(formatted_text)
        
        tokenized = tokenizer(texts, padding="max_length", truncation=True, max_length=512)
        return tokenized
    
    tokenized_dataset = dataset.map(preprocess_function, batched=True)
    
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=6,
        per_device_eval_batch_size=6,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
    )
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=False  
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=data_collator,
    )
    
    trainer.train()
    
    model.save_pretrained("./fine-tuned-traffic-model")
    tokenizer.save_pretrained("./fine-tuned-traffic-model")
    
    return model, tokenizer

#model_id = "cjvt/OPT_GaMS-1B-Chat"
model_id = "cjvt/OPT_GaMS-1B-Chat"
model, tokenizer = train_model(model_id, dataset)


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 26.92it/s]
Map: 100%|██████████| 8/8 [00:00<00:00, 1019.86 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 584.65 examples/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 590.00 MiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 12.86 GiB is allocated by PyTorch, and 87.57 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [9]:
torch.cuda.empty_cache()

In [None]:
import gc
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# First, clear memory
gc.collect()
torch.cuda.empty_cache() if torch.cuda.is_available() else None

# Example traffic data (small sample to test with)
traffic_examples = [
    {
        "messages": [
            {"role": "user", "content": "Generiraj poročilo o prometu za avtocesto A1 med Ljubljano in Mariborom."},
            {"role": "assistant", "content": "Na avtocesti A1 med Ljubljano in Mariborom trenutno ni posebnosti. Promet poteka tekoče v obeh smereh."}
        ]
    },
    {
        "messages": [
            {"role": "user", "content": "Stanje na cestah v Ljubljani?"},
            {"role": "assistant", "content": "V Ljubljani trenutno ni večjih zastojev. Na Celovški cesti so dela, ki upočasnjujejo promet."}
        ]
    }
]

# Create a smaller dataset for initial testing
dataset = Dataset.from_list(traffic_examples)
dataset = dataset.train_test_split(test_size=0.5)  # 50/50 split for this tiny example

def train_model(model_id, dataset, device_map="auto"):
    try:
        # More careful model loading with resource management
        print(f"Loading model from {model_id}...")
        print(f"Available GPU: {torch.cuda.is_available()}, Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
        
        # Try to load with lower precision to save memory
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map=device_map,       # Distribute across available GPUs or use CPU
            torch_dtype=torch.float16,   # Use half precision to save memory
            low_cpu_mem_usage=True       # More memory-efficient loading
        )
        
        print("Model loaded successfully")
        
        # Load tokenizer (this is usually much lighter)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        print("Tokenizer loaded successfully")
        
        def preprocess_function(examples):
            texts = []
            for conversation in examples["messages"]:
                formatted_text = ""
                for message in conversation:
                    if message["role"] == "user":
                        formatted_text += f"Uporabnik: {message['content']}\n"
                    else:
                        formatted_text += f"Asistent: {message['content']}\n"
                texts.append(formatted_text)
            
            tokenized = tokenizer(texts, padding="max_length", truncation=True, max_length=512)
            return tokenized
        
        # Process the dataset
        print("Preprocessing dataset...")
        tokenized_dataset = dataset.map(preprocess_function, batched=True)
        print("Dataset preprocessed successfully")
        
        # Training arguments with lower memory footprint
        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=1,             # Start with fewer epochs for testing
            per_device_train_batch_size=1,  # Smaller batch size
            per_device_eval_batch_size=1,   # Smaller batch size
            warmup_steps=10,                # Fewer warmup steps
            weight_decay=0.01,
            logging_dir="./logs",
            logging_steps=1,
            # Memory optimization settings
            gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch
            fp16=torch.cuda.is_available(), # Use FP16 if GPU available
            dataloader_num_workers=0,       # Don't use multiple workers
        )
        
        # Create data collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False
        )
        
        # Create trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset["train"],
            eval_dataset=tokenized_dataset["test"],
            data_collator=data_collator,
        )
        
        print("Starting training...")
        trainer.train()
        print("Training completed")
        
        # Save model
        model.save_pretrained("./fine-tuned-traffic-model")
        tokenizer.save_pretrained("./fine-tuned-traffic-model")
        
        return model, tokenizer
        
    except Exception as e:
        print(f"Error during model loading or training: {e}")
        
        # If the error is related to CUDA out of memory, try CPU
        if "CUDA out of memory" in str(e) and device_map == "auto":
            print("GPU memory error detected. Trying to load on CPU instead...")
            return train_model(model_id, dataset, device_map="cpu")
        else:
            raise e

# Try loading with smaller model first to test
try:
    print("First trying a smaller model to verify environment...")
    small_model_id = "cjvt/OPT_SloT-300M-Chat"  # Much smaller model
    test_model = AutoModelForCausalLM.from_pretrained(small_model_id, torch_dtype=torch.float16)
    print("Small model loaded successfully, environment seems functional")
    del test_model  # Free the memory
    gc.collect()
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
except Exception as e:
    print(f"Error loading test model: {e}")

# Now try the actual model
try:
    model_id = "cjvt/OPT_GaMS-1B-Chat"
    model, tokenizer = train_model(model_id, dataset)
except Exception as e:
    print(f"Failed to train model: {e}")
    print("Attempting to load model without training...")
    try:
        # Just load the model to check if that's possible
        model = AutoModelForCausalLM.from_pretrained(
            model_id, 
            device_map="cpu",
            torch_dtype=torch.float16
        )
        print("Model loaded successfully without training")
    except Exception as e2:
        print(f"Model loading also failed: {e2}")

In [None]:
from transformers import pipeline
def generate_traffic_report(prompt, model_path="./fine-tuned-traffic-model"):

    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
    
    formatted_input = f"Uporabnik: {prompt}\nAsistent:"
    
    response = generator(formatted_input, max_length=200, do_sample=True, temperature=0.7, truncation=True)
    
    assistant_response = response[0]["generated_text"].split("Asistent:")[1].strip()
    
    return assistant_response

# Example usage
prompt = "Poročaj o stanju na primorski avtocesti med Vrhniko in Koprom. Dela na cesti pri Postojni."
report = generate_traffic_report(prompt)
print(report)


In [None]:
from transformers import pipeline

def evaluate_model(model_path, test_examples):
    # Create evaluation dataset
    eval_dataset = Dataset.from_list(test_examples)
    
    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    # Create generator
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")
    
    # Evaluate on each example
    results = []
    for example in test_examples:
        user_input = example["messages"][0]["content"]
        expected_output = example["messages"][1]["content"]
        
        # Generate response
        formatted_input = f"Uporabnik: {user_input}\nAsistent:"
        response = generator(formatted_input, max_length=200)
        generated = response[0]["generated_text"].split("Asistent:")[1].strip()
        
        # Store results
        results.append({
            "input": user_input,
            "expected": expected_output,
            "generated": generated
        })
    
    return results

# Example evaluation
test_examples = [
    {
        "messages": [
            {"role": "user", "content": "Kakšno je stanje na gorenjski avtocesti proti Jesenicam?"},
            {"role": "assistant", "content": "Na gorenjski avtocesti proti Jesenicam trenutno ni posebnosti ali zastojev. Promet poteka tekoče na celotnem odseku."}
        ]
    }
]

evaluation_results = evaluate_model("./fine-tuned-traffic-model", test_examples)
print(evaluation_results)

Evaluation will then follow next procedures.: 

1. Consistent Examples in Training Data
The most straightforward approach is providing numerous consistent examples:

Include many examples with correct road naming formats
Ensure consistency across all training examples
Cover all variations of road types (highways, regional roads, streets)

For example, always using formats like:

"avtocesta A1" (not just "A1")
"regionalna cesta R2-409" (with full designation)
"Celovška cesta" (with proper capitalization)

2. Instruction-Based Fine-tuning
You can explicitly teach the model road naming rules:
pythonroad_naming_examples = [
    {
        "messages": [
            {"role": "user", "content": "Kako pravilno poimenovati avtocesto A1?"},
            {"role": "assistant", "content": "Avtocesta A1 se pravilno imenuje 'avtocesta A1' ali 'avtocesta A1 Šentilj-Srmin'. Pri prvem omenjanju v poročilu uporabite polno ime, kasneje lahko uporabite krajšo obliko 'A1'."}
        ]
    },
    {
        "messages": [
            {"role": "user", "content": "Navedi pravila za poimenovanje cest v prometnih poročilih."},
            {"role": "assistant", "content": "Pri poimenovanju cest v prometnih poročilih upoštevajte naslednja pravila:\n1. Avtoceste: 'avtocesta A1', ne samo 'A1'\n2. Hitre ceste: 'hitra cesta H4', ne samo 'H4'\n3. Regionalne ceste: 'regionalna cesta R2-409'\n4. Mestne ceste: uporabite polno ime, npr. 'Celovška cesta', ne 'Celovška'"}
        ]
    }
]
Add these instructional examples to your training data to explicitly teach rules.
3. Data Augmentation with Rule Enforcement
Create augmented examples that specifically focus on proper naming:
python# Original example
original = {
    "messages": [
        {"role": "user", "content": "Poročaj o stanju na A1."},
        {"role": "assistant", "content": "Na avtocesti A1 med Ljubljano in Mariborom trenutno ni zastojev."}
    ]
}

# Add variations with correct naming to reinforce the rule
variations = [
    {
        "messages": [
            {"role": "user", "content": "Poročaj o stanju na A1 proti Mariboru."},
            {"role": "assistant", "content": "Na avtocesti A1 v smeri proti Mariboru trenutno ni posebnosti."}
        ]
    },
    {
        "messages": [
            {"role": "user", "content": "Kako je na A1?"},
            {"role": "assistant", "content": "Na avtocesti A1 Šentilj-Srmin je promet tekoč v obeh smereh."}
        ]
    }
]
4. Post-Processing Rules
For critical naming conventions, you can implement post-processing:
pythondef enforce_road_naming(generated_text):
    # Dictionary of road type patterns and their correct forms
    road_patterns = {
        r'\b(A\d+)\b(?! avtocest)': r'avtocesta \1',  # A1 → avtocesta A1
        r'\b(H\d+)\b(?! hitr)': r'hitra cesta \1',    # H4 → hitra cesta H4
        # Add more patterns as needed
    }
    
    # Apply all patterns
    result = generated_text
    for pattern, replacement in road_patterns.items():
        result = re.sub(pattern, replacement, result)
        
    return result

# Use this after generation
response = generate_traffic_report(prompt)
corrected_response = enforce_road_naming(response)
5. Evaluation and Filtering
Create a specific evaluation metric for road naming compliance:
pythondef evaluate_road_naming(generated_text):
    # Define patterns for improper road naming
    improper_patterns = [
        r'\b(A\d+)\b(?! avtocest)',  # A1 without "avtocesta"
        r'\b(H\d+)\b(?! hitr)',      # H4 without "hitra cesta"
        # Add more patterns
    ]
    
    # Count violations
    violations = 0
    for pattern in improper_patterns:
        violations += len(re.findall(pattern, generated_text))
        
    return {
        "violations": violations,
        "compliant": violations == 0
    }
Practical Implementation Approach
Here's how I recommend implementing these strategies:

Start with data quality: Ensure all your training examples use correct road naming
Add instruction examples: Include specific examples teaching the naming rules
Evaluate during development: Create a custom metric to track naming compliance
Implement post-processing: As a safety net for critical applications

By combining these approaches, you'll significantly improve your model's adherence to specific road naming conventions while still maintaining natural-sounding Slovenian text.
Would you like me to elaborate on any of these strategies or provide more specific examples for Slovenian road naming conventions?

Zero shot prompting

few shot prompting with all the rules exmplained?