# Romanian Training Data Exploration

This notebook helps explore and analyze Romanian training data for Llama fine-tuning.

In [None]:
import json
import random
import re
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Plot styling applied to the figures created below
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data

Load the prepared training and validation data.

In [None]:
def load_jsonl(file_path):
    """Load a JSONL file into a list of parsed records.

    Blank or whitespace-only lines are skipped instead of being handed to the
    JSON parser, so stray empty lines in the file do not abort the load.

    Args:
        file_path: Path of the JSONL file to read. The file is decoded as UTF-8.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and 1-based line number.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type so existing handlers keep
                # working, but say where in the file the bad record lives.
                raise json.JSONDecodeError(
                    f"{file_path}:{line_number}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return data

# Training split
train_path = '../data/splits/train.jsonl'
train_data = load_jsonl(train_path)
train_count = len(train_data)
print(f"Loaded {train_count} training examples")

# Validation split
val_path = '../data/splits/val.jsonl'
val_data = load_jsonl(val_path)
val_count = len(val_data)
print(f"Loaded {val_count} validation examples")

## 2. Data Structure Analysis

Examine the structure of the training examples.

In [None]:
# Pretty-print the first training record; ensure_ascii=False keeps
# non-ASCII characters as-is instead of \uXXXX escapes.
print("Example training instance:")
first_example = train_data[0]
first_example_json = json.dumps(first_example, indent=2, ensure_ascii=False)
print(first_example_json)

In [None]:
# Extract message statistics
def extract_message_info(data):
    """Summarize the user and assistant text of each example as a DataFrame.

    For every example the ``'messages'`` list is scanned in order. The content
    of the *last* message with role ``'user'`` and the *last* message with role
    ``'assistant'`` is kept; messages with any other role are ignored. A role
    that never appears contributes an empty string.

    Args:
        data: Iterable of dicts, each with a ``'messages'`` list whose items
            have ``'role'`` and ``'content'`` keys.

    Returns:
        pandas.DataFrame: One row per example with columns ``user_msg``,
        ``assistant_msg``, ``user_length``, ``assistant_length`` and
        ``total_length`` (lengths are in characters). The columns are present
        even when ``data`` is empty, so column selections on the result work.
    """
    columns = [
        'user_msg',
        'assistant_msg',
        'user_length',
        'assistant_length',
        'total_length',
    ]
    info = []

    for example in data:
        user_msg = ''
        assistant_msg = ''

        # Later messages of the same role overwrite earlier ones.
        for msg in example['messages']:
            if msg['role'] == 'user':
                user_msg = msg['content']
            elif msg['role'] == 'assistant':
                assistant_msg = msg['content']

        user_length = len(user_msg)
        assistant_length = len(assistant_msg)
        info.append({
            'user_msg': user_msg,
            'assistant_msg': assistant_msg,
            'user_length': user_length,
            'assistant_length': assistant_length,
            'total_length': user_length + assistant_length,
        })

    # Passing the column list explicitly keeps the schema stable for an
    # empty input, where pandas could not infer it from the records.
    return pd.DataFrame(info, columns=columns)

train_df = extract_message_info(train_data)

# Summary statistics for the three character-length columns
length_columns = ['user_length', 'assistant_length', 'total_length']
print("\nTraining data statistics:")
length_stats = train_df[length_columns].describe()
print(length_stats)

## 3. Length Distribution Analysis

In [None]:
# Plot length distributions
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One panel per length column: (column, bar color, panel title)
panels = [
    ('user_length', 'skyblue', 'User Message Length Distribution'),
    ('assistant_length', 'lightcoral', 'Assistant Message Length Distribution'),
    ('total_length', 'lightgreen', 'Total Conversation Length Distribution'),
]

for ax, (column, color, title) in zip(axes, panels):
    ax.hist(train_df[column], bins=50, color=color, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel('Characters')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 4. Content Analysis

Analyze the content of user prompts and assistant responses.

In [None]:
# Extract common words from user messages
def get_common_words(text_series, top_n=20):
    """Return the most frequent words in a series of texts.

    All texts are joined, lower-cased and split into words of at least three
    letters. A letter is ASCII ``a-z`` or a Romanian diacritic. Both the
    comma-below forms (U+0219, U+021B) and the legacy cedilla forms
    (U+015F, U+0163) of s/t are accepted, so words written with the cedilla
    forms are counted rather than silently dropped.

    Args:
        text_series: pandas Series of strings.
        top_n: Maximum number of ``(word, count)`` pairs to return.

    Returns:
        list[tuple[str, int]]: Up to ``top_n`` pairs ordered by descending
        count, as produced by ``Counter.most_common``.
    """
    # Combine all text; lower-casing first means the pattern only needs the
    # lower-case letters.
    all_text = ' '.join(text_series.values).lower()

    # a-z, then a-breve, a-circumflex, i-circumflex, s/t comma-below,
    # s/t cedilla. Escapes are used so the exact code points are unambiguous.
    word_pattern = r'\b[a-z\u0103\u00e2\u00ee\u0219\u021b\u015f\u0163]{3,}\b'
    words = re.findall(word_pattern, all_text)

    return Counter(words).most_common(top_n)

# Top 15 words across all user prompts, printed one per line
user_common = get_common_words(train_df['user_msg'], top_n=15)
print("Most common words in user prompts:")
for entry in user_common:
    term, frequency = entry
    print(f"  {term}: {frequency}")

In [None]:
# Visualize common words
words, counts = zip(*user_common)

# Horizontal bars; the y-axis is inverted so the first entry of
# user_common is drawn at the top.
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(words, counts, color='steelblue')
ax.set_xlabel('Frequency')
ax.set_title('Most Common Words in User Prompts')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## 5. Sample Examples

View random examples from the dataset.

In [None]:
# Show up to 5 random examples.
# A dedicated, seeded generator makes this cell reproducible on a fresh
# kernel run. Sampling without replacement never shows the same example
# twice and copes with a dataset that has fewer than 5 rows (or none).
sample_rng = random.Random(42)
sample_size = min(5, len(train_df))
sample_indices = sample_rng.sample(range(len(train_df)), k=sample_size)

print("Random training examples:\n")
for i, idx in enumerate(sample_indices):
    row = train_df.iloc[idx]
    print(f"Example {i+1}:")
    # Truncate long messages so the preview stays readable
    print(f"  User: {row['user_msg'][:100]}...")
    print(f"  Assistant: {row['assistant_msg'][:150]}...")
    print()

## 6. Quality Checks

Perform basic quality checks on the data.

In [None]:
# Work on the two length columns directly
user_lengths = train_df['user_length']
assistant_lengths = train_df['assistant_length']

# Check for empty messages
empty_user = user_lengths.eq(0).sum()
empty_assistant = assistant_lengths.eq(0).sum()

print(f"Empty user messages: {empty_user}")
print(f"Empty assistant messages: {empty_assistant}")

# Check for very short messages
short_user = user_lengths.lt(10).sum()
short_assistant = assistant_lengths.lt(20).sum()

print(f"\nVery short user messages (<10 chars): {short_user}")
print(f"Very short assistant messages (<20 chars): {short_assistant}")

# Check for very long messages (might need truncation)
long_total = train_df['total_length'].gt(2000).sum()
print(f"\nVery long conversations (>2000 chars): {long_total}")

## 7. Romanian Diacritics Check

Verify Romanian-specific characters are present.

In [None]:
# Check for Romanian diacritics
def has_romanian_chars(text):
    """Return True when ``text`` contains at least one character from the
    Romanian diacritic set in the pattern below, otherwise False."""
    diacritic_pattern = r'[ăâîșțĂÂÎȘȚ]'
    first_match = re.search(diacritic_pattern, text)
    return first_match is not None

# Flag each assistant message that contains a Romanian diacritic, then count
diacritic_mask = train_df['assistant_msg'].apply(has_romanian_chars)
with_diacritics = diacritic_mask.sum()
percentage = (with_diacritics / len(train_df)) * 100

print(f"Examples with Romanian diacritics: {with_diacritics}/{len(train_df)} ({percentage:.1f}%)")

# Show the first flagged message, if there is one
print("\nExample with diacritics:")
flagged_msgs = train_df.loc[diacritic_mask, 'assistant_msg']
if not flagged_msgs.empty:
    print(f"  {flagged_msgs.iloc[0][:100]}...")

## 8. Train/Val Split Comparison

Compare training and validation distributions.

In [None]:
val_df = extract_message_info(val_data)

# Print the same describe() table for each split, one after the other
print("Training vs Validation Statistics:")
for heading, frame in (("Training", train_df), ("Validation", val_df)):
    print(f"\n{heading}:")
    print(frame[['user_length', 'assistant_length', 'total_length']].describe())

In [None]:
# Plot comparison
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One panel per length column: (column, panel title)
comparisons = [
    ('user_length', 'User Message Length Comparison'),
    ('assistant_length', 'Assistant Message Length Comparison'),
]

for ax, (column, title) in zip(axes, comparisons):
    # Give both splits the same value range so their 30 bins share identical
    # edges; otherwise each histogram picks its own edges and the overlaid
    # bars are not comparable.
    combined = pd.concat([train_df[column], val_df[column]])
    shared_range = (combined.min(), combined.max())

    # density=True normalizes each histogram to unit area, so the two splits
    # can be compared even though they contain different numbers of examples.
    hist_kwargs = dict(bins=30, range=shared_range, density=True, alpha=0.5)

    ax.hist(train_df[column], label='Train', color='blue', **hist_kwargs)
    ax.hist(val_df[column], label='Val', color='orange', **hist_kwargs)
    ax.set_title(title)
    ax.set_xlabel('Characters')
    ax.set_ylabel('Density')
    ax.legend()

plt.tight_layout()
plt.show()

## 9. Export Summary

Create a summary report of the data.

In [None]:
# Build the summary from built-in int/float values only. pandas reductions
# such as Series.max()/min() return NumPy integer scalars, which the json
# module refuses to serialize, so every statistic is converted explicitly.
summary = {
    'dataset': {
        'train_size': len(train_data),
        'val_size': len(val_data),
        'total_size': len(train_data) + len(val_data)
    },
    'statistics': {
        'avg_user_length': float(train_df['user_length'].mean()),
        'avg_assistant_length': float(train_df['assistant_length'].mean()),
        'avg_total_length': float(train_df['total_length'].mean()),
        'max_total_length': int(train_df['total_length'].max()),
        'min_total_length': int(train_df['total_length'].min())
    },
    'quality': {
        'examples_with_diacritics': int(with_diacritics),
        'diacritics_percentage': float(percentage),
        'empty_messages': int(empty_user + empty_assistant),
        'short_messages': int(short_user + short_assistant),
        'long_conversations': int(long_total)
    }
}

print("\nDataset Summary:")
print(json.dumps(summary, indent=2))

# Save to file (explicit encoding so the result does not depend on the
# platform default)
with open('../data/data_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

print("\nSummary saved to ../data/data_summary.json")

## Conclusions

Based on the analysis:
1. Review the length distributions - ensure they're appropriate for your use case
2. Check the quality metrics - address any issues (empty messages, very short/long messages)
3. Verify Romanian diacritics are present - essential for proper Romanian language modeling
4. Ensure train/val distributions are similar

If everything looks good, proceed with training!