# Occupation Code Prediction: TF-IDF vs LLM Embeddings

This notebook compares two text vectorization approaches for predicting occupation ISCO codes:
1. **TF-IDF (Term Frequency-Inverse Document Frequency)**: Traditional statistical method
2. **LLM Embeddings (Sentence Transformers)**: Deep learning-based semantic embeddings

## Dataset
- **Features**: occupation_description, industry_description, employment_sector
- **Target**: occupation_isco_code (445 classes)
- **Size**: 111,664 samples

## 1. Setup and Data Loading

In [None]:
# Install required packages
# NOTE(review): versions are not pinned, so a later re-run may pull different
# releases than the ones these results were produced with. `-q` keeps pip quiet.
!pip install sentence-transformers scikit-learn pandas numpy matplotlib seaborn -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sentence_transformers import SentenceTransformer
import time
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so warnings raised while fitting the
# models below would be hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')

# Set style: seaborn grid theme plus a default figure size for all plots
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# Load data (path is relative to the notebook's working directory)
df = pd.read_csv('lfs.csv')

print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nMissing values:")
print(df.isnull().sum())

## 2. Exploratory Data Analysis

In [None]:
# Target variable distribution: number of distinct codes and the most frequent ones
isco_codes = df['occupation_isco_code']
print("Number of unique occupation codes:", isco_codes.nunique())
print("\nTop 15 most common occupations:")
top_occupations = isco_codes.value_counts().head(15)
print(top_occupations)

# Bar chart of the 15 most frequent codes, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
top_occupations.plot(kind='bar', color='steelblue', ax=ax)
ax.set_title('Top 15 Most Common Occupation Codes')
ax.set_xlabel('Occupation ISCO Code')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Employment sector distribution (counts computed once, reused for print and plot)
sector_counts = df['employment_sector'].value_counts()
print("\nEmployment Sector Distribution:")
print(sector_counts)

fig, ax = plt.subplots(figsize=(10, 5))
sector_counts.plot(kind='bar', color='coral', ax=ax)
ax.set_title('Employment Sector Distribution')
ax.set_xlabel('Sector')
ax.set_ylabel('Count')
# Right-align the rotated labels so each one ends under its bar
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Text length analysis: character counts of the two description fields,
# stored on df as new columns
for source_col, length_col in [
    ('occupation_description', 'occupation_length'),
    ('industry_description', 'industry_length'),
]:
    df[length_col] = df[source_col].str.len()

print("\nText Length Statistics:")
print("\nOccupation Description:")
print(df['occupation_length'].describe())
print("\nIndustry Description:")
print(df['industry_length'].describe())

# One histogram panel per length column
histogram_specs = [
    ('occupation_length', 'Occupation Description Length Distribution', 'skyblue'),
    ('industry_length', 'Industry Description Length Distribution', 'lightcoral'),
]
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, (length_col, panel_title, fill_color) in zip(axes, histogram_specs):
    ax.hist(df[length_col], bins=50, color=fill_color, edgecolor='black')
    ax.set_title(panel_title)
    ax.set_xlabel('Character Length')
    ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

## 3. Data Preprocessing

In [None]:
# Combine text features into a single column.
# Missing values are replaced with '' BEFORE the string cast: astype(str) on a
# missing value yields the literal text "nan", which would otherwise be fed to
# both the TF-IDF vectorizer and the embedding model as if it were a real word.
# Rows without missing values produce exactly the same text as before.
df['combined_text'] = (df['occupation_description'].fillna('').astype(str) + ' ' +
                       df['industry_description'].fillna('').astype(str) + ' ' +
                       df['employment_sector'].fillna('').astype(str))

print("Sample combined text:")
print(df['combined_text'].iloc[0])
print("\nCombined text length:")
print(df['combined_text'].str.len().describe())

In [None]:
# Prepare features and target
X = df['combined_text']
y = df['occupation_isco_code']

# A stratified split needs at least two samples of every class; scikit-learn
# raises ValueError otherwise. Classes that occur only once cannot be split, so
# they are kept aside and added to the training set after the split.
samples_per_class = y.map(y.value_counts())
singleton_mask = samples_per_class < 2

# Train-test split (80-20)
if singleton_mask.any():
    X_train, X_test, y_train, y_test = train_test_split(
        X[~singleton_mask], y[~singleton_mask],
        test_size=0.2, random_state=42, stratify=y[~singleton_mask]
    )
    # Single-sample classes can only be learned, never evaluated
    X_train = pd.concat([X_train, X[singleton_mask]])
    y_train = pd.concat([y_train, y[singleton_mask]])
    print(f"Note: {int(singleton_mask.sum()):,} single-sample classes were placed in the training set only")
else:
    # No singleton classes: identical to a plain stratified split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

print(f"Training set size: {len(X_train):,}")
print(f"Test set size: {len(X_test):,}")
print(f"\nClass distribution in training set:")
print(y_train.value_counts().head())

## 4. Approach 1: TF-IDF Vectorization

TF-IDF (Term Frequency-Inverse Document Frequency) is a statistical method that:
- Represents text as sparse vectors based on word frequencies
- Weights each term by how often it occurs in a document, discounted by how many documents contain it (terms that are rare across the corpus get higher weights)
- Fast and interpretable
- No semantic understanding

In [None]:
section_rule = "=" * 60
print(section_rule)
print("APPROACH 1: TF-IDF VECTORIZATION")
print(section_rule)

# Create TF-IDF vectorizer (the timer covers construction, fit and both transforms)
start_time = time.time()
print("\n[1/3] Creating TF-IDF vectors...")

tfidf_settings = {
    'max_features': 5000,   # Limit to top 5000 features
    'ngram_range': (1, 2),  # Use unigrams and bigrams
    'min_df': 2,            # Ignore terms that appear in less than 2 documents
    'max_df': 0.8,          # Ignore terms that appear in more than 80% of documents
}
tfidf = TfidfVectorizer(**tfidf_settings)

# Vocabulary is learned from the training split only, then reused for the test split
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

vectorization_time = time.time() - start_time

# Sparsity = share of cells in the training matrix that are zero
n_rows, n_cols = X_train_tfidf.shape
sparsity_pct = (1 - X_train_tfidf.nnz / (n_rows * n_cols)) * 100

print(f"   ✓ Vectorization completed in {vectorization_time:.2f} seconds")
print(f"   ✓ Feature matrix shape: {X_train_tfidf.shape}")
print(f"   ✓ Sparsity: {sparsity_pct:.2f}%")

### 4.1 Model Training: Logistic Regression

In [None]:
print("\n[2/3] Training Logistic Regression model...")

# Classifier settings kept in one place; same values as the embeddings run
lr_settings = dict(max_iter=1000, random_state=42, n_jobs=-1, verbose=0)

# Time only construction + fit
fit_started = time.time()
lr_tfidf = LogisticRegression(**lr_settings)
lr_tfidf.fit(X_train_tfidf, y_train)
training_time = time.time() - fit_started

print(f"   ✓ Training completed in {training_time:.2f} seconds")

In [None]:
print("\n[3/3] Evaluating model...")

# Time only the predict() call
predict_started = time.time()
y_pred_tfidf = lr_tfidf.predict(X_test_tfidf)
prediction_time = time.time() - predict_started

# Calculate metrics on the held-out test split
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
f1_macro_tfidf, f1_weighted_tfidf = (
    f1_score(y_test, y_pred_tfidf, average=averaging)
    for averaging in ('macro', 'weighted')
)

# End-to-end cost of this pipeline: vectorize + fit + predict
tfidf_total_time = vectorization_time + training_time + prediction_time

print(f"   ✓ Prediction completed in {prediction_time:.2f} seconds")
print("\n" + "="*60)
print("TF-IDF RESULTS")
print("="*60)
print(f"Accuracy:          {accuracy_tfidf*100:.2f}%")
print(f"F1-Score (Macro):  {f1_macro_tfidf:.4f}")
print(f"F1-Score (Weighted): {f1_weighted_tfidf:.4f}")
print(f"\nTiming:")
print(f"  Vectorization:   {vectorization_time:.2f}s")
print(f"  Training:        {training_time:.2f}s")
print(f"  Prediction:      {prediction_time:.2f}s")
print(f"  Total:           {tfidf_total_time:.2f}s")

### 4.2 Alternative: Random Forest with TF-IDF

In [None]:
print("\nTraining Random Forest with TF-IDF...")

rf_settings = dict(n_estimators=100, max_depth=20, random_state=42, n_jobs=-1, verbose=0)

# The timer covers construction + fit only; prediction below is not timed
fit_started = time.time()
rf_tfidf = RandomForestClassifier(**rf_settings)
rf_tfidf.fit(X_train_tfidf, y_train)
rf_training_time = time.time() - fit_started

y_pred_rf_tfidf = rf_tfidf.predict(X_test_tfidf)

accuracy_rf_tfidf = accuracy_score(y_test, y_pred_rf_tfidf)
f1_weighted_rf_tfidf = f1_score(y_test, y_pred_rf_tfidf, average='weighted')

print(f"\nRandom Forest + TF-IDF Results:")
print(f"Accuracy:          {accuracy_rf_tfidf*100:.2f}%")
print(f"F1-Score (Weighted): {f1_weighted_rf_tfidf:.4f}")
print(f"Training Time:     {rf_training_time:.2f}s")

## 5. Approach 2: LLM Embeddings (Sentence Transformers)

Sentence Transformers create dense semantic embeddings that:
- Capture semantic meaning and context
- Generate fixed-size dense vectors
- Pre-trained on large corpora
- Better generalization but slower and more resource-intensive

In [None]:
section_rule = "=" * 60
print(section_rule)
print("APPROACH 2: LLM EMBEDDINGS (SENTENCE TRANSFORMERS)")
print(section_rule)

# Load pre-trained model
print("\n[1/4] Loading Sentence Transformer model...")
# Using a lightweight multilingual model; the name is held once so the
# loaded model and the printed label cannot drift apart
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
embedding_model = SentenceTransformer(model_name)
embedding_dim = embedding_model.get_sentence_embedding_dimension()

print("   ✓ Model loaded successfully")
print(f"   ✓ Model: {model_name}")
print(f"   ✓ Embedding dimension: {embedding_dim}")

In [None]:
print("\n[2/4] Generating embeddings for training set...")

# Generate embeddings (with progress bar); the timer wraps only the encode() call
encode_started = time.time()
X_train_embeddings = embedding_model.encode(
    list(X_train),
    batch_size=32,
    show_progress_bar=True,
)
train_embedding_time = time.time() - encode_started

print(f"   ✓ Training embeddings generated in {train_embedding_time:.2f} seconds")
print(f"   ✓ Shape: {X_train_embeddings.shape}")

In [None]:
print("\n[3/4] Generating embeddings for test set...")

# Same encode settings as the training split; the timer wraps only encode()
encode_started = time.time()
X_test_embeddings = embedding_model.encode(
    list(X_test),
    batch_size=32,
    show_progress_bar=True,
)
test_embedding_time = time.time() - encode_started

print(f"   ✓ Test embeddings generated in {test_embedding_time:.2f} seconds")
print(f"   ✓ Shape: {X_test_embeddings.shape}")

# Combined embedding cost, used later in the timing comparison
total_embedding_time = sum((train_embedding_time, test_embedding_time))

### 5.1 Model Training: Logistic Regression

In [None]:
print("\n[4/4] Training Logistic Regression model...")

# Same classifier settings as the TF-IDF run so only the features differ
lr_settings = dict(max_iter=1000, random_state=42, n_jobs=-1, verbose=0)

# Time only construction + fit
fit_started = time.time()
lr_embeddings = LogisticRegression(**lr_settings)
lr_embeddings.fit(X_train_embeddings, y_train)
training_time_emb = time.time() - fit_started

print(f"   ✓ Training completed in {training_time_emb:.2f} seconds")

In [None]:
print("\nEvaluating model...")

# Time only the predict() call
predict_started = time.time()
y_pred_embeddings = lr_embeddings.predict(X_test_embeddings)
prediction_time_emb = time.time() - predict_started

# Calculate metrics on the held-out test split
accuracy_embeddings = accuracy_score(y_test, y_pred_embeddings)
f1_macro_embeddings, f1_weighted_embeddings = (
    f1_score(y_test, y_pred_embeddings, average=averaging)
    for averaging in ('macro', 'weighted')
)

# End-to-end cost of this pipeline: embed + fit + predict
embeddings_total_time = total_embedding_time + training_time_emb + prediction_time_emb

print(f"   ✓ Prediction completed in {prediction_time_emb:.2f} seconds")
print("\n" + "="*60)
print("LLM EMBEDDINGS RESULTS")
print("="*60)
print(f"Accuracy:          {accuracy_embeddings*100:.2f}%")
print(f"F1-Score (Macro):  {f1_macro_embeddings:.4f}")
print(f"F1-Score (Weighted): {f1_weighted_embeddings:.4f}")
print(f"\nTiming:")
print(f"  Embedding Generation: {total_embedding_time:.2f}s")
print(f"  Training:        {training_time_emb:.2f}s")
print(f"  Prediction:      {prediction_time_emb:.2f}s")
print(f"  Total:           {embeddings_total_time:.2f}s")

### 5.2 Alternative: Random Forest with Embeddings

In [None]:
print("\nTraining Random Forest with Embeddings...")

rf_settings = dict(n_estimators=100, max_depth=20, random_state=42, n_jobs=-1, verbose=0)

# The timer covers construction + fit only; prediction below is not timed
fit_started = time.time()
rf_embeddings = RandomForestClassifier(**rf_settings)
rf_embeddings.fit(X_train_embeddings, y_train)
rf_training_time_emb = time.time() - fit_started

y_pred_rf_embeddings = rf_embeddings.predict(X_test_embeddings)

accuracy_rf_embeddings = accuracy_score(y_test, y_pred_rf_embeddings)
f1_weighted_rf_embeddings = f1_score(y_test, y_pred_rf_embeddings, average='weighted')

print(f"\nRandom Forest + Embeddings Results:")
print(f"Accuracy:          {accuracy_rf_embeddings*100:.2f}%")
print(f"F1-Score (Weighted): {f1_weighted_rf_embeddings:.4f}")
print(f"Training Time:     {rf_training_time_emb:.2f}s")

## 6. Comparison and Visualization

In [None]:
# Create comparison dataframe: one row per (features, classifier) pairing.
# NOTE(review): the two Random Forest totals omit prediction time (it is not
# measured in the RF cells), whereas the Logistic Regression totals include it,
# so the "Total Time (s)" column is not strictly like-for-like.
comparison_rows = [
    # (method, accuracy in %, weighted F1, total seconds)
    ('TF-IDF + LR',
     accuracy_tfidf * 100,
     f1_weighted_tfidf,
     vectorization_time + training_time + prediction_time),
    ('TF-IDF + RF',
     accuracy_rf_tfidf * 100,
     f1_weighted_rf_tfidf,
     vectorization_time + rf_training_time),
    ('Embeddings + LR',
     accuracy_embeddings * 100,
     f1_weighted_embeddings,
     total_embedding_time + training_time_emb + prediction_time_emb),
    ('Embeddings + RF',
     accuracy_rf_embeddings * 100,
     f1_weighted_rf_embeddings,
     total_embedding_time + rf_training_time_emb),
]
comparison = pd.DataFrame(
    comparison_rows,
    columns=['Method', 'Accuracy', 'F1 (Weighted)', 'Total Time (s)'],
)

table_rule = "=" * 80
print("\n" + table_rule)
print("FINAL COMPARISON")
print(table_rule)
print(comparison.to_string(index=False))
print(table_rule)

In [None]:
# Visualize comparison: three bar panels sharing the same layout, differing only
# in the column plotted, labels, y-range, and how the value labels are formatted
bar_colors = ['steelblue', 'skyblue', 'coral', 'lightsalmon']
time_label_offset = max(comparison['Total Time (s)']) * 0.02

panel_specs = [
    # (column, title, y-label, y-limits or None, label offset, label format)
    ('Accuracy', 'Accuracy Comparison', 'Accuracy (%)', [0, 100], 1, '{:.2f}%'),
    ('F1 (Weighted)', 'F1 Score (Weighted) Comparison', 'F1 Score', [0, 1], 0.02, '{:.4f}'),
    ('Total Time (s)', 'Total Time Comparison', 'Time (seconds)', None, time_label_offset, '{:.1f}s'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, (column, panel_title, y_label, y_limits, label_offset, label_format) in zip(axes, panel_specs):
    ax.bar(comparison['Method'], comparison[column], color=bar_colors)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_ylabel(y_label)
    if y_limits is not None:
        ax.set_ylim(y_limits)
    ax.tick_params(axis='x', rotation=45)
    # Write each bar's value just above its top
    for bar_position, bar_value in enumerate(comparison[column]):
        ax.text(bar_position, bar_value + label_offset, label_format.format(bar_value),
                ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

## 7. Detailed Performance Analysis

In [None]:
# Show classification report for best performing model (highest accuracy row)
best_model_name = comparison.loc[comparison['Accuracy'].idxmax(), 'Method']
print(f"\nDetailed Classification Report for Best Model: {best_model_name}")
print("="*80)

# Pick the matching predictions from the method label:
# key = (label mentions 'Embeddings', label mentions 'RF')
predictions_by_kind = {
    (True, True): y_pred_rf_embeddings,
    (True, False): y_pred_embeddings,
    (False, True): y_pred_rf_tfidf,
    (False, False): y_pred_tfidf,
}
y_pred_best = predictions_by_kind[('Embeddings' in best_model_name, 'RF' in best_model_name)]

print(classification_report(y_test, y_pred_best, zero_division=0))

In [None]:
# Per-class accuracy for top 10 classes (by training-set frequency), comparing
# the two Logistic Regression models. accuracy_score comes from the notebook's
# import cell; it is not re-imported here.
top_10_classes = y_train.value_counts().head(10).index

# Only classes that actually occur in the test split can be scored. They are
# tracked explicitly so bar positions, bar heights and tick labels always have
# the same length, even if a class is skipped.
plotted_classes = []
class_accuracies_tfidf = []
class_accuracies_emb = []

for cls in top_10_classes:
    # NumPy boolean mask: the predictions are NumPy arrays, y_test is a Series
    mask = (y_test == cls).to_numpy()
    if not mask.any():
        continue
    plotted_classes.append(cls)
    acc_tfidf = accuracy_score(y_test[mask], y_pred_tfidf[mask])
    acc_emb = accuracy_score(y_test[mask], y_pred_embeddings[mask])
    class_accuracies_tfidf.append(acc_tfidf * 100)
    class_accuracies_emb.append(acc_emb * 100)

# Plot per-class accuracy as side-by-side bars
x = np.arange(len(plotted_classes))
width = 0.35

fig, ax = plt.subplots(figsize=(14, 6))
bars1 = ax.bar(x - width/2, class_accuracies_tfidf, width, label='TF-IDF + LR', color='steelblue')
bars2 = ax.bar(x + width/2, class_accuracies_emb, width, label='Embeddings + LR', color='coral')

ax.set_xlabel('Occupation Code')
ax.set_ylabel('Accuracy (%)')
ax.set_title('Per-Class Accuracy for Top 10 Most Common Occupations')
ax.set_xticks(x)
ax.set_xticklabels([f'{int(c)}' for c in plotted_classes], rotation=45)
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Error Analysis

In [None]:
# Compare predictions of the two Logistic Regression models row by row
comparison_df = pd.DataFrame({
    'text': X_test.values,
    'true_label': y_test.values,
    'tfidf_pred': y_pred_tfidf,
    'embedding_pred': y_pred_embeddings
})

# Per-row correctness of each model, computed once
embedding_correct = comparison_df['embedding_pred'] == comparison_df['true_label']
tfidf_correct = comparison_df['tfidf_pred'] == comparison_df['true_label']

# Cases where embeddings got it right but TF-IDF didn't
embeddings_better = comparison_df[embedding_correct & ~tfidf_correct]

# Cases where TF-IDF got it right but embeddings didn't
tfidf_better = comparison_df[tfidf_correct & ~embedding_correct]

print(f"\nCases where Embeddings outperformed TF-IDF: {len(embeddings_better)}")
print(f"Cases where TF-IDF outperformed Embeddings: {len(tfidf_better)}")
print(f"\nExamples where Embeddings performed better:")
example_columns = ['text', 'true_label', 'tfidf_pred', 'embedding_pred']
print(embeddings_better[example_columns].head(5))

## 9. Key Insights and Recommendations

### Summary

#### TF-IDF Approach:
**Pros:**
- Fast training and inference
- Low computational requirements
- Interpretable (can see which words are important)
- Good baseline performance

**Cons:**
- No semantic understanding
- Sensitive to vocabulary and spelling variations
- Struggles with synonyms and paraphrases
- High-dimensional sparse vectors

#### LLM Embeddings Approach:
**Pros:**
- Captures semantic meaning
- Better generalization to unseen text
- Robust to spelling variations and synonyms
- Dense, fixed-size representations
- Pre-trained on large multilingual corpora

**Cons:**
- Slower (especially embedding generation)
- Higher computational requirements
- Less interpretable
- Requires more memory

### Recommendations:

1. **For Production Systems**: 
   - Use **TF-IDF** if speed and computational efficiency are critical
   - Use **LLM Embeddings** if accuracy and semantic understanding are more important

2. **Hybrid Approach**:
   - Consider combining both methods (ensemble)
   - Use TF-IDF for initial filtering, then embeddings for refinement

3. **For Kinyarwanda Data**:
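   - LLM embeddings may perform better thanks to multilingual pre-training, but only if the chosen model was actually trained on Kinyarwanda text; check the language coverage of `paraphrase-multilingual-MiniLM-L12-v2` before relying on this
   - Consider fine-tuning embedding models on Kinyarwanda text
   - Explore African language-specific models (e.g., AfroXLMR)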

4. **Future Improvements**:
   - Try different embedding models (larger models, domain-specific)
   - Experiment with advanced classifiers (XGBoost, Neural Networks)
   - Use data augmentation for rare classes
   - Implement class balancing techniques

## 10. Save Best Model

In [None]:
import pickle

# Save the best performing model (row with the highest accuracy)
best_idx = comparison['Accuracy'].idxmax()
best_method = comparison.loc[best_idx, 'Method']

print(f"\nSaving best model: {best_method}")

# Resolve model object and output file from the method label:
# key = (label mentions 'Embeddings', label mentions 'RF')
uses_embeddings = 'Embeddings' in best_method
uses_rf = 'RF' in best_method
model_to_save, filename = {
    (True, True): (rf_embeddings, 'best_model_embeddings_rf.pkl'),
    (True, False): (lr_embeddings, 'best_model_embeddings_lr.pkl'),
    (False, True): (rf_tfidf, 'best_model_tfidf_rf.pkl'),
    (False, False): (lr_tfidf, 'best_model_tfidf_lr.pkl'),
}[(uses_embeddings, uses_rf)]

if not uses_embeddings:
    # A TF-IDF model needs its fitted vectorizer to transform new text,
    # so the vectorizer is saved alongside it
    with open('tfidf_vectorizer.pkl', 'wb') as f:
        pickle.dump(tfidf, f)
    print("   ✓ TF-IDF vectorizer saved as 'tfidf_vectorizer.pkl'")

with open(filename, 'wb') as f:
    pickle.dump(model_to_save, f)

print(f"   ✓ Best model saved as '{filename}'")

# Print a copy-pasteable loading snippet
# (only unpickle files you created yourself: pickle can execute code on load)
usage_lines = [
    "\nTo use the model:",
    "```python",
    "import pickle",
    f"with open('{filename}', 'rb') as f:",
    "    model = pickle.load(f)",
]
if 'tfidf' in filename.lower():
    usage_lines += [
        "with open('tfidf_vectorizer.pkl', 'rb') as f:",
        "    vectorizer = pickle.load(f)",
    ]
usage_lines.append("```")
print("\n".join(usage_lines))

## 11. Prediction Example

In [None]:
# Example prediction: run both Logistic Regression pipelines on a few
# hand-written inputs in the same "occupation industry sector" layout
sample_texts = [
    "UMUSHOFERI WA HULUX ENTREPRISE Y'UBWUBATSI,GUTANGA UMURIRO N'AMAZI Private",
    "COMPTABLE BACURUZA IBIKORESHO BYA MUDASOMBWA NIBYITUMANAHO Private",
    "UMUKOZI WO MURUGO IMIRIMO YO MURUGO Household"
]

print("\nSample Predictions:\n")
print("="*80)

for sample_no, sample_text in enumerate(sample_texts, start=1):
    single_item = [sample_text]

    # TF-IDF prediction (reuses the vectorizer fitted on the training split)
    tfidf_label = lr_tfidf.predict(tfidf.transform(single_item))[0]

    # Embedding prediction
    embedding_label = lr_embeddings.predict(embedding_model.encode(single_item))[0]

    print(f"\nSample {sample_no}:")
    print(f"Text: {sample_text}")
    print(f"TF-IDF Prediction:    {tfidf_label}")
    print(f"Embedding Prediction: {embedding_label}")
    print("-" * 80)