In [3]:
!python -m pip install --user torch torchvision torchaudio transformers sentence-transformers chromadb scikit-learn pandas matplotlib seaborn 



In [4]:

import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
warnings.filterwarnings('ignore')

# ===========================
# Step 1: Load and clean data
# ===========================
# The CSV is read from a "project4" folder located in the parent of the
# current working directory.
DIR_DATA = Path.cwd().parent / "project4"
df = pd.read_csv(DIR_DATA / "lfs_data2.csv", encoding="ISO-8859-1")

# Keep the text column (D03B) and the label column (D03B1), and only the
# rows where both are present.
data = df[['D03B', 'D03B1']].dropna(subset=['D03B', 'D03B1'])
X = data[['D03B']]
y = data['D03B1']

# Discard labels that occur only once; the split below is stratified on y,
# so each remaining class has at least two rows to distribute.
class_counts = y.value_counts()
valid_classes = class_counts.loc[class_counts > 1].index
keep_mask = y.isin(valid_classes)
X = X.loc[keep_mask]
y = y.loc[keep_mask]

print(f"Dataset: {len(X)} samples, {len(y.unique())} unique ISIC codes")

# 70/30 stratified train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

# ===========================
# Step 2: Enhanced Multilingual Preprocessor
# ===========================
print("\n=== Building Multilingual Feature Extractors ===")

# Text normalisation options applied identically by both vectorizers
_normalisation = dict(strip_accents='unicode', lowercase=True)

# Word view: unigrams + bigrams, ignoring terms seen in fewer than 2 documents,
# capped at the 3000 most frequent features.
word_tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=2,
    max_features=3000,
    **_normalisation,
)

# Character view: 2- to 5-character n-grams, capped at 2000 features.
# Intended to pick up Kinyarwanda prefix/suffix patterns.
char_tfidf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 5),
    max_features=2000,
    **_normalisation,
)

# Concatenate the two feature spaces side by side
combined_features = FeatureUnion(
    transformer_list=[
        ('word_features', word_tfidf),
        ('char_features', char_tfidf),
    ]
)

for _status_line in (
    "✓ Word-level TF-IDF: captures semantic patterns",
    "✓ Character-level TF-IDF: captures Kinyarwanda morphology",
):
    print(_status_line)

# ===========================
# Step 3: Define Enhanced Pipelines
# ===========================
# Each pipeline gets its OWN unfitted copy of the feature extractor via
# clone(). Pipeline.fit() fits its steps in place (no cloning when caching is
# disabled), so passing the same `combined_features` object to every pipeline
# would make them all share - and repeatedly refit - one transformer. With
# independent copies, fitting or saving one pipeline can never alter another.
pipelines = {
    'Logistic Regression (Enhanced)': Pipeline([
        ('features', clone(combined_features)),
        ('clf', LogisticRegression(
            max_iter=2000,
            C=1.0,
            class_weight='balanced',  # Handle class imbalance
            random_state=42
        ))
    ]),
    'Random Forest (Enhanced)': Pipeline([
        ('features', clone(combined_features)),
        ('clf', RandomForestClassifier(
            n_estimators=200,
            max_depth=25,
            min_samples_split=5,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        ))
    ]),
    'KNN (Enhanced)': Pipeline([
        ('features', clone(combined_features)),
        ('clf', KNeighborsClassifier(
            n_neighbors=7,
            weights='distance',  # Closer neighbors have more influence
            metric='cosine'  # Better for text
        ))
    ])
}

# ===========================
# Step 4: Train & Evaluate
# ===========================
# Fits every pipeline on the training text, records train/test accuracy and
# their difference, and remembers the pipeline with the highest test accuracy.
# NOTE(review): the winner is chosen using the test split, so its reported
# test accuracy is an optimistic estimate; a separate validation split or
# cross-validation would give an unbiased figure.
print("\n=== Training Models ===")
results = []
best_model = None
# Accuracy lies in [0, 1]. Starting below that range guarantees a model is
# always selected, even if every pipeline scores exactly 0 (starting at 0 with
# a strict '>' comparison would leave best_model as None in that case).
best_test_acc = -1.0
best_model_name = ""

# Pipelines consume the raw text Series, not the one-column DataFrame
train_text = X_train['D03B']
test_text = X_test['D03B']

for name, pipe in pipelines.items():
    print(f"\nTraining {name}...")
    pipe.fit(train_text, y_train)

    y_train_pred = pipe.predict(train_text)
    y_test_pred = pipe.predict(test_text)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    gap = train_acc - test_acc  # positive gap = better on train than on test

    results.append({
        'Model': name,
        'Train Accuracy': round(train_acc, 4),
        'Test Accuracy': round(test_acc, 4),
        'Overfitting Gap': round(gap, 4)
    })

    print(f"  Train Accuracy: {train_acc:.4f}")
    print(f"  Test Accuracy: {test_acc:.4f}")

    # Track best model (strict '>' keeps the earliest pipeline on ties)
    if test_acc > best_test_acc:
        best_test_acc = test_acc
        best_model = pipe
        best_model_name = name

# ===========================
# Step 5: Display Results
# ===========================
# Comparison table, best test accuracy first
results_df = pd.DataFrame(results).sort_values('Test Accuracy', ascending=False)

rule = "=" * 60
print("\n" + rule)
print("=== Model Comparison (Multilingual-Enhanced) ===")
print(rule)
print(results_df.to_string(index=False))
print(rule)

# Per-class precision/recall/F1 of the winning pipeline on the test split;
# classes with no predictions report 0 instead of raising a warning.
print(f"\n=== Detailed Report for {best_model_name} ===")
y_pred_best = best_model.predict(X_test['D03B'])
best_report = classification_report(y_test, y_pred_best, zero_division=0)
print(best_report)

# ===========================
# Step 6: Save Best Model
# ===========================
# Persist the winning fitted pipeline to the current working directory
model_filename = 'best_text_classifier.pkl'
joblib.dump(best_model, model_filename)
print(f"\n✓ Best model ({best_model_name}) saved as '{model_filename}'")
print(f"  Test Accuracy: {best_test_acc:.4f}")

# Companion file describing the run: winner, scores of every model,
# feature description and dataset size after cleaning.
metadata = dict([
    ('model_name', best_model_name),
    ('test_accuracy', best_test_acc),
    ('all_results', results_df.to_dict('records')),
    ('features', 'Word TF-IDF + Character TF-IDF (Multilingual)'),
    ('total_samples', len(X)),
    ('num_classes', len(y.unique())),
    ('languages_supported', ['English', 'Kinyarwanda', 'French']),
])
joblib.dump(metadata, 'model_metadata.pkl')
print(f"✓ Model metadata saved as 'model_metadata.pkl'")

# Closing banner and a recap of the feature-engineering choices
divider = "=" * 60
for closing_line in (
    "\n" + divider,
    "✓ Training Complete!",
    divider,
    "\nEnhancements for Kinyarwanda:",
    "  • Character n-grams (2-5): Captures morphological patterns",
    "  • Word n-grams (1-2): Captures semantic meaning",
    "  • Balanced class weights: Handles imbalanced data",
    "  • Unicode normalization: Handles accented characters",
    "\nYour model is now more powerful for multilingual text!",
):
    print(closing_line)

Dataset: 126817 samples, 396 unique ISIC codes

=== Building Multilingual Feature Extractors ===
✓ Word-level TF-IDF: captures semantic patterns
✓ Character-level TF-IDF: captures Kinyarwanda morphology

=== Training Models ===

Training Logistic Regression (Enhanced)...
  Train Accuracy: 0.7246
  Test Accuracy: 0.6830

Training Random Forest (Enhanced)...
  Train Accuracy: 0.7459
  Test Accuracy: 0.6512

Training KNN (Enhanced)...
  Train Accuracy: 0.8913
  Test Accuracy: 0.7651

=== Model Comparison (Multilingual-Enhanced) ===
                         Model  Train Accuracy  Test Accuracy  Overfitting Gap
                KNN (Enhanced)          0.8913         0.7651           0.1262
Logistic Regression (Enhanced)          0.7246         0.6830           0.0416
      Random Forest (Enhanced)          0.7459         0.6512           0.0948

=== Detailed Report for KNN (Enhanced) ===
              precision    recall  f1-score   support

         111       0.90      0.95      0.93      5