## collection: scrape data

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

BASE_URL = "https://scancode-licensedb.aboutcode.org/"

def scrape_json_links(url):
    session = requests.Session()   # reuse connections
    session.headers.update({"User-Agent": "Mozilla/5.0"})

    response = session.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    rows = []
    links = soup.select("a")

    json_links = [
        BASE_URL + a["href"]
        for a in links
        if a.text.strip() == "json"
    ]

    print("Found:", len(json_links), "json links")

    for link in json_links:
        try:
            r = session.get(link, timeout=10)
            data = r.json()
            rows.append({
                "key": data.get("key"),
                "text": data.get("text"),
                "category": data.get("category")
            })
        except requests.exceptions.JSONDecodeError:
            print(f"skipped invalid JSON from {link}")
            continue

    return pd.DataFrame(rows)

df = scrape_json_links(BASE_URL)
print(df.head())

save scraped data in csv

In [None]:
import pandas as pd

df.to_csv('data.csv', index=False)

read data as dataframe

In [2]:
import pandas as pd

file_path = 'data.csv'

try:
    df = pd.read_csv(file_path)
    print(df.head())
except FileNotFoundError:
    print(f"'{file_path}' was not found.")
except Exception as e:
    print(f"error occurred: {e}")

              key                                               text  \
0   389-exception  In addition, as a special exception, Red Hat, ...   
1  3com-microcode  Redistribution and use in source and binary fo...   
2    3dslicer-1.0  3D Slicer Contribution and Software License Ag...   
3      4suite-1.1  License and copyright info for 4Suite software...   
4     996-icu-1.0  "Anti 996" License Version 1.0 (Draft)\n\nPerm...   

           category  
0  Copyleft Limited  
1        Permissive  
2        Permissive  
3        Permissive  
4   Free Restricted  


# License Prediction: Multiple Approaches for Single-Instance Labels

This notebook explores different techniques to predict licenses when each label appears only once in the dataset.

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet
import random
import warnings

warnings.filterwarnings('ignore')

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# Check data shape and class distribution
print(f"Dataset shape: {df.shape}")
print(f"\nUnique licenses: {df['category'].nunique()}")
print(f"Unique license keys: {df['key'].nunique()}")
print(f"\nClass distribution:\n{df['category'].value_counts()}")
print(f"\nSample of data:")
print(df.head())

Dataset shape: (2614, 3)

Unique licenses: 11
Unique license keys: 2614

Class distribution:
category
Permissive          970
Proprietary Free    622
Copyleft Limited    407
Copyleft            176
Commercial          137
Free Restricted     101
Source-available     98
Public Domain        40
CLA                  22
Patent License       21
Unstated License     20
Name: count, dtype: int64

Sample of data:
              key                                               text  \
0   389-exception  In addition, as a special exception, Red Hat, ...   
1  3com-microcode  Redistribution and use in source and binary fo...   
2    3dslicer-1.0  3D Slicer Contribution and Software License Ag...   
3      4suite-1.1  License and copyright info for 4Suite software...   
4     996-icu-1.0  "Anti 996" License Version 1.0 (Draft)\n\nPerm...   

           category  
0  Copyleft Limited  
1        Permissive  
2        Permissive  
3        Permissive  
4   Free Restricted  


## Approach 1: Data Augmentation (Paraphrasing & Synonym Replacement)

In [5]:
def get_synonyms(word, pos_tag):
    """Get synonyms for a word using WordNet"""
    synsets = wordnet.synsets(word, pos=pos_tag)
    synonyms = set()
    for synset in synsets:
        for lemma in synset.lemmas():
            if lemma.name() != word:
                synonyms.add(lemma.name().replace('_', ' '))
    return list(synonyms)

def augment_text_synonym_replacement(text, num_augmented=3, replacement_ratio=0.3):
    """Replace words with synonyms to create augmented samples"""
    # Handle NaN and non-string values
    if pd.isna(text) or not isinstance(text, str):
        return []
    
    words = text.split()
    if len(words) == 0:
        return []
    
    augmented_texts = []
    
    for _ in range(num_augmented):
        augmented_words = words.copy()
        num_replacements = max(1, int(len(words) * replacement_ratio))
        
        indices_to_replace = random.sample(range(len(words)), 
                                          min(num_replacements, len(words)))
        
        for idx in indices_to_replace:
            word = words[idx].lower()
            # Try different POS tags
            for pos in ['n', 'v', 'a', 'r']:
                synonyms = get_synonyms(word, pos)
                if synonyms:
                    augmented_words[idx] = random.choice(synonyms)
                    break
        
        augmented_texts.append(' '.join(augmented_words))
    
    return augmented_texts

def augment_text_deletion(text, num_augmented=3, deletion_ratio=0.1):
    """Delete random words to create augmented samples"""
    # Handle NaN and non-string values
    if pd.isna(text) or not isinstance(text, str):
        return []
    
    words = text.split()
    if len(words) == 0:
        return []
    
    augmented_texts = []
    
    for _ in range(num_augmented):
        num_deletions = max(1, int(len(words) * deletion_ratio))
        indices_to_keep = sorted(random.sample(range(len(words)), 
                                               len(words) - num_deletions))
        augmented_text = ' '.join([words[i] for i in indices_to_keep])
        augmented_texts.append(augmented_text)
    
    return augmented_texts

# Clean data: Remove rows with missing text
df_clean = df.dropna(subset=['text']).copy()
df_clean = df_clean[df_clean['text'].str.len() > 0]
print(f"Cleaned data: {len(df_clean)} rows (removed {len(df) - len(df_clean)} invalid rows)")

# Create augmented dataset
augmented_data = []

for idx, row in df_clean.iterrows():
    # Original text
    augmented_data.append({
        'text': row['text'],
        'category': row['category'],
        'source': 'original'
    })
    
    # Synonym replacement augmentation
    try:
        syn_aug = augment_text_synonym_replacement(row['text'], num_augmented=2)
        for aug_text in syn_aug:
            if aug_text:  # Only add non-empty augmentations
                augmented_data.append({
                    'text': aug_text,
                    'category': row['category'],
                    'source': 'synonym_replacement'
                })
    except Exception as e:
        pass
    
    # Deletion augmentation
    try:
        del_aug = augment_text_deletion(row['text'], num_augmented=2)
        for aug_text in del_aug:
            if aug_text:  # Only add non-empty augmentations
                augmented_data.append({
                    'text': aug_text,
                    'category': row['category'],
                    'source': 'deletion'
                })
    except Exception as e:
        pass

df_augmented = pd.DataFrame(augmented_data)
print(f"Original dataset size: {len(df)}")
print(f"Augmented dataset size: {len(df_augmented)}")
print(f"Augmentation ratio: {len(df_augmented) / len(df):.2f}x")
print(f"\nAugmentation sources:\n{df_augmented['source'].value_counts()}")
print(f"\nSample augmented texts:")
sample_key = df['key'].iloc[0]
sample_texts = df_augmented[df_augmented['category'] == df['category'].iloc[0]]['text'].head(3)
for i, text in enumerate(sample_texts, 1):
    print(f"\nAugmented {i}: {text[:200]}...")

Cleaned data: 2593 rows (removed 21 invalid rows)
Original dataset size: 2614
Augmented dataset size: 12965
Augmentation ratio: 4.96x

Augmentation sources:
source
synonym_replacement    5186
deletion               5186
original               2593
Name: count, dtype: int64

Sample augmented texts:

Augmented 1: In addition, as a special exception, Red Hat, Inc. gives You the additional
right to link the code of this Program with code not covered under the GNU
General Public License ("Non-GPL Code") and to di...

Augmented 2: In addition, amp a special exception, Red Hat, Inc. gives You the additional rightfulness to link the code of this Program with code non covered nether the GNU General Public License ("Non-GPL Code") ...

Augmented 3: In addition, as antiophthalmic factor special exception, bolshy Hat, Inc. spring You the additional right field to connectedness the code of this Program with code not treat nether the GNU General Pub...


## Approach 2: Sentence Splitting for More Training Samples

In [6]:
def create_sentence_samples(df, min_sentence_length=20):
    """Split texts into sentences for more training samples"""
    sentence_data = []
    
    for idx, row in df.iterrows():
        try:
            sentences = sent_tokenize(row['text'])
            for sentence in sentences:
                # Filter out very short sentences
                if len(sentence.split()) >= min_sentence_length // 5:
                    sentence_data.append({
                        'text': sentence,
                        'category': row['category'],
                        'sentence_num': len(sentence_data),
                        'original_key': row['key']
                    })
        except:
            # If tokenization fails, keep original text
            sentence_data.append({
                'text': row['text'],
                'category': row['category'],
                'sentence_num': 0,
                'original_key': row['key']
            })
    
    return pd.DataFrame(sentence_data)

# Create sentence-split dataset
df_sentences = create_sentence_samples(df, min_sentence_length=20)

print(f"Original samples: {len(df)}")
print(f"Sentence-split samples: {len(df_sentences)}")
print(f"Average sentences per license: {len(df_sentences) / len(df):.1f}")
print(f"\nClass distribution after sentence splitting:")
print(df_sentences['category'].value_counts().head(10))
print(f"\nSample sentences from first license:")
first_license = df_sentences['category'].iloc[0]
samples = df_sentences[df_sentences['category'] == first_license]['text'].head(3)
for i, sent in enumerate(samples, 1):
    print(f"  Sentence {i}: {sent[:150]}...")

Original samples: 2614
Sentence-split samples: 85085
Average sentences per license: 32.5

Class distribution after sentence splitting:
category
Proprietary Free    32283
Copyleft Limited    14840
Permissive          11295
Commercial           9964
Copyleft             8572
Source-available     5078
Free Restricted      1757
CLA                   613
Public Domain         313
Patent License        309
Name: count, dtype: int64

Sample sentences from first license:
  Sentence 1: In addition, as a special exception, Red Hat, Inc. gives You the additional
right to link the code of this Program with code not covered under the GNU...
  Sentence 2: Non-GPL Code
permitted under this exception must only link to the code of this Program
through those well defined interfaces identified in the file na...
  Sentence 3: The files of Non-GPL
Code may instantiate templates or use macros or inline functions from the
Approved Interfaces without causing the resulting work ...


## Approach 3: Back Translation for Data Augmentation

*Requires: `pip install google-cloud-translate` or use libre translation API*

In [8]:
from transformers import MarianMTModel, MarianTokenizer

def back_translate_text(text, source_lang='en', intermediate_lang='es', max_length=512):
    """
    Perform back translation: EN -> Intermediate -> EN
    Uses Helsinki-NLP models (free, no API key needed)
    """
    try:
        # English to intermediate language
        model_name_1 = f'Helsinki-NLP/Opus-MT-{source_lang}-{intermediate_lang}'
        tokenizer_1 = MarianTokenizer.from_pretrained(model_name_1)
        model_1 = MarianMTModel.from_pretrained(model_name_1)
        
        inputs_1 = tokenizer_1(text, return_tensors="pt", max_length=max_length, truncation=True)
        translated_1 = model_1.generate(**inputs_1)
        intermediate_text = tokenizer_1.decode(translated_1[0], skip_special_tokens=True)
        
        # Intermediate back to English
        model_name_2 = f'Helsinki-NLP/Opus-MT-{intermediate_lang}-{source_lang}'
        tokenizer_2 = MarianTokenizer.from_pretrained(model_name_2)
        model_2 = MarianMTModel.from_pretrained(model_name_2)
        
        inputs_2 = tokenizer_2(intermediate_text, return_tensors="pt", max_length=max_length, truncation=True)
        translated_2 = model_2.generate(**inputs_2)
        back_translated_text = tokenizer_2.decode(translated_2[0], skip_special_tokens=True)
        
        return back_translated_text
    except Exception as e:
        print(f"Back translation failed: {e}")
        return text

# Create back-translated augmented dataset
print("Creating back-translated augmented data...")
print("(This may take a while as it downloads translation models)")

back_trans_data = []
sample_size = min(5, len(df))  # Use subset for demonstration

for idx, row in df.iloc[:sample_size].iterrows():
    # Original
    back_trans_data.append({
        'text': row['text'],
        'category': row['category'],
        'augmentation': 'original'
    })
    
    # Back translation variants
    for lang in ['es', 'fr', 'de']:  # Spanish, French, German
        try:
            back_trans_text = back_translate_text(row['text'][:500], 
                                                  source_lang='en', 
                                                  intermediate_lang=lang)
            back_trans_data.append({
                'text': back_trans_text,
                'category': row['category'],
                'augmentation': f'back_trans_{lang}'
            })
        except:
            continue

df_back_translated = pd.DataFrame(back_trans_data)
print(f"\nBack-translation samples created: {len(df_back_translated)}")
print(f"Augmentation types: {df_back_translated['augmentation'].value_counts().to_dict()}")

# Example of back-translated text
if len(df_back_translated) > 1:
    print(f"\nExample back-translation:")
    orig_idx = df_back_translated[df_back_translated['augmentation'] == 'original'].index[0]
    print(f"Original: {df_back_translated.loc[orig_idx, 'text'][:200]}...")
    
    back_trans_idx = df_back_translated[df_back_translated['augmentation'].str.contains('back_trans', na=False)].index[0]
    print(f"Back-translated: {df_back_translated.loc[back_trans_idx, 'text'][:200]}...")

Creating back-translated augmented data...
(This may take a while as it downloads translation models)

Back-translation samples created: 20
Augmentation types: {'original': 5, 'back_trans_es': 5, 'back_trans_fr': 5, 'back_trans_de': 5}

Example back-translation:
Original: In addition, as a special exception, Red Hat, Inc. gives You the additional
right to link the code of this Program with code not covered under the GNU
General Public License ("Non-GPL Code") and to di...
Back-translated: In addition, as a special exception, Red Hat, Inc. grants you the additional right to link the code of this Program to code not covered by the GNU General Public License ("non-GPL Code") and to distri...


## Approach 4: Transfer Learning with Pretrained Models

*Uses BERT/DistilBERT for license classification*

In [10]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

# Use sentence-split data for transfer learning
data_to_use = df_sentences.copy()
data_to_use = data_to_use[data_to_use['text'].str.len() > 50]  # Filter short texts

# Encode labels
label_encoder = {label: idx for idx, label in enumerate(data_to_use['category'].unique())}
label_decoder = {idx: label for label, idx in label_encoder.items()}

print(f"Number of unique licenses: {len(label_encoder)}")
print(f"Samples: {len(data_to_use)}")

# Load pretrained tokenizer and model
print("\nLoading DistilBERT model...")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Function to get embeddings
def get_bert_embeddings(texts, tokenizer, model, max_length=512):
    """Get BERT embeddings for texts"""
    embeddings = []
    
    model.eval()
    with torch.no_grad():
        for text in texts:
            # Truncate text if too long
            text = text[:max_length*4]  # Approximate, will be truncated by tokenizer
            
            inputs = tokenizer(text, return_tensors='pt', 
                             max_length=max_length, truncation=True)
            
            outputs = model(**inputs)
            # Use [CLS] token embedding (first token)
            cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze()
            embeddings.append(cls_embedding.numpy())
    
    return np.array(embeddings)

# Get embeddings for enough samples (ensure each class has at least 2 samples for stratification)
# Increase sample size to ensure we have enough samples per class
sample_size = min(500, len(data_to_use))
sample_indices = np.random.choice(len(data_to_use), sample_size, replace=False)
sample_data = data_to_use.iloc[sample_indices].reset_index(drop=True)

print(f"\nComputing BERT embeddings for {len(sample_data)} samples...")
X_bert = get_bert_embeddings(sample_data['text'].tolist(), tokenizer, model)
y = np.array([label_encoder[label] for label in sample_data['category']])

print(f"Embeddings shape: {X_bert.shape}")
print(f"Labels shape: {y.shape}")

# Check if we can stratify (each class needs at least 2 samples)
unique_labels, label_counts = np.unique(y, return_counts=True)
min_class_count = label_counts.min()
can_stratify = min_class_count >= 2

print(f"Class distribution: {dict(zip(unique_labels, label_counts))}")
print(f"Minimum class count: {min_class_count}")
print(f"Can stratify: {can_stratify}")

# Train classifier on BERT embeddings
X_train, X_test, y_train, y_test = train_test_split(X_bert, y, 
                                                      test_size=0.2, 
                                                      random_state=42,
                                                      stratify=y if can_stratify else None)

print(f"\nTraining transfer learning classifier...")
clf_transfer = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf_transfer.fit(X_train, y_train)

# Evaluate
y_pred_transfer = clf_transfer.predict(X_test)
acc_transfer = accuracy_score(y_test, y_pred_transfer)

print(f"\n=== Transfer Learning Results ===")
print(f"Accuracy: {acc_transfer:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred_transfer, 
                          target_names=[label_decoder[i] for i in np.unique(y_test)],
                          zero_division=0))

Number of unique licenses: 11
Samples: 78383

Loading DistilBERT model...

Computing BERT embeddings for 500 samples...
Embeddings shape: (500, 768)
Labels shape: (500,)
Class distribution: {np.int64(0): np.int64(90), np.int64(1): np.int64(70), np.int64(2): np.int64(8), np.int64(3): np.int64(191), np.int64(4): np.int64(44), np.int64(5): np.int64(59), np.int64(6): np.int64(2), np.int64(7): np.int64(1), np.int64(9): np.int64(34), np.int64(10): np.int64(1)}
Minimum class count: 1
Can stratify: False

Training transfer learning classifier...

=== Transfer Learning Results ===
Accuracy: 0.4000

Classification Report:
                  precision    recall  f1-score   support

Copyleft Limited       0.50      0.15      0.23        20
      Permissive       0.11      0.07      0.08        15
 Free Restricted       0.00      0.00      0.00         1
Proprietary Free       0.42      0.95      0.58        37
        Copyleft       0.00      0.00      0.00        10
      Commercial       1.00    

## Approach 5: Clustering Multiple Licenses Together

In [12]:
# Clustering approach: Group similar licenses together
# Clean data: Remove rows with missing text
df_for_clustering = df.dropna(subset=['text']).copy()
df_for_clustering = df_for_clustering[df_for_clustering['text'].str.len() > 0]
print(f"Cleaned data for clustering: {len(df_for_clustering)} rows (removed {len(df) - len(df_for_clustering)} invalid rows)")

# Create TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=2000, min_df=1, max_df=0.9, 
                             ngram_range=(1, 2), stop_words='english')

print("Creating TF-IDF vectors for clustering...")
X_tfidf = vectorizer.fit_transform(df_for_clustering['text'])

print(f"TF-IDF matrix shape: {X_tfidf.shape}")

# Determine optimal number of clusters using different metrics
# Try clustering with different numbers of clusters
n_clusters_range = range(2, min(10, len(df) // 2 + 1))
silhouette_scores = []

from sklearn.metrics import silhouette_score

print("\nFinding optimal number of clusters...")
for n_clusters in n_clusters_range:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_tfidf)
    score = silhouette_score(X_tfidf, cluster_labels)
    silhouette_scores.append(score)
    print(f"  Clusters: {n_clusters}, Silhouette Score: {score:.4f}")

optimal_clusters = n_clusters_range[np.argmax(silhouette_scores)]
print(f"\nOptimal number of clusters: {optimal_clusters}")

# Train final clustering model
kmeans_final = KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10)
cluster_assignments = kmeans_final.fit_predict(X_tfidf)

# Add cluster assignments to dataframe
df_clustered = df_for_clustering.copy()
df_clustered['cluster'] = cluster_assignments

# Show cluster composition
print(f"\n=== Cluster Composition ===")
print(f"Licenses per cluster:")
for cluster_id in range(optimal_clusters):
    licenses_in_cluster = df_clustered[df_clustered['cluster'] == cluster_id]['category'].unique()
    print(f"  Cluster {cluster_id}: {len(licenses_in_cluster)} licenses - {list(licenses_in_cluster)[:5]}{'...' if len(licenses_in_cluster) > 5 else ''}")

# Train classifier to predict clusters
print(f"\nTraining classifier to predict clusters...")
y_clusters = cluster_assignments
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_tfidf, y_clusters, 
                                                              test_size=0.2, 
                                                              random_state=42)

clf_clustering = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
# Convert sparse matrix to dense if necessary
clf_clustering.fit(X_train_c.toarray() if hasattr(X_train_c, 'toarray') else X_train_c, 
                   y_train_c)

y_pred_c = clf_clustering.predict(X_test_c.toarray() if hasattr(X_test_c, 'toarray') else X_test_c)
acc_clustering = accuracy_score(y_test_c, y_pred_c)

print(f"\n=== Cluster Prediction Results ===")
print(f"Accuracy: {acc_clustering:.4f}")
print(f"\nConfusion Matrix:")
print(confusion_matrix(y_test_c, y_pred_c))

# Function to predict cluster for new text
def predict_cluster(text, vectorizer, kmeans, clf):
    """Predict which cluster a new license text belongs to"""
    X = vectorizer.transform([text])
    # Predict using trained classifier
    cluster = clf.predict(X.toarray()[0].reshape(1, -1))[0]
    
    # Get licenses in this cluster
    licenses_in_cluster = df_clustered[df_clustered['cluster'] == cluster]['category'].unique()
    
    return cluster, licenses_in_cluster

# Test with a sample
if len(df_for_clustering) > 0:
    test_text = df_for_clustering['text'].iloc[0][:300]
    pred_cluster, pred_licenses = predict_cluster(test_text, vectorizer, kmeans_final, clf_clustering)
    print(f"\nExample prediction:")
    print(f"Test text: {test_text}...")
    print(f"Predicted cluster: {pred_cluster}")
    print(f"Licenses in cluster: {list(pred_licenses)}")

Cleaned data for clustering: 2593 rows (removed 21 invalid rows)
Creating TF-IDF vectors for clustering...
TF-IDF matrix shape: (2593, 2000)

Finding optimal number of clusters...
  Clusters: 2, Silhouette Score: 0.0361
  Clusters: 3, Silhouette Score: 0.0438
  Clusters: 4, Silhouette Score: 0.0538
  Clusters: 5, Silhouette Score: 0.0592
  Clusters: 6, Silhouette Score: 0.0639
  Clusters: 7, Silhouette Score: 0.0689
  Clusters: 8, Silhouette Score: 0.0689
  Clusters: 9, Silhouette Score: 0.0706

Optimal number of clusters: 9

=== Cluster Composition ===
Licenses per cluster:
  Cluster 0: 5 licenses - ['Copyleft Limited', 'Copyleft', 'Proprietary Free', 'Permissive', 'CLA']
  Cluster 1: 8 licenses - ['Permissive', 'Proprietary Free', 'Public Domain', 'Free Restricted', 'Copyleft Limited']...
  Cluster 2: 4 licenses - ['Proprietary Free', 'Commercial', 'Patent License', 'Permissive']
  Cluster 3: 5 licenses - ['Permissive', 'Free Restricted', 'Copyleft Limited', 'Copyleft', 'Proprietary 

## Comparison of All Approaches

In [13]:
# Summary of approaches
print("=" * 70)
print("SUMMARY OF ALL APPROACHES")
print("=" * 70)

approaches_summary = {
    'Original Baseline': {
        'samples': len(df),
        'samples_per_class': len(df) / df['category'].nunique(),
        'pros': ['Simple', 'Fast'],
        'cons': ['Only 1 sample per license', 'Severe class imbalance']
    },
    'Data Augmentation': {
        'samples': len(df_augmented),
        'samples_per_class': len(df_augmented) / df['category'].nunique(),
        'pros': ['Preserves semantic meaning', 'Creates variations'],
        'cons': ['May create noisy samples', 'Limited diversity']
    },
    'Sentence Splitting': {
        'samples': len(df_sentences),
        'samples_per_class': len(df_sentences) / df['category'].nunique(),
        'pros': ['More training samples', 'Uses actual text segments'],
        'cons': ['May lose context', 'Sentences may be too specific']
    },
    'Back Translation': {
        'samples': len(df_back_translated) if len(df_back_translated) > 0 else len(df) * 3,
        'samples_per_class': (len(df_back_translated) if len(df_back_translated) > 0 else len(df) * 3) / df['category'].nunique(),
        'pros': ['High-quality augmentation', 'Paraphrasing preserves meaning'],
        'cons': ['Computationally expensive', 'Requires translation models']
    },
    'Transfer Learning (BERT)': {
        'model': 'DistilBERT',
        'accuracy': f'{acc_transfer:.4f}' if 'acc_transfer' in dir() else 'N/A',
        'pros': ['Leverages pretrained knowledge', 'Handles long texts well'],
        'cons': ['Memory intensive', 'Slow inference']
    },
    'Clustering': {
        'n_clusters': optimal_clusters,
        'accuracy': f'{acc_clustering:.4f}',
        'pros': ['Reduces problem complexity', 'Groups similar licenses'],
        'cons': ['May lose license distinctions', 'Requires cluster tuning']
    }
}

for approach, details in approaches_summary.items():
    print(f"\n{approach}:")
    for key, value in details.items():
        print(f"  {key}: {value}")

print("\n" + "=" * 70)
print("RECOMMENDED COMBINATIONS:")
print("=" * 70)
print("""
1. **Best Overall**: Sentence Splitting + Transfer Learning
   - Creates many samples from sentence splitting
   - Uses BERT embeddings for semantic understanding
   
2. **Fastest**: Data Augmentation + TF-IDF + RandomForest
   - Quick to train and inference
   - Good baseline results
   
3. **Most Robust**: Sentence Splitting + Back Translation + Clustering
   - Best augmentation quality
   - Handles similar licenses well
   
4. **For Production**: Transfer Learning + Clustering
   - Single best accuracy
   - Interpretable clusters for new license classes
""")

SUMMARY OF ALL APPROACHES

Original Baseline:
  samples: 2614
  samples_per_class: 237.63636363636363
  pros: ['Simple', 'Fast']
  cons: ['Only 1 sample per license', 'Severe class imbalance']

Data Augmentation:
  samples: 12965
  samples_per_class: 1178.6363636363637
  pros: ['Preserves semantic meaning', 'Creates variations']
  cons: ['May create noisy samples', 'Limited diversity']

Sentence Splitting:
  samples: 85085
  samples_per_class: 7735.0
  pros: ['More training samples', 'Uses actual text segments']
  cons: ['May lose context', 'Sentences may be too specific']

Back Translation:
  samples: 20
  samples_per_class: 1.8181818181818181
  pros: ['High-quality augmentation', 'Paraphrasing preserves meaning']
  cons: ['Computationally expensive', 'Requires translation models']

Transfer Learning (BERT):
  model: DistilBERT
  accuracy: 0.4000
  pros: ['Leverages pretrained knowledge', 'Handles long texts well']
  cons: ['Memory intensive', 'Slow inference']

Clustering:
  n_cluste