In [55]:
import pandas as pd
from tqdm import tqdm
from lets_plot import *

LetsPlot.setup_html()
tqdm.pandas()

In [56]:
df = pd.read_parquet('../data/2_pro/cleaned_dataset.parquet')
df['texto_length'] = df.clean_text.apply(len)

In [58]:
# df = df[df['texto_length'] >= 500]

In [59]:
# text = df[df['texto_length'] <= 1000].sort_values('texto_length', ascending=False).head(10).reset_index().clean_text[0]

# # Split text into chunks of roughly 100 characters at word boundaries
# chunks = []
# current_chunk = ""
# for word in text.split():
#     if len(current_chunk) + len(word) + 1 <= 100:
#         current_chunk += " " + word if current_chunk else word
#     else:
#         chunks.append(current_chunk)
#         current_chunk = word

# if current_chunk:
#     chunks.append(current_chunk)

# # Print chunks with line breaks between them
# for chunk in chunks:
#     print(chunk + "\n")

In [64]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

import numpy as np

# Prepare feature matrix X (all columns except 'ramo_direito')
X = df.drop('ramo_direito', axis=1)

# Prepare target variable y (multi-label)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['ramo_direito'])

vectorizer = TfidfVectorizer(max_features=4000)
X = vectorizer.fit_transform(df['clean_text'])
label_names = mlb.classes_

print(f"Number of features: {X.shape[1]}")
print(f"Number of target labels: {len(label_names)}")
print(f"Target labels: {label_names}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = MultiOutputClassifier(LogisticRegression(C=1))
# model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))


model.fit(X_train, y_train)

# Predict on test set
y_pred_proba_list = [proba[:, 1] for proba in model.predict_proba(X_test)]
# Transpose to get the correct shape (samples, labels) instead of (labels, samples)
y_pred_proba = np.array(y_pred_proba_list).T

Number of features: 4000
Number of target labels: 22
Target labels: [' GRANDE IMPACTO E REPERCUSSÃO'
 'DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DIREITO PÚBLICO'
 'DIREITO AMBIENTAL' 'DIREITO ASSISTENCIAL' 'DIREITO CIVIL'
 'DIREITO DA CRIANÇA E DO ADOLESCENTE' 'DIREITO DA SAÚDE'
 'DIREITO DO CONSUMIDOR' 'DIREITO DO TRABALHO' 'DIREITO ELEITORAL'
 'DIREITO INTERNACIONAL' 'DIREITO MARÍTIMO' 'DIREITO PENAL'
 'DIREITO PENAL MILITAR' 'DIREITO PREVIDENCIÁRIO'
 'DIREITO PROCESSUAL CIVIL E DO TRABALHO' 'DIREITO PROCESSUAL PENAL'
 'DIREITO PROCESSUAL PENAL MILITAR' 'DIREITO TRIBUTÁRIO'
 'DIREITO À EDUCAÇÃO' 'QUESTÕES DE ALTA COMPLEXIDADE' 'REGISTROS PÚBLICOS']


In [41]:
# Convert probability predictions to binary predictions
# Get binary predictions directly
y_pred = model.predict(X_test)


# For rows with no predicted labels, add the most likely label
zero_label_rows = np.sum(y_pred, axis=1) == 0
if np.any(zero_label_rows):
    # Get probabilities for rows with no predictions
    probs_zero_rows = y_pred_proba[zero_label_rows]
    # Find index of highest probability label for each row
    most_likely_labels = np.argmax(probs_zero_rows, axis=1)
    # Set those labels to 1
    y_pred[zero_label_rows, most_likely_labels] = 1
# Double-check shapes
print(f"y_test shape: {y_test.shape}")
print(f"y_pred_proba shape: {y_pred_proba.shape}")


y_test shape: (1969, 22)
y_pred_proba shape: (1969, 22)


In [42]:
from sklearn.metrics import hamming_loss, accuracy_score, average_precision_score, precision_recall_fscore_support
# Evaluate the model
print("Model Evaluation Metrics:\n")
hamming = hamming_loss(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
# Calculate percentage of cases where at least one label was correct
at_least_one_correct = np.any((y_test == y_pred) & (y_test == 1), axis=1)
at_least_one_correct_pct = np.mean(at_least_one_correct) * 100
# Calculate false positive rate
false_positives = np.sum((y_test == 0) & (y_pred == 1))
total_negatives = np.sum(y_test == 0)
false_positive_rate = (false_positives / total_negatives) * 100
# Calculate average number of labels per instance
avg_labels_real = np.mean(np.sum(y_test, axis=1))
avg_labels_pred = np.mean(np.sum(y_pred, axis=1))



# Calculate PR AUC
pr_auc = average_precision_score(y_test, y_pred_proba, average='macro')

# Calculate sample-weighted F1 score
_, _, f1_sample, _ = precision_recall_fscore_support(y_test, y_pred, average='samples')

print(f"Hamming Loss: {hamming:.4f}")
print(f"Accuracy Score: {accuracy:.4f}")
print(f"PR AUC: {pr_auc:.4f}")
print(f"Sample F1: {f1_sample:.4f}")
print(f"At least one correct: {at_least_one_correct_pct:.4f}%")
print(f"False Positive Rate: {false_positive_rate:.4f}%")

print(f"\nAverage labels per instance:")
print(f"Real: {avg_labels_real:.2f}")
print(f"Predicted: {avg_labels_pred:.2f}")



Model Evaluation Metrics:

Hamming Loss: 0.0291
Accuracy Score: 0.6008
PR AUC: 0.6082
Sample F1: 0.7693
At least one correct: 86.8461%
False Positive Rate: 1.0383%

Average labels per instance:
Real: 1.41
Predicted: 1.19


In [43]:
# Classification report
metrics_data = []
total_samples = len(y_test)  # Get total number of samples
for i, label in enumerate(label_names):
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test[:, i], y_pred[:, i], average='binary', zero_division=0
    )
    count = np.sum(y_test[:, i])  # Count occurrences of this label in y_test
    weight = count / total_samples  # Normalize count by total samples
    metrics_data.append({
        'Label': label,
        'F1-score': f1,
        'Precision': precision,
        'Recall': recall, 
        'Count': count,  # Number of times this label appears in test set
        'Weight': weight  # Normalized weight based on frequency
    })

metrics_df = pd.DataFrame(metrics_data)
# Sort by support in descending order
metrics_df = metrics_df.sort_values('Count', ascending=False)
# print("\nClassification Report (per label):")
metrics_df_melted = pd.melt(
    metrics_df,
    id_vars=['Label', 'Count'],
    value_vars=['F1-score','Precision', 'Recall', 'Weight'],
    var_name='Metric',
    value_name='Value'
)
metrics_df_melted

(
    ggplot(metrics_df_melted, aes(x='Label', y='Value', fill='Metric'))
    + geom_bar(aes(), stat='identity', position='dodge')
    + ggsize(1500, 1000)
).show()

In [44]:
# import matplotlib.pyplot as plt
# # Plot distribution of number of labels per instance
# plt.figure(figsize=(10, 6))
real_label_counts = np.sum(y_test, axis=1)
pred_label_counts = np.sum(y_pred, axis=1)

distribution_n_labels = pd.DataFrame({
    'real_n_labels': real_label_counts,
    'pred_n_labels': pred_label_counts
})

(
    ggplot(distribution_n_labels)
    + geom_bar(aes(x='real_n_labels'), stat='count', position='dodge', alpha=0.7)
    + geom_bar(aes(x='pred_n_labels'), stat='count', fill='red', alpha=0.7, width=0.5)
    + labs(
        title='Distribution of Number of Labels',
        x='Number of Labels',
        y='Count'
    )
    + theme_minimal()
).show()


In [45]:
# Create confusion matrix for multilabel
confusion_matrix = np.zeros((len(label_names), len(label_names)))
for i, true_label in enumerate(label_names):
    true_mask = y_test[:, i] == 1
    true_count = np.sum(true_mask)  # Count of true occurrences for label i
    for j, pred_label in enumerate(label_names):
        # Count how many times label j was predicted when label i was true
        pred_count = np.sum(y_pred[true_mask, j])
        # Compute ratio of predicted to true counts, handling division by zero
        confusion_matrix[i, j] = pred_count / true_count if true_count > 0 else 0

# Create dataframe with label names as index and columns
confusion_df = pd.DataFrame(
    confusion_matrix,
    index=label_names,
    columns=label_names
)

# Add total column with sum of y_test for each label
confusion_df['total'] = np.sum(y_test, axis=0)

(
    ggplot(confusion_df.sort_values('total', ascending=True).drop('total', axis=1).reset_index().melt(id_vars='index'), 
           aes(y='index', x='variable', fill='value'))
    + geom_tile()
    + scale_fill_gradient(low='white', high='red')
    + theme(axis_text_x=element_text(angle=45, hjust=1))
    + labs(y='True Label', x='Predicted Label', fill='Count')
    + ggsize(1200, 1000)
).show()


Model Evaluation Metrics:

Hamming Loss: 0.0299

Accuracy Score: 0.5946

PR AUC: 0.4691

Sample F1: 0.7606

At least one correct: 86.0172%

False Positive Rate: 0.9976%

Average labels per instance:

Real: 1.41

Predicted: 1.16