In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import (
    hamming_loss,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    average_precision_score
)
from lets_plot import *

LetsPlot.setup_html()# Increase the limit of rows display in the notebook
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 30)

In [26]:
df = pd.read_parquet('../data/1_raw/dataset_desafio_ramo_direito (1).parquet')

# Target Variable : `ramo_direito`

Let us start by checking the how many individual labels we have.

In [28]:
def get_unique_counts(df, column):
    values, counts = np.unique_counts(np.concat(df[column]))
    uq = pd.DataFrame({'label':values, 'count':counts})
    uq = uq.sort_values(by='count', ascending=False)
    return uq.reset_index(drop=True)
    
get_unique_counts(df,'ramo_direito')

def compute_percentages(df, column):
    uq = get_unique_counts(df, column)
    total = uq['count'].sum()
    uq['percentage'] = np.round((uq['count'] / total) * 100,2)
    return uq


labels = compute_percentages(df, 'ramo_direito')
print("TOTAL NUMBER OF INDIVIDUAL LABELS: " + str(labels.shape[0]))
labels

TOTAL NUMBER OF INDIVIDUAL LABELS: 22


Unnamed: 0,label,count,percentage
0,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...,4179,29.96
1,DIREITO PROCESSUAL CIVIL E DO TRABALHO,2536,18.18
2,DIREITO TRIBUTÁRIO,1994,14.29
3,DIREITO CIVIL,1418,10.16
4,DIREITO PROCESSUAL PENAL,1154,8.27
5,DIREITO PENAL,855,6.13
6,DIREITO PREVIDENCIÁRIO,545,3.91
7,DIREITO DO CONSUMIDOR,427,3.06
8,DIREITO DO TRABALHO,337,2.42
9,DIREITO DA SAÚDE,147,1.05


There is a great imbalance. Some classes are very rare, which will make predicting them quite hard using regular ML algorithms.
A better strategy for these classes with low occurences will be to use some statistics techniques for text similary, or some key words.

We will need to investigate them individually.

In [29]:
df['n_labels'] = df['ramo_direito'].apply(len)

In [30]:
df['labels'] = df['ramo_direito'].apply(set)

In [31]:
multilabels = pd.DataFrame(df['labels'].value_counts()).reset_index()
multilabels['n_labels'] = multilabels['labels'].apply(len)
multilabels['percentage'] = np.round(multilabels['count']/multilabels['count'].sum()*100,2)

multilabels

Unnamed: 0,labels,count,n_labels,percentage
0,{DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE D...,2644,1,26.7
1,{DIREITO TRIBUTÁRIO},1181,1,11.93
2,{DIREITO CIVIL},642,1,6.48
3,{DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE D...,625,2,6.31
4,"{DIREITO PROCESSUAL PENAL, DIREITO PENAL}",570,2,5.76
5,{DIREITO PROCESSUAL CIVIL E DO TRABALHO},548,1,5.53
6,{DIREITO PROCESSUAL PENAL},520,1,5.25
7,{DIREITO PREVIDENCIÁRIO},348,1,3.51
8,"{DIREITO PROCESSUAL CIVIL E DO TRABALHO, DIREI...",296,2,2.99
9,{DIREITO DO TRABALHO},251,1,2.53


In [32]:
(
    ggplot(multilabels, aes(x='labels', y='count')) 
    + geom_bar(stat='identity')
    + theme(axis_text_x=element_text(angle=90, hjust=1))
    + scale_y_log10()
    + ggsize(1500, 500)
)

In [33]:
n_labels_dist = pd.DataFrame(df.n_labels.value_counts()).reset_index()
n_labels_dist['percentage'] = np.round((n_labels_dist['count'] / n_labels_dist['count'].sum()) * 100,2)
n_labels_dist


Unnamed: 0,n_labels,count,percentage
0,1,6742,68.08
1,2,2436,24.6
2,3,586,5.92
3,4,120,1.21
4,5,17,0.17
5,7,1,0.01
6,6,1,0.01


## Baseline

Note that our data has mostly a single label, followed by two labels. The rest compose less than 10% of the data.
This means that the simplest model just labels all the data with  `'DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DIREITO PÚBLICO'`.

In [34]:
mlb = MultiLabelBinarizer()
encoded_labels = mlb.fit_transform(df['ramo_direito'])
df = pd.concat([df,pd.DataFrame(encoded_labels, columns=mlb.classes_, index=df.index)],axis=1)

In [35]:
y_true = df.apply(lambda row: list(row[mlb.classes_].values), axis=1).tolist()

Let us evalute our baseline model. The baseline simply predicts the most common label to every sample.

In order to have a better feel for how performance varies, we are also going to create some lesser baselines
using the second and third most common labels. We are then going to see how these two vary in terms of performance.

In [36]:
multilabels

Unnamed: 0,labels,count,n_labels,percentage
0,{DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE D...,2644,1,26.7
1,{DIREITO TRIBUTÁRIO},1181,1,11.93
2,{DIREITO CIVIL},642,1,6.48
3,{DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE D...,625,2,6.31
4,"{DIREITO PROCESSUAL PENAL, DIREITO PENAL}",570,2,5.76
5,{DIREITO PROCESSUAL CIVIL E DO TRABALHO},548,1,5.53
6,{DIREITO PROCESSUAL PENAL},520,1,5.25
7,{DIREITO PREVIDENCIÁRIO},348,1,3.51
8,"{DIREITO PROCESSUAL CIVIL E DO TRABALHO, DIREI...",296,2,2.99
9,{DIREITO DO TRABALHO},251,1,2.53


In [37]:
multilabels[multilabels['n_labels'] > 1]

Unnamed: 0,labels,count,n_labels,percentage
3,{DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE D...,625,2,6.31
4,"{DIREITO PROCESSUAL PENAL, DIREITO PENAL}",570,2,5.76
8,"{DIREITO PROCESSUAL CIVIL E DO TRABALHO, DIREI...",296,2,2.99
11,"{DIREITO PROCESSUAL CIVIL E DO TRABALHO, DIREI...",242,2,2.44
12,{DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE D...,167,3,1.69
13,{DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE D...,144,2,1.45
16,{DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE D...,97,3,0.98
17,{DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE D...,82,2,0.83
18,"{DIREITO DO CONSUMIDOR, DIREITO PROCESSUAL CIV...",78,3,0.79
19,"{DIREITO PROCESSUAL CIVIL E DO TRABALHO, DIREI...",70,2,0.71


In [38]:
label1 = 'DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DIREITO PÚBLICO'
label2 = 'DIREITO PROCESSUAL CIVIL E DO TRABALHO'
label3 = 'DIREITO TRIBUTÁRIO'
baseline_multi_label = mlb.transform([{label1}])[0]
baseline_multi_label_second = mlb.transform([{label2}])[0]
baseline_multi_label_third = mlb.transform([{label3}])[0]

baseline_multi_label

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [39]:
y_prob = [baseline_multi_label] * df.shape[0]
y_prob_second = [baseline_multi_label_second] * df.shape[0]
y_prob_third = [baseline_multi_label_third] * df.shape[0]

threshold = 0.5
y_pred = [[1 if prob >= threshold else 0 for prob in probs] for probs in y_prob]
y_pred_second = [[1 if prob >= threshold else 0 for prob in probs] for probs in y_prob_second]
y_pred_third = [[1 if prob >= threshold else 0 for prob in probs] for probs in y_prob_third]

In [40]:
np.sum(y_true, axis=0)

array([  23, 4179,  138,   14, 1418,   12,  147,  427,  337,   22,   12,
          3,  855,   19,  545, 2536, 1154,    5, 1994,   79,   23,    8])

In [41]:
np.sum(y_pred, axis=0)

array([   0, 9903,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [42]:
np.sum(y_pred_second, axis=0)

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0, 9903,    0,    0,    0,    0,    0,    0])

In [43]:
np.sum(y_pred_third, axis=0)

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0, 9903,    0,    0,    0])

## Evaluation
Let us know use distinct metrics to get see how these baseline models perform. From there, we can start to
evaluate how our actual ML models improve the results.


#### Hamming Loss
To evaluate multi-labels we start using the `hamming_loss` metric, which is the fraction of wrong labels (extra or missing) to the total number of labels. 
$$
\mathcal{L}_{\text{Hamming}} = \frac{1}{n} \sum_{i=1}^{n} \frac{1}{L} \sum_{j=1}^{L} \mathbb{1}(y_{ij} \neq \hat{y}_{ij})
$$



In [44]:
# 1. Hamming Loss
hl = hamming_loss(y_true, y_pred)
print(f"Hamming Loss - Baseline: {hl:.4f}")

hl = hamming_loss(y_true, y_pred_second)
print(f"Hamming Loss - Baseline Second: {hl:.4f}")

hl = hamming_loss(y_true, y_pred_third)
print(f"Hamming Loss - Baseline Third: {hl:.4f}")

Hamming Loss - Baseline: 0.0711
Hamming Loss - Baseline Second: 0.0862
Hamming Loss - Baseline Third: 0.0912


Note that the Hamming Loss seems very small, even for the second baseline. The reason is that we have too many labels.
Since most labels are fairly uncommon, the Hamming Loss is not very informative, because it "assumes" that the
rare labels are equally important.

#### Subset Accuracy

Our second metric is the subset accuracy, which is the fraction of samples that have all their labels predicted correctly.
Note that this is a very "strict" metric, as it requires all labels to be predicted correctly. Thus, it is informative,
but, does not tell the whole story.

**IMPORTANT**

Note that although `'DIREITO PROCESSUAL CIVIL E DO TRABALHO'` is the second most common label,
it tends to appear together with other labels. Hence, individually, `'DIREITO TRIBUTÁRIO'` will have better
subset accuracy.

In [45]:
# 2. Subset Accuracy (Exact Match)
subset_acc = accuracy_score(y_true, y_pred)
print(f"Subset Accuracy (Exact Match): {subset_acc:.4f}")

subset_acc = accuracy_score(y_true, y_pred_second)
print(f"Subset Accuracy - Second (Exact Match): {subset_acc:.4f}")

subset_acc = accuracy_score(y_true, y_pred_third)
print(f"Subset Accuracy - Third (Exact Match): {subset_acc:.4f}")

Subset Accuracy (Exact Match): 0.2670
Subset Accuracy - Second (Exact Match): 0.0553
Subset Accuracy - Third (Exact Match): 0.1193


This time, the difference between the baseline and second baseline is much more pronounced. 
Also, note that the third baseline has a much better accuracy than the second baseline.

This highlights how the different metrics capture different aspects. The Hamming Loss for the third is the worst,
while the subset accuracy is worst for the second.

#### F1 Scores

**Precision:**
Precision measures the proportion of correctly predicted positive instances out of all instances predicted as positive.
$$
\text{Precision} = \frac{\text{True Positives (TP)}}{\text{True Positives (TP)} + \text{False Positives (FP)}}
$$


**Recall:**
Measures the proportion of correctly predicted positive instances out of all actual positive instances.
$$
\text{Recall} = \frac{\text{True Positives (TP)}}{\text{True Positives (TP)} + \text{False Negatives (FN)}}
$$

**F1 score** is the harmonic mean of precision and recall for a single class:

$$
F1 = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}
$$



**Micro-F1:**
Micro-F1 aggregates the contributions of all classes to compute the average F1 score. It is calculated globally by counting the total true positives, false negatives, and false positives.
$$
\text{Micro-F1} = 2 \cdot \frac{\text{Precision}_{\text{micro}} \cdot \text{Recall}_{\text{micro}}}{\text{Precision}_{\text{micro}} + \text{Recall}_{\text{micro}}}
$$

**Macro-F1:**
Macro-F1 calculates the F1 score for each class independently and then takes the average. It treats all classes equally, regardless of their frequency.
$$
\text{Macro-F1} = \frac{1}{C} \sum_{i=1}^{C} F1_i
$$
where $C$ is the number of classes and $F1_i$ is the F1 score for class $i$.

**Weighted-F1:**
Weighted-F1 calculates the F1 score for each class and then takes the weighted average, where the weights are the number of true instances for each class. This accounts for class imbalance.
$$
\text{Weighted-F1} = \frac{\sum_{i=1}^{C} w_i \cdot F1_i}{\sum_{i=1}^{C} w_i}
$$
where $w_i$ is the number of true instances for class $i$

#### Samples-F1

The Samples-F1 metric calculates the F1 score for each individual sample and then averages these scores across all samples.
This metric is particularly useful in multi-label classification tasks, as it evaluates the performance on a per-sample basis,
taking into account the specific set of labels predicted for each sample.

**Formula:**
$$
\text{Samples-F1} = \frac{1}{n} \sum_{i=1}^{n} F1_i
$$
where $F1_i$ is the F1 score for the $i$-th sample, and $n$ is the total number of samples.

**Key Points:**
- Samples-F1 is sensitive to the performance on individual samples, making it a good choice when the label sets vary significantly across samples.
- It provides a more granular view of the model's performance compared to global metrics like Micro-F1 or Macro-F1.
- It is particularly useful when the number of labels per sample is small or varies widely.

---------------------
First, let us calculate the Precision, Recall and F1-Score for each label.

In [46]:
label_metrics = []
for idx, label in enumerate(mlb.classes_):
    for baseline_name, y_pred_baseline in zip(
        ['Baseline', 'Second Baseline', 'Third Baseline'], 
        [y_pred, y_pred_second, y_pred_third]
    ):
        precision = precision_score([y[idx] for y in y_true], [y[idx] for y in y_pred_baseline], average='binary', zero_division=0)
        recall = recall_score([y[idx] for y in y_true], [y[idx] for y in y_pred_baseline], average='binary', zero_division=0)
        f1 = f1_score([y[idx] for y in y_true], [y[idx] for y in y_pred_baseline], average='binary', zero_division=0)
        label_metrics.append({
            'Label': label, 
            'Baseline': baseline_name, 
            'Precision': precision, 
            'Recall': recall, 
            'F1-Score': f1
        })

label_metrics_df = pd.DataFrame(label_metrics)
label_metrics_pivot = label_metrics_df.pivot_table(
    index='Label', 
    columns='Baseline', 
    values=['Precision', 'Recall', 'F1-Score']
)
label_metrics_pivot.loc[[
    label1,
    label2,
    label3,
]]

Unnamed: 0_level_0,F1-Score,F1-Score,F1-Score,Precision,Precision,Precision,Recall,Recall,Recall
Baseline,Baseline,Second Baseline,Third Baseline,Baseline,Second Baseline,Third Baseline,Baseline,Second Baseline,Third Baseline
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DIREITO PÚBLICO,0.593524,0.0,0.0,0.421993,0.0,0.0,1.0,0.0,0.0
DIREITO PROCESSUAL CIVIL E DO TRABALHO,0.0,0.40775,0.0,0.0,0.256084,0.0,0.0,1.0,0.0
DIREITO TRIBUTÁRIO,0.0,0.0,0.335211,0.0,0.0,0.201353,0.0,0.0,1.0


In [47]:
# Create a DataFrame to compare the metrics for the three baselines
f1_table = pd.DataFrame({
    'Metric': ['Micro-Precision','Micro-Recall','Micro-F1', 'Macro-F1', 'Weighted-F1'],
    'Baseline': [
        precision_score(y_true,y_pred, average='micro'),
        recall_score(y_true,y_pred, average='micro'),
        f1_score(y_true, y_pred, average='micro'),
        f1_score(y_true, y_pred, average='macro'),
        f1_score(y_true, y_pred, average='weighted')
    ],
    'Second Baseline': [
        precision_score(y_true,y_pred_second, average='micro'),
        recall_score(y_true,y_pred_second, average='micro'),
        f1_score(y_true, y_pred_second, average='micro'),
        f1_score(y_true, y_pred_second, average='macro'),
        f1_score(y_true, y_pred_second, average='weighted')
    ],
    'Third Baseline': [
        precision_score(y_true,y_pred_third, average='micro'),
        recall_score(y_true,y_pred_third, average='micro'),
        f1_score(y_true, y_pred_third, average='micro'),
        f1_score(y_true, y_pred_third, average='macro'),
        f1_score(y_true, y_pred_third, average='weighted')
    ]
})

f1_table

Unnamed: 0,Metric,Baseline,Second Baseline,Third Baseline
0,Micro-Precision,0.421993,0.256084,0.201353
1,Micro-Recall,0.29957,0.181792,0.142939
2,Micro-F1,0.350396,0.212636,0.167191
3,Macro-F1,0.026978,0.018534,0.015237
4,Weighted-F1,0.177802,0.074126,0.047915


#### PR AUC (Precision Recall Area Under the Curve)

If our model returns probabilities, we can instead evaluate it using the PR AUC.
Note that, if we want to compare the predictive models among themselves, teh PR AUC will be useful.
BUT, the final model evaluation is better to use the other metrics, as our model will define a probability threshold
to predict the labels, since this is the final goal, and not the probability.

In [48]:
pr_table = pd.DataFrame({
    'Metric': ['PR AUC'],
    'Baseline': [
        average_precision_score(y_true, y_prob, average='samples'),
    ],
    'Second Baseline': [
        average_precision_score(y_true, y_prob_second, average='samples'),
    ],
    'Third Baseline': [
        average_precision_score(y_true, y_prob_third, average='samples'),
    ]
})

pr_table

Unnamed: 0,Metric,Baseline,Second Baseline,Third Baseline
0,PR AUC,0.379821,0.195558,0.208107


#### Summary

Let us put all the macro metrics in a table.

In [49]:
metric_table = pd.DataFrame({
    'Metric': ['Hamming Loss', 'Subset Accuracy','PR AUC','Micro-Precision','Micro-Recall','Micro-F1', 'Macro-F1', 'Weighted-F1','Samples-F1'],
    'Baseline': [
        hamming_loss(y_true, y_pred),
        accuracy_score(y_true, y_pred),
        average_precision_score(y_true, y_prob, average='samples'),
        precision_score(y_true,y_pred, average='micro'),
        recall_score(y_true,y_pred, average='micro'),
        f1_score(y_true, y_pred, average='micro'),
        f1_score(y_true, y_pred, average='macro'),
        f1_score(y_true, y_pred, average='weighted'),
        f1_score(y_true, y_pred, average='samples')
    ],
    'Second Baseline': [
        hamming_loss(y_true, y_pred_second),
        accuracy_score(y_true, y_pred_second),
        average_precision_score(y_true, y_prob_second, average='samples'),
        precision_score(y_true,y_pred_second, average='micro'),
        recall_score(y_true,y_pred_second, average='micro'),
        f1_score(y_true, y_pred_second, average='micro'),
        f1_score(y_true, y_pred_second, average='macro'),
        f1_score(y_true, y_pred_second, average='weighted'),
        f1_score(y_true, y_pred_second, average='samples')
    ],
    'Third Baseline': [
        hamming_loss(y_true, y_pred_third),
        accuracy_score(y_true, y_pred_third),
        average_precision_score(y_true, y_prob_third, average='samples'),
        precision_score(y_true,y_pred_third, average='micro'),
        recall_score(y_true,y_pred_third, average='micro'),
        f1_score(y_true, y_pred_third, average='micro'),
        f1_score(y_true, y_pred_third, average='macro'),
        f1_score(y_true, y_pred_third, average='weighted'),
        f1_score(y_true, y_pred_third, average='samples'),
    ]
})

metric_table

Unnamed: 0,Metric,Baseline,Second Baseline,Third Baseline
0,Hamming Loss,0.071122,0.086204,0.09118
1,Subset Accuracy,0.26699,0.055337,0.119257
2,PR AUC,0.379821,0.195558,0.208107
3,Micro-Precision,0.421993,0.256084,0.201353
4,Micro-Recall,0.29957,0.181792,0.142939
5,Micro-F1,0.350396,0.212636,0.167191
6,Macro-F1,0.026978,0.018534,0.015237
7,Weighted-F1,0.177802,0.074126,0.047915
8,Samples-F1,0.360591,0.176361,0.166704


### Correlation Between Labels

There are co-occurences between the labels. This means that some labels are more likely to appear together than others.
Let us explore how strong is this behavior.

In [50]:
# One-hot label matrix
label_matrix = df[mlb.classes_].values

# Compute co-occurrence matrix
co_matrix = np.dot(label_matrix.T, label_matrix)
np.fill_diagonal(co_matrix, 0)

# Convert to long-form DataFrame for lets-plot
cooc_df = pd.DataFrame(co_matrix, index=mlb.classes_, columns=mlb.classes_)
cooc_df = cooc_df.reset_index().melt(id_vars='index', var_name='label_b', value_name='count')
cooc_df = cooc_df.rename(columns={'index': 'label_a'})



cooc_df['labels'] = cooc_df.apply(lambda row: "_".join(np.sort([row['label_a'],row['label_b']])), axis=1)
# cooc_df = cooc_df.drop_duplicates(subset=['labels'])
cooc_df = cooc_df.sort_values(by='count',ascending=False)

cooc_df.head(20)

Unnamed: 0,label_a,label_b,count,labels
331,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...,DIREITO PROCESSUAL CIVIL E DO TRABALHO,1088,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...
37,DIREITO PROCESSUAL CIVIL E DO TRABALHO,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...,1088,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...
364,DIREITO PENAL,DIREITO PROCESSUAL PENAL,593,DIREITO PENAL_DIREITO PROCESSUAL PENAL
280,DIREITO PROCESSUAL PENAL,DIREITO PENAL,593,DIREITO PENAL_DIREITO PROCESSUAL PENAL
411,DIREITO PROCESSUAL CIVIL E DO TRABALHO,DIREITO TRIBUTÁRIO,589,DIREITO PROCESSUAL CIVIL E DO TRABALHO_DIREITO...
348,DIREITO TRIBUTÁRIO,DIREITO PROCESSUAL CIVIL E DO TRABALHO,589,DIREITO PROCESSUAL CIVIL E DO TRABALHO_DIREITO...
103,DIREITO PROCESSUAL CIVIL E DO TRABALHO,DIREITO CIVIL,576,DIREITO CIVIL_DIREITO PROCESSUAL CIVIL E DO TR...
334,DIREITO CIVIL,DIREITO PROCESSUAL CIVIL E DO TRABALHO,576,DIREITO CIVIL_DIREITO PROCESSUAL CIVIL E DO TR...
397,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...,DIREITO TRIBUTÁRIO,396,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...
40,DIREITO TRIBUTÁRIO,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...,396,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...


In [51]:
cooc_df = pd.merge(cooc_df,labels[['label','count']],left_on='label_a', right_on='label', how='inner')
cooc_df = pd.merge(cooc_df,labels[['label','count']],left_on='label_b', right_on='label', how='inner')
cooc_df = cooc_df.rename(columns={'count_x':'count_pair','count_y':'count_a','count':'count_b'}).drop(columns=['label_x','label_y'])
cooc_df['p(a,b)'] = cooc_df['count_pair'] / df.shape[0]
cooc_df['p(a)'] = cooc_df['count_a'] / df.shape[0]
cooc_df['p(b)'] = cooc_df['count_a'] / df.shape[0]
cooc_df['p(a,b|a)'] = cooc_df['count_pair'] / cooc_df['count_b']
cooc_df['pmi'] = np.log2((cooc_df['p(a,b)'] + 1e-10) / (cooc_df['p(a)'] * cooc_df['p(b)']))
cooc_df['pmi_abs'] = np.abs(cooc_df['pmi'])

In [52]:
cooc_df.sort_values(by='pmi',ascending=False)
# cooc_df

Unnamed: 0,label_a,label_b,count_pair,labels,count_a,count_b,"p(a,b)",p(a),p(b),"p(a,b|a)",pmi,pmi_abs
107,DIREITO MARÍTIMO,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...,3,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...,3,4179,0.000303,0.000303,0.000303,0.000718,11.688688,11.688688
154,DIREITO MARÍTIMO,DIREITO CIVIL,1,DIREITO CIVIL_DIREITO MARÍTIMO,3,1418,0.000101,0.000303,0.000303,0.000705,10.103726,10.103726
113,DIREITO PROCESSUAL PENAL MILITAR,DIREITO PROCESSUAL PENAL,2,DIREITO PROCESSUAL PENAL_DIREITO PROCESSUAL PE...,5,1154,0.000202,0.000505,0.000505,0.001733,9.629794,9.629794
120,DIREITO PROCESSUAL PENAL MILITAR,DIREITO PENAL MILITAR,2,DIREITO PENAL MILITAR_DIREITO PROCESSUAL PENAL...,5,19,0.000202,0.000505,0.000505,0.105263,9.629794,9.629794
49,QUESTÕES DE ALTA COMPLEXIDADE,GRANDE IMPACTO E REPERCUSSÃO,23,GRANDE IMPACTO E REPERCUSSÃO_QUESTÕES DE ALTA...,23,23,0.002323,0.002323,0.002323,1.000000,8.750088,8.750088
...,...,...,...,...,...,...,...,...,...,...,...,...
197,DIREITO PROCESSUAL CIVIL E DO TRABALHO,DIREITO PENAL MILITAR,0,DIREITO PENAL MILITAR_DIREITO PROCESSUAL CIVIL...,2536,19,0.000000,0.256084,0.256084,0.000000,-29.288659,29.288659
273,DIREITO PROCESSUAL CIVIL E DO TRABALHO,DIREITO PROCESSUAL PENAL MILITAR,0,DIREITO PROCESSUAL CIVIL E DO TRABALHO_DIREITO...,2536,5,0.000000,0.256084,0.256084,0.000000,-29.288659,29.288659
348,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...,0,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...,4179,4179,0.000000,0.421993,0.421993,0.000000,-30.729865,30.729865
418,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...,DIREITO PENAL MILITAR,0,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...,4179,19,0.000000,0.421993,0.421993,0.000000,-30.729865,30.729865


## **CONCLUSION**

Our goal with our model is **to predict at least one label correctly, while avoiding predicting incorrect labels**.
Thus, the best metric that captures this is the F1 Samples. Why? Because the F1 is an average of precision and recall,
yet, the `sample` version computes this for each multi-label instance, and then we average for the whole dataset.
Note that if no labels are predicted correctly in an instance, then its F1 becomes 0, because the recall is 0.
On the other hand, false positives degrade the precision.

# Features : `texto_bruto`

Next, let us analyze the actual texts.

In [53]:
df.texto_bruto[0]

'ESTADO DP SEECITT\nPREFRITLRA MIUNICICAL SE &xACAÍU\nPROCLRADURUX CAXALOÓINQ MIiUHCITIG\n\n"\n\nEXCELENTÍSSIMO SENHOR DOUTOR DESEMBARGADOR PRESIDENTE DO\n\nTRIBUNAL DE JUSTICA DO ESTADO DE SERGIPE\n\nProcesso n.? 2009203895\nApelação n.º 2105/2009\nAcórdão 13080/2009\n\new Gera ex Jupe 35. CICER. CESDNC\n\nO MUNICÍPIO DE ARACAJU, qualificado nos autos, por seu\nProcurador in fine firmado, nos autos da Apelação Cível em epígrafe, interposta em face\n\nde CARLOS AUGUSTO DE ALBUQUERQUE MORAES, igualmente identificado, vem\ninterpor, perante V. Ex*, tempestivamente, RECURSO EXTRAORDINÁRIO, a fim de\n\ncorrigir as ofensas dirigidas à Constituição Federal, no seu art. 2º; art. 18, 84º; art. 156, I; e\nao art. 96 do ADCT, com fulcro nos arts. 102, inciso IIl, da CF/88, devendo as razões\nanexas ser encaminhadas para apreciação do Colendo Supremo Tribunal Federal.\n\nNestes termos\nPede Deferimento.\n\nAracaju-SE, 22 de março de 2010.\n\n4L.\nAMILTON NEVES BRITO FILHO\nartador do Municipio de

In [54]:
df['len_texto_bruto'] = df['texto_bruto'].apply(len)
df['log_len_texto_bruto'] = np.log10(df['len_texto_bruto'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [55]:
# Calculate IQR
Q1 = df.log_len_texto_bruto.quantile(0.25)
Q3 = df.log_len_texto_bruto.quantile(0.75)
IQR = Q3 - Q1

# Define bounds
log_lower_bound_iqr = Q1 - 2.0 * IQR
log_upper_bound_iqr = Q3 + 2.0 * IQR
lower_bound_iqr = 10**log_lower_bound_iqr
upper_bound_iqr = 10**log_upper_bound_iqr

# Calculate z-scores
df['z_score'] = (df.log_len_texto_bruto - df.log_len_texto_bruto.mean()) / df.log_len_texto_bruto.std()

# Identify outliers
df['is_outlier_iqr'] = (df.log_len_texto_bruto < log_lower_bound_iqr) | (df.log_len_texto_bruto > log_upper_bound_iqr)

# Print summary
print(f"Number of outliers (IQR method): {df.is_outlier_iqr.sum()}")
print(f"Lower bound: {lower_bound_iqr}")
print(f"Upper bound: {upper_bound_iqr}")

# Show some examples of outliers
print("\nExamples of outliers:")
print(df[df.is_outlier_iqr].sort_values('log_len_texto_bruto', ascending=False)[['log_len_texto_bruto', 'ramo_direito']].head())

# Plot distribution with outliers highlighted
p1 = (
    ggplot(data=df)
    + geom_histogram(aes(x='len_texto_bruto', fill='is_outlier_iqr'), bins=100)
    + scale_x_log10()
    + labs(title="Distribution of Text Lengths with Outliers Highlighted")
)

p1.show()

  sqr = _ensure_numeric((avg - values) ** 2)


Number of outliers (IQR method): 145
Lower bound: 4151.152529155984
Upper bound: 208059.2138038047

Examples of outliers:
      log_len_texto_bruto                                       ramo_direito
1336             5.709093                               [DIREITO À EDUCAÇÃO]
3969             5.632022           [DIREITO PROCESSUAL CIVIL E DO TRABALHO]
3048             5.524785  [DIREITO PROCESSUAL CIVIL E DO TRABALHO, DIREI...
6809             5.487851          [DIREITO PENAL, DIREITO PROCESSUAL PENAL]
2197             5.478822                               [DIREITO À EDUCAÇÃO]


In [56]:
label_len_df = pd.DataFrame(
    {label: np.median(df[df[label] == 1]['len_texto_bruto']) for label in labels.label.to_list()},
    index=[0]
).T.reset_index().rename(columns={'index':'label',0:'median_len'})

label_len_df = pd.merge(labels, label_len_df, left_on='label', right_on='label', how='inner')

In [57]:
(
    ggplot(label_len_df, aes(x='label', y='median_len')) 
    + geom_bar(stat='identity')
    + ggsize(1500, 1000)
)

In [58]:
small_text_labels = compute_percentages(df[df['len_texto_bruto']< lower_bound_iqr].reset_index(drop=True), 'ramo_direito')
small_text_labels

Unnamed: 0,label,count,percentage
0,DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DI...,34,26.98
1,DIREITO TRIBUTÁRIO,21,16.67
2,DIREITO PROCESSUAL CIVIL E DO TRABALHO,18,14.29
3,DIREITO CIVIL,16,12.7
4,DIREITO PREVIDENCIÁRIO,13,10.32
5,DIREITO PROCESSUAL PENAL,11,8.73
6,DIREITO PENAL,5,3.97
7,DIREITO DO CONSUMIDOR,4,3.17
8,DIREITO DO TRABALHO,4,3.17


In [59]:
df[df['len_texto_bruto']< lower_bound_iqr].shape

(94, 30)

In [60]:
df[df['len_texto_bruto']< lower_bound_iqr].sort_values(by='len_texto_bruto', ascending=False).reset_index().texto_bruto.head(90)

0     Pio\n\no Taíse Cândido Nunes - Advogada\n\n: N...
1     Este documento é cópia do original assinado di...
2     EXCELENTÍSSIMO SENHOR DESEMBARGADOR\nPRESIDENT...
3     | fi\nPROCURADORIA-GERAL DO DISTRITO FEDERAL I...
4     Mene a AME\n\nJl y PAULO PADILHA - ADVOCACIA\n...
5     (e-STJ Fl.447)\n\nEXMO. SR. DR. DESEMBARGADOR ...
6     EXCELENTÍSSIMO SENHOR DESEMBARGADOR VICE PRESI...
7     Assinado eletronicamente por: ALINE ALVES DE L...
8     Assinado eletronicamente por: JULIANA BORGES D...
9     EXMO. SR. DR. DESEMBARGADOR PRESIDENTE DO\nEGR...
10    AJA O ANA VUV2UJITAC.CVEUT.O.LU.VVLZ2Z n" hel,...
11    fis. 313\n\nEXCELENTÍSSIMO SENHOR DOUTOR PRESI...
12    fis. 139\n\nSiqueira Castro Advogados a] *\nPr...
13    Este documento é cópia do original assinado di...
14    EXCELENTÍSSIMO SENHOR DOUTOR J UIZ PRESIDENTE ...
15    AO JUIZO DA 4º TURMA RECURSAL FEDERAL DE SÃO P...
16    EXCELENTÍSSIMO SENHOR PRESIDENTE DO TRIBUNAL R...
17    ru 4i n i . : LEONARDO SOUZA : e\nvd JEN E

## Sample Data
Let us extract a sample of the dataframe to use for testing.


In [72]:
# from app.main import app
import sys
import os

# Add the project root directory t
project_root = os.path.abspath('..')  # Adjust this path based on your notebook location
sys.path.append(project_root)
from app.text_processing import clean_text
# For each label, sample one row that has that label
samples = []
for label in labels['label']:
    # Get rows with this label
    label_rows = df[df[label] == 1]
    if len(label_rows) > 0:
        # Sample one row with fixed seed for reproducibility
        sample = label_rows.sample(n=1, random_state=3)
        samples.append(sample)
    
# Combine all samples into one dataframe
sample_df = pd.concat(samples, axis=0)
sample_df= sample_df[['texto_bruto','ramo_direito']]
sample_df['clean_text'] = sample_df['texto_bruto'].apply(clean_text)

In [74]:
sample_df.to_parquet('../tests/sample_data/sample_data.parquet')