# With custom list

In [2]:
import pandas as pd

import warnings
# Suppress every Python warning for the rest of the session. This also hides
# any warnings emitted by the metric and binarizer calls further down.
warnings.filterwarnings("ignore")

# Results table for the "with custom list" run. The code below reads the
# columns 'Bode', 'vlt5', 'Google', 'BigScience', 'Facebook', 'your list'
# and 'aspect' from it.
df_final = pd.read_csv('Results_with_list.csv')

import nltk
import re

def extract_words(text):
    """Split ``text`` into lowercase word tokens with punctuation removed.

    Args:
        text: Raw cell value. Normally a string. A missing value (``None`` or
            float NaN, which is what pandas yields for an empty CSV cell by
            default) is treated as containing no words; any other non-string
            value is converted with ``str()`` before tokenising.

    Returns:
        list[str]: Tokens produced by ``nltk.word_tokenize``, lowercased, in
        their original order. Duplicates are kept.
    """
    if not isinstance(text, str):
        # NaN is the only value that compares unequal to itself.
        if text is None or text != text:
            return []
        text = str(text)
    # Drop every character that is neither a word character nor whitespace.
    stripped = re.sub(r'[^\w\s]', '', text)
    return [token.lower() for token in nltk.word_tokenize(stripped)]

def is_list_of_words(text):
    """Report whether every token of ``text`` is purely alphabetic.

    The text is tokenised with ``extract_words``. Returns ``True`` when each
    token passes ``str.isalpha`` (which is also the case when there are no
    tokens at all) and ``False`` as soon as one token does not.
    """
    for token in extract_words(text):
        if not token.isalpha():
            return False
    return True

# Tokenise each column of df_final into one list of unique lowercase words per
# row. Passing the tokens through set() removes repeated words, so the order
# of words inside a row carries no meaning.
results_bode = [list(set(extract_words(text))) for text in df_final['Bode'].tolist()]
results_vlt5 = [list(set(extract_words(text))) for text in df_final['vlt5'].tolist()]
results_google = [list(set(extract_words(text))) for text in df_final['Google'].tolist()]
results_bloom = [list(set(extract_words(text))) for text in df_final['BigScience'].tolist()]
results_face = [list(set(extract_words(text))) for text in df_final['Facebook'].tolist()]
results_your = [list(set(extract_words(text))) for text in df_final['your list'].tolist()]

# Reference labels that every other column is scored against.
default_text = [list(set(extract_words(text))) for text in df_final['aspect'].tolist()]

# Pool of every word seen in the reference or in any prediction column.
# It is built from the tokenised lists: flattening the raw 'aspect' and
# 'your list' strings would iterate them character by character and fill the
# vocabulary with single letters instead of words.
results_of_all = [
    word
    for rows in (
        default_text,
        results_your,
        results_bode,
        results_vlt5,
        results_google,
        results_bloom,
        results_face,
    )
    for row in rows
    for word in row
]
unique_elements_results = list(set(results_of_all))

# Prediction columns keyed by the name used in the evaluation loop.
resultados = {
    'your list': results_your,
    'bode': results_bode,
    'vlt5': results_vlt5,
    'google': results_google,
    'bloom': results_bloom,
    'face': results_face
}

def jaccard_similarity(list1, list2):
    """Return the Jaccard index of two collections, treated as sets.

    Args:
        list1: First iterable of hashable items.
        list2: Second iterable of hashable items.

    Returns:
        ``|A & B| / |A | B|`` for the de-duplicated inputs, or ``0`` when
        both inputs are empty (the union has no elements).
    """
    first, second = set(list1), set(list2)
    combined = first.union(second)
    if not combined:
        return 0
    shared = first.intersection(second)
    return len(shared) / len(combined)

from collections import Counter
import math

def counter_cosine_similarity(c1, c2):
    """Cosine similarity between two bag-of-words count mappings.

    Args:
        c1: Mapping from term to count (for example a ``collections.Counter``).
        c2: Mapping from term to count.

    Returns:
        float: Dot product of the two count vectors divided by the product of
        their Euclidean norms. Returns ``0.0`` when either vector has zero
        norm (for example an empty mapping) instead of raising
        ``ZeroDivisionError``.
    """
    # Terms missing from either mapping contribute zero to the dot product,
    # so only the shared keys need to be visited.
    dotprod = sum(c1[k] * c2[k] for k in c1.keys() & c2.keys())
    magA = math.sqrt(sum(count ** 2 for count in c1.values()))
    magB = math.sqrt(sum(count ** 2 for count in c2.values()))
    denominator = magA * magB
    if denominator == 0:
        # No direction to compare against; define the similarity as zero.
        return 0.0
    return dotprod / denominator

import numpy as np
from sklearn.metrics import hamming_loss, f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

# Fit the binarizer on the reference labels only, so the indicator matrix has
# one column per word that occurs in default_text.
# NOTE(review): per scikit-learn's MultiLabelBinarizer, labels that were not
# seen during fit are ignored by transform(), so predicted words that never
# occur in the reference do not appear in the classification report.
mlb = MultiLabelBinarizer()
y_true_binarized = mlb.fit_transform(default_text)

columns = ['your list', 'bode', 'vlt5', 'google', 'bloom', 'face']

for column in columns:
    predictions = resultados[column]
    jaccard_results = []
    cosine_results = []
    print(f'Cálculo da coluna {column}:')
    # Binarize the predicted labels with the vocabulary learned above.
    y_pred_binarized = mlb.transform(predictions)

    for reference, predicted in zip(default_text, predictions):
        jaccard_results.append(jaccard_similarity(reference, predicted))
        # An empty prediction would give a zero-length vector in the cosine
        # computation, so score it as the single placeholder token 'nan'.
        # A separate list is used so the shared prediction lists in
        # `resultados` are never modified by the evaluation.
        cosine_tokens = predicted if predicted else ['nan']
        counterA = Counter(reference)
        counterB = Counter(cosine_tokens)
        cosine_results.append(counter_cosine_similarity(counterA, counterB))

    # Both scores are averaged over all rows.
    print(f'Jaccard similarity: {sum(jaccard_results)/len(jaccard_results)}')
    print(f'Cosine similarity: {sum(cosine_results)/len(cosine_results)}')

    # Row i of the report corresponds to the label mlb.classes_[i].
    print(classification_report(y_true_binarized, y_pred_binarized))
    print('----------------------------------------------')

Cálculo da coluna your list:
Jaccard similarity: 0.7166386300897171
Cosine similarity: 0.8268663552215036
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       0.33      1.00      0.50         1
           2       0.89      0.57      0.70        14
           3       1.00      1.00      1.00        38
           4       0.50      1.00      0.67         3
           5       0.50      0.50      0.50         2
           6       1.00      1.00      1.00         1
           7       0.48      0.71      0.57        17
           8       1.00      1.00      1.00         4
           9       0.29      1.00      0.44         2
          10       0.80      1.00      0.89         4
          11       1.00      1.00      1.00        12
          12       0.60      1.00      0.75         9
          13       1.00      1.00      1.00         3
          14       0.62      1.00      0.77         5
          15       1.00      

# Without custom list

In [11]:
import pandas as pd

import warnings
# Suppress every Python warning for the rest of the session. This also hides
# any warnings emitted by the metric and binarizer calls further down.
warnings.filterwarnings("ignore")

# Results table for the "without custom list" run. The code below reads the
# columns 'Bode', 'keybert', 'vlt5', 'Google', 'BigScience', 'Facebook',
# 'merged' and 'aspect' from it.
df_final = pd.read_csv('Results_without_list.csv')

import nltk
import re

# Extract the words of a text.
def extract_words(text):
    """Split ``text`` into lowercase word tokens with punctuation removed.

    Args:
        text: Raw cell value. Normally a string. A missing value (``None`` or
            float NaN, which is what pandas yields for an empty CSV cell by
            default) is treated as containing no words; any other non-string
            value is converted with ``str()`` before tokenising.

    Returns:
        list[str]: Tokens produced by ``nltk.word_tokenize``, lowercased, in
        their original order. Duplicates are kept.
    """
    if not isinstance(text, str):
        # NaN is the only value that compares unequal to itself.
        if text is None or text != text:
            return []
        text = str(text)
    # Remove punctuation: drop every character that is neither a word
    # character nor whitespace.
    stripped = re.sub(r'[^\w\s]', '', text)
    # Tokenize, then convert each token to lowercase.
    return [token.lower() for token in nltk.word_tokenize(stripped)]

# Check whether the output is a list of words.
def is_list_of_words(text):
    """Report whether every token of ``text`` is purely alphabetic.

    The text is tokenised with ``extract_words``. Returns ``True`` when each
    token passes ``str.isalpha`` (which is also the case when there are no
    tokens at all) and ``False`` as soon as one token does not.
    """
    for token in extract_words(text):
        if not token.isalpha():
            return False
    return True
    
# Tokenise each column of df_final into one list of unique lowercase words per
# row. Passing the tokens through set() removes repeated words, so the order
# of words inside a row carries no meaning.
results_bode = [list(set(extract_words(text))) for text in df_final['Bode'].tolist()]
results_keybert = [list(set(extract_words(text))) for text in df_final['keybert'].tolist()]
results_vlt5 = [list(set(extract_words(text))) for text in df_final['vlt5'].tolist()]
results_google = [list(set(extract_words(text))) for text in df_final['Google'].tolist()]
results_bloom = [list(set(extract_words(text))) for text in df_final['BigScience'].tolist()]
results_face = [list(set(extract_words(text))) for text in df_final['Facebook'].tolist()]
results_merged = [list(set(extract_words(text))) for text in df_final['merged'].tolist()]

# Reference labels that every other column is scored against.
default_text = [list(set(extract_words(text))) for text in df_final['aspect'].tolist()]

# Pool of every word seen in the reference or in any prediction column.
# It is built from the tokenised lists: flattening the raw 'aspect' and
# 'merged' strings would iterate them character by character and fill the
# vocabulary with single letters instead of words. The Bode and KeyBERT
# predictions are included so the pool covers every evaluated column.
results_of_all = [
    word
    for rows in (
        default_text,
        results_merged,
        results_bode,
        results_keybert,
        results_vlt5,
        results_google,
        results_bloom,
        results_face,
    )
    for row in rows
    for word in row
]
unique_elements_results = list(set(results_of_all))

# Prediction columns keyed by the name used in the evaluation loop.
resultados = {
    'merged': results_merged,
    'bode': results_bode,
    'vlt5': results_vlt5,
    'google': results_google,
    'bloom': results_bloom,
    'face': results_face,
    'keybert': results_keybert
}

def jaccard_similarity(list1, list2):
    """Return the Jaccard index of two collections, treated as sets.

    Args:
        list1: First iterable of hashable items.
        list2: Second iterable of hashable items.

    Returns:
        ``|A & B| / |A | B|`` for the de-duplicated inputs, or ``0`` when
        both inputs are empty (the union has no elements).
    """
    first, second = set(list1), set(list2)
    combined = first.union(second)
    if not combined:
        return 0
    shared = first.intersection(second)
    return len(shared) / len(combined)

from collections import Counter
import math

def counter_cosine_similarity(c1, c2):
    """Cosine similarity between two bag-of-words count mappings.

    Args:
        c1: Mapping from term to count (for example a ``collections.Counter``).
        c2: Mapping from term to count.

    Returns:
        float: Dot product of the two count vectors divided by the product of
        their Euclidean norms. Returns ``0.0`` when either vector has zero
        norm (for example an empty mapping) instead of raising
        ``ZeroDivisionError``.
    """
    # Terms missing from either mapping contribute zero to the dot product,
    # so only the shared keys need to be visited.
    dotprod = sum(c1[k] * c2[k] for k in c1.keys() & c2.keys())
    magA = math.sqrt(sum(count ** 2 for count in c1.values()))
    magB = math.sqrt(sum(count ** 2 for count in c2.values()))
    denominator = magA * magB
    if denominator == 0:
        # No direction to compare against; define the similarity as zero.
        return 0.0
    return dotprod / denominator

import numpy as np
from sklearn.metrics import hamming_loss, f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

# Fit the binarizer on the reference labels only, so the indicator matrix has
# one column per word that occurs in default_text.
# NOTE(review): per scikit-learn's MultiLabelBinarizer, labels that were not
# seen during fit are ignored by transform(), so predicted words that never
# occur in the reference do not appear in the classification report.
mlb = MultiLabelBinarizer()
y_true_binarized = mlb.fit_transform(default_text)

columns = ['merged', 'bode', 'vlt5', 'google', 'bloom', 'face', 'keybert']

for column in columns:
    predictions = resultados[column]
    jaccard_results = []
    cosine_results = []
    print(f'Cálculo da coluna {column}:')
    # Binarize the predicted labels with the vocabulary learned above.
    y_pred_binarized = mlb.transform(predictions)

    for reference, predicted in zip(default_text, predictions):
        jaccard_results.append(jaccard_similarity(reference, predicted))
        # An empty prediction would give a zero-length vector in the cosine
        # computation, so score it as the single placeholder token 'nan'.
        # A separate list is used so the shared prediction lists in
        # `resultados` are never modified by the evaluation.
        cosine_tokens = predicted if predicted else ['nan']
        counterA = Counter(reference)
        counterB = Counter(cosine_tokens)
        cosine_results.append(counter_cosine_similarity(counterA, counterB))

    # Both scores are averaged over all rows.
    print(f'Jaccard similarity: {sum(jaccard_results)/len(jaccard_results)}')
    print(f'Cosine similarity: {sum(cosine_results)/len(cosine_results)}')

    # Row i of the report corresponds to the label mlb.classes_[i].
    print(classification_report(y_true_binarized, y_pred_binarized))
    print('----------------------------------------------')

Cálculo da coluna merged:
Jaccard similarity: 0.17964602926184337
Cosine similarity: 0.35225444132027744
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.00      0.00      0.00         1
           2       0.89      0.57      0.70        14
           3       1.00      1.00      1.00        38
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1
           7       0.48      0.71      0.57        17
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         4
          11       0.00      0.00      0.00        12
          12       0.60      1.00      0.75         9
          13       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         5
          15       0.00      0