In [18]:
import pandas as pd

# Load the results table from CSV; the path is relative to the current working directory.
df_final = pd.read_csv('Results_b2w.csv')

import nltk
import re

def extract_words(text):
    """Split ``text`` into lowercase tokens with punctuation removed.

    Args:
        text: The text to tokenize. ``None`` and float NaN (the value pandas
            uses for an empty CSV cell) are treated as "no text". Any other
            non-string value is converted with ``str()`` before tokenizing.

    Returns:
        list[str]: The lowercased tokens in order of appearance, or ``[]``
        when ``text`` is missing.
    """
    # Missing values would otherwise make re.sub raise TypeError.
    # `text != text` is only true for NaN, so no extra import is needed.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if not isinstance(text, str):
        text = str(text)
    # Remove punctuation: drop every character that is neither a word
    # character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize, then normalize case.
    words = nltk.word_tokenize(text)
    return [word.lower() for word in words]

# Check whether the text tokenizes into purely alphabetic words
def is_list_of_words(text):
    """Return True when every token extracted from ``text`` is alphabetic.

    The text is tokenized with ``extract_words``; a text that yields no
    tokens at all is considered valid (returns True).
    """
    for token in extract_words(text):
        if not token.isalpha():
            return False
    return True
    
def _unique_words_per_row(column):
    """Return the distinct word tokens of every row in ``df_final[column]``.

    Each cell is passed through ``str()`` before tokenizing so that a
    non-string cell (for example the float NaN of an empty CSV field) cannot
    crash ``extract_words``. This is the treatment the 'keybert' and
    'BigScience' columns already received; it is now applied to every column.
    For cells that are already strings ``str()`` is a no-op.

    Args:
        column: Name of the column in ``df_final`` to process.

    Returns:
        list[list[str]]: One list of de-duplicated tokens per row, in row
        order. Token order inside each inner list is arbitrary because it
        comes from a ``set``.
    """
    return [list(set(extract_words(str(text)))) for text in df_final[column].tolist()]


# One token list per row for each keyword source.
results_bode = _unique_words_per_row('Bode')
results_keybert = _unique_words_per_row('keybert')
results_vlt5 = _unique_words_per_row('vlt5')
results_google = _unique_words_per_row('Google')
results_bloom = _unique_words_per_row('BigScience')
results_face = _unique_words_per_row('Facebook')
results_merged = _unique_words_per_row('merged')

# Tokens of the product name, used later as the reference for the similarity scores.
default_text = _unique_words_per_row('product_name')

# Map each short source label to its per-row token lists.
# The lists are stored by reference, not copied.
resultados = dict([
    ('merged', results_merged),
    ('bode', results_bode),
    ('vlt5', results_vlt5),
    ('google', results_google),
    ('bloom', results_bloom),
    ('face', results_face),
    ('keybert', results_keybert),
])

def jaccard_similarity(list1, list2):
    """Jaccard index of two collections, compared as sets.

    Args:
        list1: First iterable of hashable items.
        list2: Second iterable of hashable items.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float, or ``0`` when both inputs are empty.
    """
    first, second = set(list1), set(list2)
    combined = first.union(second)
    if not combined:
        # Both inputs are empty: define the similarity as 0 rather than divide by zero.
        return 0
    return len(first.intersection(second)) / len(combined)

from collections import Counter
import math

def counter_cosine_similarity(c1, c2):
    """Cosine similarity between two term-count mappings.

    Args:
        c1: Mapping of term -> count (for example a ``collections.Counter``).
        c2: Mapping of term -> count.

    Returns:
        float: ``dot(c1, c2) / (|c1| * |c2|)``. Returns ``0.0`` when either
        mapping is empty or has zero magnitude, instead of raising
        ``ZeroDivisionError``.
    """
    # Terms missing from c2 contribute nothing, so iterating c1 is enough.
    dotprod = sum(count * c2.get(term, 0) for term, count in c1.items())
    magA = math.sqrt(sum(count ** 2 for count in c1.values()))
    magB = math.sqrt(sum(count ** 2 for count in c2.values()))
    denominator = magA * magB
    if denominator == 0:
        # An empty (or all-zero) vector has no direction; treat it as having
        # no similarity to anything.
        return 0.0
    return dotprod / denominator

In [19]:
columns = ['merged', 'bode', 'vlt5', 'google', 'bloom', 'face', 'keybert']


def _mean(values):
    """Arithmetic mean of ``values``; NaN when ``values`` is empty."""
    return sum(values) / len(values) if values else float('nan')


# For every keyword source, compare each row's tokens against the product-name
# tokens and report the average Jaccard and cosine similarity.
for column in columns:
    jaccard_results = []
    cosine_results = []
    print(f'Cálculo da coluna {column}:')

    for reference, predicted in zip(default_text, resultados[column]):
        jaccard_results.append(jaccard_similarity(reference, predicted))
        if reference and predicted:
            cosine_results.append(
                counter_cosine_similarity(Counter(reference), Counter(predicted))
            )
        else:
            # An empty side scores 0.0. The shared token lists are left
            # untouched: no 'nan' placeholder is appended, so re-running the
            # cell sees the same data, and an empty prediction cannot
            # accidentally match a product name containing the token "nan".
            cosine_results.append(0.0)

    print(f'Jaccard similarity: {_mean(jaccard_results)}')
    print(f'Cosine similarity: {_mean(cosine_results)}')
    print('---------------------------------')

Cálculo da coluna merged:
Jaccard similarity: 0.3814734430746512
Cosine similarity: 0.547244476991444
---------------------------------
Cálculo da coluna bode:
Jaccard similarity: 0.6992147269312387
Cosine similarity: 0.8148936026036624
---------------------------------
Cálculo da coluna vlt5:
Jaccard similarity: 0.4926394570409176
Cosine similarity: 0.6534910082418111
---------------------------------
Cálculo da coluna google:
Jaccard similarity: 0.30446125736186286
Cosine similarity: 0.49653704772664925
---------------------------------
Cálculo da coluna bloom:
Jaccard similarity: 0.5158924233379318
Cosine similarity: 0.6238169082939823
---------------------------------
Cálculo da coluna face:
Jaccard similarity: 0.015300393475320804
Cosine similarity: 0.030239538787426752
---------------------------------
Cálculo da coluna keybert:
Jaccard similarity: 0.3656985288073825
Cosine similarity: 0.5585302924615859
---------------------------------
