In [6]:
import string

import gensim.downloader as api
import numpy as np
from gensim.utils import simple_preprocess
from scipy.spatial.distance import cosine

In [4]:
# Name of the pretrained embedding model to fetch through gensim's downloader.
GLOVE_MODEL_NAME = "glove-wiki-gigaword-50"

# Loaded once here; the cells below read it as the global `model`.
model = api.load(GLOVE_MODEL_NAME)

def sentence_vector(sentence, model):
    """Return the mean embedding of the in-vocabulary words of ``sentence``.

    The sentence is lowercased and split on whitespace, and punctuation
    attached to either end of a token is stripped before the vocabulary
    lookup, so that e.g. ``"facebook."`` matches the entry ``"facebook"``.
    Punctuation inside a token (``"e-commerce"``) is left untouched.
    Tokens made only of punctuation become empty and are ignored.

    Args:
        sentence: Text to embed.
        model: Embedding lookup supporting ``token in model``,
            ``model[token]`` and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of all recognised tokens, or a
        zero vector of length ``model.vector_size`` when no token is
        recognised.
        NOTE(review): a zero vector has no direction, so a cosine-based
        measure on it is presumably undefined; callers may want to check.
    """
    tokens = (
        raw_token.strip(string.punctuation)
        for raw_token in sentence.lower().split()
    )
    vectors = [model[token] for token in tokens if token and token in model]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# Sentences to compare
s0 = "Mark zuckerberg owns the facebook company"
s1 = "Facebook company ceo is mark zuckerberg"
s2 = "Microsoft is owned by Bill gates"
s3 = "How to learn japanese"

# Mean word vector of each sentence. The individual vec_sN names are kept
# so that any other cell referring to them keeps working.
vec_s0, vec_s1, vec_s2, vec_s3 = sentence_vectors = [
    sentence_vector(sentence, model) for sentence in (s0, s1, s2, s3)
]

# Index pairs (0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3):
# every unordered pair of sentences exactly once.
pair_indices = [
    (first, second)
    for first in range(len(sentence_vectors))
    for second in range(first + 1, len(sentence_vectors))
]

# Similarity is 1 minus SciPy's cosine *distance*.
pair_similarities = [
    1 - cosine(sentence_vectors[first], sentence_vectors[second])
    for first, second in pair_indices
]

# Unpacked in the same order as pair_indices.
(
    similar_s0_s1,
    similar_s0_s2,
    similar_s0_s3,
    similar_s1_s2,
    similar_s1_s3,
    similar_s2_s3,
) = pair_similarities

for (first, second), score in zip(pair_indices, pair_similarities):
    print(f"Similaridade entre s{first} e s{second}: {score:.4f}")

Similaridade entre s0 e s1: 0.9659
Similaridade entre s0 e s2: 0.8659
Similaridade entre s0 e s3: 0.5878
Similaridade entre s1 e s2: 0.8596
Similaridade entre s1 e s3: 0.5718
Similaridade entre s2 e s3: 0.7226


In [7]:
# Sentence database searched by the similarity lookup below
frases = [
    "Mark Zuckerberg is the founder of Facebook.",
    "Facebook is one of the largest social media platforms.",
    "Bill Gates co-founded Microsoft.",
    "Microsoft develops software products like Windows and Office.",
    "Steve Jobs was the co-founder of Apple.",
    "Apple is known for its iPhones and Mac computers.",
    "Elon Musk is the CEO of SpaceX and Tesla.",
    "Tesla is a leader in electric vehicles.",
    "Google is a major player in the tech industry.",
    "Amazon is a giant in e-commerce and cloud computing.",
]

def frase_similaridade(frase, palavra):
    """Mean cosine similarity between a query word and the words of a sentence.

    The sentence is tokenised with gensim's ``simple_preprocess`` and only
    tokens present in the global ``model`` are scored. The query word is
    looked up exactly as given; if that form is not in the vocabulary, its
    lowercased form is tried instead, so a capitalised query such as
    ``"Apple"`` can match a lowercase vocabulary entry.

    Args:
        frase: Sentence whose words are compared against the query.
        palavra: Query word.

    Returns:
        The mean of ``1 - cosine distance`` between the query vector and each
        in-vocabulary sentence token, or ``0.0`` when the sentence has no
        in-vocabulary token.

    Raises:
        KeyError: Whatever ``model[...]`` raises when neither the given nor
            the lowercased query word is in the vocabulary (only reached if
            the sentence has at least one in-vocabulary token).
    """
    palavras_conhecidas = [w for w in simple_preprocess(frase) if w in model]
    if not palavras_conhecidas:
        return 0.0

    # Fall back to lowercase only when the exact form is unknown, so every
    # query that worked before still resolves to the same vector.
    if palavra not in model:
        palavra = palavra.lower()

    # Looked up once rather than on every loop iteration.
    vetor_consulta = model[palavra]
    return np.mean(
        [1 - cosine(vetor_consulta, model[w]) for w in palavras_conhecidas]
    )

def buscar_frases(word, top_n=5):
    """Return the sentences of ``frases`` most similar to ``word``.

    Each sentence is scored with ``frase_similaridade`` and the results are
    ordered from highest to lowest score. The sort is stable, so sentences
    with equal scores keep their order from ``frases``.

    Args:
        word: Query word.
        top_n: Maximum number of results to return. Defaults to 5.

    Returns:
        A list of at most ``top_n`` ``(sentence, score)`` tuples, best first.

    Raises:
        ValueError: If ``top_n`` is negative (a negative slice bound would
            silently drop results from the end instead of limiting them).
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    pontuadas = [(frase, frase_similaridade(frase, word)) for frase in frases]
    pontuadas.sort(key=lambda par: par[1], reverse=True)
    return pontuadas[:top_n]

# Look up the sentences closest to the query word and print one line per hit
resultados = buscar_frases("apple")
for frase, similaridade in resultados:
    linha = f"Frase: {frase} | Similaridade: {similaridade:.4f}"
    print(linha)

Frase: Apple is known for its iPhones and Mac computers. | Similaridade: 0.5630
Frase: Microsoft develops software products like Windows and Office. | Similaridade: 0.5534
Frase: Amazon is a giant in e-commerce and cloud computing. | Similaridade: 0.4552
Frase: Steve Jobs was the co-founder of Apple. | Similaridade: 0.4524
Frase: Google is a major player in the tech industry. | Similaridade: 0.4374
