In [1]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.decomposition import PCA

In [2]:
models = {
    "lines": "new-york-sl-tuple-geoc2vec-μ90-pois-distilw2v-lines_information-pfp-c.model",
    "points": "new-york-sl-tuple-geoc2vec-μ90-pois-distilw2v-points_information-pfp-c.model",
    "roads": "new-york-sl-tuple-geoc2vec-μ90-pois-distilw2v-roads_information-pfp-c.model",
    "polygons": "new-york-sl-tuple-geoc2vec-μ90-pois-distilw2v-polygons_information-pfp-c.model"
}

In [13]:
target_dim = 35

word_embeddings_pca = {}

for model_name, model_path in models.items():
    print(f"Processando modelo: {model_name}")

    #Carrega o modelo
    model = Word2Vec.load(model_path)

    #Exporta o vocabulário do modelo 
    words = list(model.wv.vocab.keys())
    word_vectors = np.array([model.wv[word] for word in words])
    n_samples, n_features = word_vectors.shape
    #n_components = min(target_dim, n_samples)
    n_components = target_dim
    
    print(f"Reduzindo de {n_features} para {n_components} dimensões...")

    #Faz o redimensionamento dos vetores do vocabulário
    pca = PCA(n_components=n_components)
    word_vectors_reduced = pca.fit_transform(word_vectors)
    
    #Treina e salva o modelo redimensionado
    new_model = Word2Vec(size=n_components, window=model.window, min_count=model.min_count)
    new_model.build_vocab([words])
    new_model.wv.vectors = word_vectors_reduced
    new_model.save(f"./PCA/bert_reduced_{model_name}_{target_dim}.model")
    print(f"{model_name}: Antes {word_vectors.shape}, Depois {word_vectors_reduced.shape} - Modelo salvo como word2vec_reduced_{model_name}_128.model")

print("Processamento concluído e modelos salvos!")

Processando modelo: lines
Reduzindo de 768 para 35 dimensões...
lines: Antes (436, 768), Depois (436, 35) - Modelo salvo como word2vec_reduced_lines_128.model
Processando modelo: points
Reduzindo de 768 para 35 dimensões...


  new_model = Word2Vec(size=n_components, window=model.window, min_count=model.min_count)
  new_model = Word2Vec(size=n_components, window=model.window, min_count=model.min_count)


points: Antes (575, 768), Depois (575, 35) - Modelo salvo como word2vec_reduced_points_128.model
Processando modelo: roads
Reduzindo de 768 para 35 dimensões...
roads: Antes (281, 768), Depois (281, 35) - Modelo salvo como word2vec_reduced_roads_128.model
Processando modelo: polygons
Reduzindo de 768 para 35 dimensões...


  new_model = Word2Vec(size=n_components, window=model.window, min_count=model.min_count)
  new_model = Word2Vec(size=n_components, window=model.window, min_count=model.min_count)


polygons: Antes (655, 768), Depois (655, 35) - Modelo salvo como word2vec_reduced_polygons_128.model
Processamento concluído e modelos salvos!


In [14]:
model_lines = Word2Vec.load("new-york-sl-tuple-geoc2vec-μ90-pois-distilw2v-points_information-pfp-c.model")
model_lines_pca = Word2Vec.load("./PCA/bert_reduced_lines_256.model")
model_pca = Word2Vec.load("./PCA/bert_reduced_lines_35.model")

In [7]:
model_lines.wv.most_similar(positive=["Bookstore"])

[('Café', 0.9718804359436035),
 ('Bakery', 0.9702622294425964),
 ('Restaurant', 0.9688897728919983),
 ('Hotel', 0.9677895307540894),
 ('Factory', 0.9623737335205078),
 ('Library', 0.962039589881897),
 ('Diner', 0.9619094133377075),
 ('Brewery', 0.9616758823394775),
 ('Mall', 0.9587855935096741),
 ('Office', 0.9583530426025391)]

In [8]:
model_lines_pca.wv.most_similar(positive=["Bookstore"])

[('Bakery', 0.8067384362220764),
 ('Restaurant', 0.7936170697212219),
 ('Café', 0.7855138182640076),
 ('Mall', 0.7708073854446411),
 ('Library', 0.7672553062438965),
 ('Hotel', 0.7636808156967163),
 ('Playground', 0.7498050928115845),
 ('Church', 0.7472808361053467),
 ('Coffee Shop', 0.7345585227012634),
 ('Synagogue', 0.7336888313293457)]

In [15]:
model_pca.wv.most_similar(positive=["Bookstore"])

[('Bakery', 0.9078022837638855),
 ('Hotel', 0.8913036584854126),
 ('Library', 0.8882433176040649),
 ('Restaurant', 0.8853959441184998),
 ('Café', 0.8729249238967896),
 ('Church', 0.8639187812805176),
 ('Mall', 0.8611268401145935),
 ('Synagogue', 0.8574628829956055),
 ('Playground', 0.8550222516059875),
 ('Casino', 0.8506343364715576)]