In [1]:
from gensim.models import Word2Vec
import numpy as np
from sklearn.decomposition import PCA

In [2]:
models = {
    "lines": "new-york-sl-tuple-geoc2vec-μ90-pois-distilw2v-lines_information-pfp-c.model",
    "points": "new-york-sl-tuple-geoc2vec-μ90-pois-distilw2v-points_information-pfp-c.model",
    "roads": "new-york-sl-tuple-geoc2vec-μ90-pois-distilw2v-roads_information-pfp-c.model",
    "polygons": "new-york-sl-tuple-geoc2vec-μ90-pois-distilw2v-polygons_information-pfp-c.model"
}

In [12]:
target_dim = 512

word_embeddings_pca = {}

for model_name, model_path in models.items():
    print(f"Processando modelo: {model_name}")

    #Carrega o modelo
    model = Word2Vec.load(model_path)

    #Exporta o vocabulário do modelo 
    words = list(model.wv.vocab.keys())
    word_vectors = np.array([model.wv[word] for word in words])
    n_samples, n_features = word_vectors.shape
    #n_components = min(target_dim, n_samples)
    n_components = target_dim
    
    print(f"Reduzindo de {n_features} para {n_components} dimensões...")

    #Faz o redimensionamento dos vetores do vocabulário
    pca = PCA(n_components=n_components)
    word_vectors_reduced = pca.fit_transform(word_vectors)
    
    #Treina e salva o modelo redimensionado
    new_model = Word2Vec(size=n_components, window=model.window, min_count=model.min_count)
    new_model.build_vocab([words])
    new_model.wv.vectors = word_vectors_reduced
    new_model.save(f"./PCA/word2vec_reduced_{model_name}_{target_dim}.model")
    print(f"{model_name}: Antes {word_vectors.shape}, Depois {word_vectors_reduced.shape} - Modelo salvo como word2vec_reduced_{model_name}_128.model")

print("Processamento concluído e modelos salvos!")

Processando modelo: lines
Reduzindo de 768 para 28 dimensões...
lines: Antes (436, 768), Depois (436, 28) - Modelo salvo como word2vec_reduced_lines_128.model
Processando modelo: points
Reduzindo de 768 para 28 dimensões...


  new_model = Word2Vec(size=n_components, window=model.window, min_count=model.min_count)
  new_model = Word2Vec(size=n_components, window=model.window, min_count=model.min_count)


points: Antes (575, 768), Depois (575, 28) - Modelo salvo como word2vec_reduced_points_128.model
Processando modelo: roads
Reduzindo de 768 para 28 dimensões...
roads: Antes (281, 768), Depois (281, 28) - Modelo salvo como word2vec_reduced_roads_128.model
Processando modelo: polygons
Reduzindo de 768 para 28 dimensões...


  new_model = Word2Vec(size=n_components, window=model.window, min_count=model.min_count)
  new_model = Word2Vec(size=n_components, window=model.window, min_count=model.min_count)


polygons: Antes (655, 768), Depois (655, 28) - Modelo salvo como word2vec_reduced_polygons_128.model
Processamento concluído e modelos salvos!


In [13]:
model_lines = Word2Vec.load("ew-york-sl-tuple-geoc2vec-pois-distilw2v-lines_information-pfp-c.model")
model_lines_pca = Word2Vec.load("./PCA/word2vec_reduced_lines_32.model")
model_pca = Word2Vec.load("./PCA/word2vec_reduced_lines_28.model")

In [9]:
model_lines.wv.most_similar(positive=["Bookstore"])

[('Library', 0.9702208638191223),
 ('African Restaurant', 0.9685879349708557),
 ('Dim Sum Restaurant', 0.9685351848602295),
 ('Bakery', 0.9682315587997437),
 ('German Restaurant', 0.9680699706077576),
 ('Hotel', 0.9679540395736694),
 ('Restaurant', 0.9673723578453064),
 ('Theater', 0.9673452377319336),
 ('Clothing Store', 0.9669424891471863),
 ('Gift Shop', 0.9665777683258057)]

In [16]:
model_lines_pca.wv.most_similar(positive=["Hotel"])

[('Restaurant', 0.8666859269142151),
 ('Casino', 0.8360579013824463),
 ('Bar', 0.8077646493911743),
 ('Bookstore', 0.7884572744369507),
 ('Café', 0.7779343724250793),
 ('Parking', 0.7732722163200378),
 ('City', 0.7717400193214417),
 ('Park', 0.7676903009414673),
 ('Diner', 0.7650327682495117),
 ('Office', 0.7627938389778137)]

In [15]:
model_pca.wv.most_similar(positive=["Hotel"])

[('Restaurant', 0.8893362879753113),
 ('Bookstore', 0.8595757484436035),
 ('Bar', 0.845493495464325),
 ('Casino', 0.8381418585777283),
 ('Café', 0.8113797903060913),
 ('Diner', 0.8068528175354004),
 ('City', 0.8044070601463318),
 ('Neighborhood', 0.7953543663024902),
 ('German Restaurant', 0.7822244167327881),
 ('Office', 0.7767152190208435)]