In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import faiss
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
df = pd.read_csv("skincare_products.csv")

# Tokenize the text data
df['tokens'] = df['ingredients'].apply(word_tokenize)

# Train a Word2Vec model
model_w2v = Word2Vec(sentences=df['tokens'].tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Extract the word vectors
word_vectors = model_w2v.wv.get_normed_vectors()  # Normalized vectors for better cosine similarity calculation

# Apply PCA for dimensionality reduction
pca = PCA(n_components=50)  # Reduce dimensions to 50
reduced_vectors = pca.fit_transform(word_vectors)

# Create a FAISS index for L2 distance (change to IndexFlatIP for cosine similarity)
dim = reduced_vectors.shape[1]
index = faiss.IndexFlatL2(dim)  # Initialize the index
index.add(reduced_vectors.astype(np.float32))  # Add vectors to the index

# Example query - using the vector of the first ingredient
query_vector = reduced_vectors[0].reshape(1, dim)
D, I = index.search(query_vector, 5)  # Search for the 5 nearest neighbors

# Print the results
print("Distances (squared L2):", D)
print("Indices of nearest vectors:", I)

# Calculate cosine similarity matrix for demonstration (optional, might be computationally intensive)
similarity_matrix = cosine_similarity(reduced_vectors)

# Display similarity matrix
print("Cosine similarity matrix:\n", similarity_matrix)


ModuleNotFoundError: No module named 'gensim'

In [3]:
import gensim
print("Gensim version:", gensim.__version__)

ModuleNotFoundError: No module named 'gensim'

In [4]:
import sys
print("Python version:", sys.version)
print("Python executable:", sys.executable)


Python version: 3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]
Python executable: c:\Users\goern_y\anaconda3\python.exe
