In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

In [11]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())  # checks if GPU is accessible



2.9.0+cpu
False


In [12]:
os.chdir(r"C:\Users\HP\furniture-recommender")



In [15]:
import pandas as pd

df = pd.read_csv(r"C:\Users\HP\furniture-recommender\data\furniture_data_clean.csv")
print(f"Loaded {len(df)} rows")
df.head()


Loaded 312 rows


Unnamed: 0,title,brand,description,price,categories,images,manufacturer,package_dimensions,country_of_origin,material,color,uniq_id
0,"GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...",GOYMFK,"multiple shoes, coats, hats, and other items E...",24.99,"['Home & Kitchen', 'Storage & Organization', '...",['https://m.media-amazon.com/images/I/416WaLx1...,GOYMFK,"2.36""D x 7.87""W x 21.6""H",China,Metal,White,02593e81-5c09-5069-8516-b0b29f439ded
1,"subrtex Leather ding Room, Dining Chairs Set o...",subrtex,subrtex Dining chairs Set of 2,53.99,"['Home & Kitchen', 'Furniture', 'Dining Room F...",['https://m.media-amazon.com/images/I/31SejUEW...,Subrtex Houseware INC,"18.5""D x 16""W x 35""H",Unknown,Sponge,Black,5938d217-b8c5-5d3e-b1cf-e28e340f292e
2,Plant Repotting Mat MUYETOL Waterproof Transpl...,MUYETOL,Plant Repotting Mat MUYETOL Waterproof Transpl...,5.98,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/41RgefVq...,MUYETOL,"26.8""L x 26.8""W",Unknown,Polyethylene,Green,b2ede786-3f51-5a45-9a5b-bcf856958cd8
3,"Pickleball Doormat, Welcome Doormat Absorbent ...",VEWETOL,The decorative doormat features a subtle textu...,13.99,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/61vz1Igl...,Contrence,"24""L x 16""W",Unknown,Rubber,A5589,8fd9377b-cfa6-5f10-835c-6b8eca2816b5
4,JOIN IRON Foldable TV Trays for Eating Set of ...,JOIN IRON Store,Set of Four Folding Trays With Matching Storag...,89.99,"['Home & Kitchen', 'Furniture', 'Game & Recrea...",['https://m.media-amazon.com/images/I/41p4d4VJ...,,"18.9""D x 14.2""W x 26""H",Unknown,Iron,Grey Set of 4,bdc9aa30-9439-50dc-8e89-213ea211d66a


In [16]:
# Cell 3: Create combined text features
# Combining multiple text fields gives better recommendations
# We weight title and description more heavily than other fields
def create_text_features(row):
    """
    Combine multiple product features into a single text representation.
    This helps the model understand product similarity better.
    """
    features = []
    
    # Add title (most important)
    if pd.notna(row['title']):
        features.append(str(row['title']) * 2)  # Repeat for emphasis
    
    # Add description
    if pd.notna(row['description']):
        features.append(str(row['description']))
    
    # Add category (important for grouping)
    if pd.notna(row['categories']):
        features.append(str(row['categories']) * 2)
    
    # Add other features
    for col in ['brand', 'material', 'color']:
        if pd.notna(row[col]):
            features.append(str(row[col]))
    
    return ' '.join(features)

df['combined_features'] = df.apply(create_text_features, axis=1)
print("Sample combined features:")
print(df['combined_features'].iloc[0][:200] + "...")



Sample combined features:
GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Double Hooks For Living Room, Bathroom, HallwayGOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Dou...


In [17]:
# Cell 4: Create TF-IDF embeddings
# TF-IDF helps identify important words for each product
print("\nCreating TF-IDF embeddings...")
tfidf = TfidfVectorizer(
    max_features=500,  # Use top 500 features
    stop_words='english',  # Remove common English words
    ngram_range=(1, 2)  # Use single words and word pairs
)

tfidf_matrix = tfidf.fit_transform(df['combined_features'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# Save TF-IDF model
os.makedirs('../backend/models', exist_ok=True)
with open('../backend/models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
print("TF-IDF vectorizer saved")


Creating TF-IDF embeddings...
TF-IDF matrix shape: (312, 500)
TF-IDF vectorizer saved


In [18]:
# Cell 5: Test recommendation system
# Let's verify our recommendations make sense
def get_recommendations_tfidf(product_idx, top_n=5):
    """
    Get similar products using TF-IDF and cosine similarity.
    This is a simple but effective recommendation approach.
    """
    # Calculate similarity between target product and all others
    cosine_sim = cosine_similarity(tfidf_matrix[product_idx], tfidf_matrix).flatten()
    
    # Get indices of most similar products (excluding the product itself)
    similar_indices = cosine_sim.argsort()[-top_n-1:-1][::-1]
    
    return similar_indices, cosine_sim[similar_indices]

# Test with first product
test_idx = 0
print(f"\nTest product: {df.iloc[test_idx]['title']}")
print(f"Category: {df.iloc[test_idx]['categories']}")
print(f"Price: ${df.iloc[test_idx]['price']:.2f}")
print("\nRecommendations:")

rec_indices, similarities = get_recommendations_tfidf(test_idx, top_n=5)
for idx, sim in zip(rec_indices, similarities):
    print(f"  - {df.iloc[idx]['title'][:50]}... (Similarity: {sim:.3f})")


Test product: GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Double Hooks For Living Room, Bathroom, Hallway
Category: ['Home & Kitchen', 'Storage & Organization', 'Clothing & Closet Storage', 'Shoe Organizers', 'Free Standing Shoe Racks']
Price: $24.99

Recommendations:
  - GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Me... (Similarity: 1.000)
  - Dscabomlg Foldable Shoe Storage Plastic Vertical S... (Similarity: 0.715)
  - LANTEFUL Shoe Rack Organizer Shoe Storage Cabinet ... (Similarity: 0.700)
  - Honey-Can-Do 3-Tier Nesting Bamboo Shoe Rack SHO-0... (Similarity: 0.668)
  - sogesfurniture 5 Tier Free Standing Wooden Shoe St... (Similarity: 0.645)


In [19]:
# Cell 6: Create sentence embeddings using SentenceTransformer
# These embeddings capture semantic meaning better than TF-IDF
print("\nCreating sentence embeddings...")
print("Loading SentenceTransformer model (this may take a minute)...")

# Use a lightweight model for speed
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings for all products
embeddings = model.encode(
    df['combined_features'].tolist(),
    show_progress_bar=True,
    batch_size=32
)

print(f"Embeddings shape: {embeddings.shape}")

# Save embeddings
np.save('../backend/models/sentence_embeddings.npy', embeddings)
print("Sentence embeddings saved")


Creating sentence embeddings...
Loading SentenceTransformer model (this may take a minute)...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Embeddings shape: (312, 384)
Sentence embeddings saved


In [20]:
# Cell 7: Test semantic search
def get_recommendations_semantic(product_idx, embeddings, top_n=5):
    """
    Get similar products using sentence embeddings.
    This captures semantic similarity better than keyword matching.
    """
    # Calculate cosine similarity
    similarities = cosine_similarity([embeddings[product_idx]], embeddings).flatten()
    
    # Get top similar products
    similar_indices = similarities.argsort()[-top_n-1:-1][::-1]
    
    return similar_indices, similarities[similar_indices]

# Test semantic search
print(f"\nSemantic search for: {df.iloc[test_idx]['title']}")
rec_indices, similarities = get_recommendations_semantic(test_idx, embeddings, top_n=5)
for idx, sim in zip(rec_indices, similarities):
    print(f"  - {df.iloc[idx]['title'][:50]}... (Similarity: {sim:.3f})")



Semantic search for: GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Metal Shoe Cap Rack With 8 Double Hooks For Living Room, Bathroom, Hallway
  - GOYMFK 1pc Free Standing Shoe Rack, Multi-layer Me... (Similarity: 1.000)
  - LANTEFUL Shoe Rack Organizer Shoe Storage Cabinet ... (Similarity: 0.817)
  - sogesfurniture 5 Tier Free Standing Wooden Shoe St... (Similarity: 0.749)
  - Soerreo Shoe Slot Storage Box Adjustable Shoe Rack... (Similarity: 0.737)
  - Dscabomlg Foldable Shoe Storage Plastic Vertical S... (Similarity: 0.728)


In [21]:
# Cell 8: Create category classifier
# Simple category classification based on text features
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['category_encoded'] = le.fit_transform(df['categories'])

# Save label encoder
with open('../backend/models/label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)
print("Label encoder saved")


Label encoder saved


In [22]:
# Cell 9: Save product data for backend
# Save a subset of data needed for the API
product_data = df[[
    'uniq_id', 'title', 'brand', 'description', 
    'price', 'categories', 'images', 'material', 'color'
]].to_dict('records')

with open('../backend/models/product_data.pkl', 'wb') as f:
    pickle.dump(product_data, f)
print(f"Saved {len(product_data)} products for backend")

Saved 312 products for backend


In [23]:
# Cell 10: Performance summary
print("\n" + "="*50)
print("MODEL TRAINING COMPLETE")
print("="*50)
print(f"Total products: {len(df)}")
print(f"TF-IDF features: {tfidf_matrix.shape[1]}")
print(f"Embedding dimensions: {embeddings.shape[1]}")
print(f"Unique categories: {df['categories'].nunique()}")
print("\nSaved files:")
print("  - ../backend/models/tfidf_vectorizer.pkl")
print("  - ../backend/models/sentence_embeddings.npy")
print("  - ../backend/models/label_encoder.pkl")
print("  - ../backend/models/product_data.pkl")


MODEL TRAINING COMPLETE
Total products: 312
TF-IDF features: 500
Embedding dimensions: 384
Unique categories: 87

Saved files:
  - ../backend/models/tfidf_vectorizer.pkl
  - ../backend/models/sentence_embeddings.npy
  - ../backend/models/label_encoder.pkl
  - ../backend/models/product_data.pkl
