In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [2]:
# Load dataset
df = pd.read_csv("products.csv")

In [3]:
# Parse JSON specifications into a structured DataFrame
def parse_specs(spec_str):
    try:
        return json.loads(spec_str.replace("'", '"'))
    except:
        return {}

In [4]:
df['specs'] = df['specifications'].apply(parse_specs)
specs_df = pd.json_normalize(df['specs'])
df = pd.concat([df, specs_df], axis=1)

In [5]:
df.head()

Unnamed: 0,product_id,product_name,category,specifications,specs,voltage_rating,width,color,temperature_range,voltage,...,cordless,max_wattage,smart_home_compatible,wattage,base_type,lumens,color_temp,max_torque,battery_type,weight
0,P1001,3M 2.5mm Electrical Tape,Electrical Supplies,"{'voltage_rating': '600V', 'width': '19mm', 'c...","{'voltage_rating': '600V', 'width': '19mm', 'c...",600V,19mm,Black,-18C to 105C,,...,,,,,,,,,,
1,P1002,Hubbell 20A Duplex Receptacle,Electrical Supplies,"{'voltage': '120V', 'current': '20A', 'color':...","{'voltage': '120V', 'current': '20A', 'color':...",,,White,,120V,...,,,,,,,,,,
2,P1003,Southwire 12/2 NM-B Cable,Electrical Supplies,"{'gauge': '12 AWG', 'conductor_material': 'Cop...","{'gauge': '12 AWG', 'conductor_material': 'Cop...",600V,,,,,...,,,,,,,,,,
3,P1004,Leviton Decora Smart Switch,Electrical Supplies,"{'voltage': '120V', 'wifi': 'Yes', 'load_type'...","{'voltage': '120V', 'wifi': 'Yes', 'load_type'...",,,,,120V,...,,,,,,,,,,
4,P1005,Klein Tools Diagonal Cutters,Tools,"{'length': '7 inch', 'material': 'High-carbon ...","{'length': '7 inch', 'material': 'High-carbon ...",,,,,,...,,,,,,,,,,


## Handlish missing values & Normalize

In [7]:
# Fill missing numerical specs with median
num_cols = specs_df.select_dtypes(include=np.number).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

In [8]:
# Fill missing categorical specs with mode
cat_cols = specs_df.select_dtypes(exclude=np.number).columns
df[cat_cols] = df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

In [9]:
# Normalize numerical features
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

ValueError: at least one array or dtype is required

## Featue Encoding

In [None]:
# One-hot encode categorical features
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_cats = encoder.fit_transform(df[cat_cols])
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(cat_cols))

In [None]:
# Combine all features
final_features = pd.concat([df[num_cols], encoded_cats_df], axis=1)

## Product Matching Model - Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Compute pairwise cosine similarity
similarity_matrix = cosine_similarity(final_features)

In [None]:
# Convert to DataFrame for easy lookup
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=df['product_id'],
    columns=df['product_id']
)

## Implement Similar Product Finder

In [None]:
def find_similar_products(product_id, n=3):
    """Returns top n most similar products based on specs."""
    similar_products = (
        similarity_df[product_id]
        .sort_values(ascending=False)
        .iloc[1:n+1]  # Skip self-match
        .index
        .tolist()
    )
    return similar_products

## Recommendation System

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Combine product name and specs into a text feature
df['text_features'] = df['product_name'] + " " + df['specifications']

In [None]:
# Vectorize text
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text_features'])

In [None]:
# Compute similarity
text_similarity = cosine_similarity(tfidf_matrix)

def recommend_alternatives(product_id, n=3):
    """Recommends alternatives using product descriptions."""
    idx = df.index[df['product_id'] == product_id].tolist()[0]
    sim_scores = list(enumerate(text_similarity[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]  # Skip self-match
    similar_products = [df.iloc[i[0]]['product_id'] for i in sim_scores]
    return similar_products

## Hybrid Approaches (Specks + Categories)

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
# Combine numerical specs and encoded categories
X = final_features.values

In [None]:
# Fit KNN model
knn = NearestNeighbors(n_neighbors=4, metric='cosine')
knn.fit(X)

In [None]:
def recommend_alternatives(product_id, n=3):
    """Recommends alternatives using KNN on feature space."""
    idx = df.index[df['product_id'] == product_id].tolist()[0]
    distances, indices = knn.kneighbors([X[idx]])
    similar_products = df.iloc[indices[0][1:n+1]]['product_id'].tolist()
    return similar_products