In [3]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Assuming df is your original DataFrame read from JSON
df = pd.read_json("ducks.json")

# Select relevant columns
df_selected = df[['duckDetails', 'additionalFeatures', 'productName']].copy()

# Extract specific values from nested dictionaries
df_selected['price'] = df_selected['duckDetails'].apply(lambda x: x['price'])
df_selected['style'] = df_selected['duckDetails'].apply(lambda x: x['style'])
df_selected['speed'] = df_selected['duckDetails'].apply(lambda x: x['speed'])
df_selected['size'] = df_selected['duckDetails'].apply(lambda x: x['size'])
df_selected['condition'] = df_selected['duckDetails'].apply(lambda x: x['condition'])
df_selected['buoyancy'] = df_selected['additionalFeatures'].apply(lambda x: x['buoyancy'])

# Drop columns that are no longer needed
df_selected.drop(['duckDetails', 'additionalFeatures', 'productName'], axis=1, inplace=True)

# Apply one-hot encoding to categorical variables
df_encoded = pd.get_dummies(df_selected, columns=['speed', 'style', 'size', 'condition', 'buoyancy'])

knn = NearestNeighbors(n_neighbors=3).fit(df_encoded)

In [7]:
selected_product_features = df_encoded.iloc[[1]]
print(selected_product_features)
selected_product_index = 1
distances, indices = knn.kneighbors(selected_product_features)
recommended_product_indices = indices.flatten()
recommended_products = df_selected.iloc[recommended_product_indices]

print("Selected Product:")
print(df_selected.iloc[selected_product_index])
print("\nRecommended Products:")
print(recommended_products)

   price  speed_average  speed_fast  speed_mystery  speed_slow  style_animal  \
1      5          False       False          False        True         False   

   style_classic  style_food  style_pirate  style_sports  size_large  \
1          False        True         False         False        True   

   size_medium  size_small  condition_new  condition_used  buoyancy_False  \
1        False       False           True           False            True   

   buoyancy_True  
1          False  
Selected Product:
price            5
style         food
speed         slow
size         large
condition      new
buoyancy     False
Name: 1, dtype: object

Recommended Products:
     price   style speed    size condition  buoyancy
1        5    food  slow   large       new     False
69       5  pirate  slow   large       new     False
145      5    food  slow  medium       new     False


In [8]:
import pickle

with open('knn_model.pkl', 'wb') as f:
    pickle.dump(knn, f)

print(f"Model saved to pickle")

Model saved to pickle


In [9]:
df_encoded.to_parquet('df_encoded.parquet', index=False)
