# Nearest Neighbour Search

# In [3]:
import pandas as pd
import numpy as np
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# Load the raw sales rows and the per-item parameter table.
df_sales = pd.read_excel("../DATA CURRENT/SALES.xlsx")
df_parameters = pd.read_excel("../DATA CURRENT/L1-PARAMETERS.xlsx")

# The master table starts as a copy of the parameter table; the sales
# statistics computed below are attached to it as extra columns.
df = df_parameters.copy()

# Sales columns to summarise, and the statistics computed for each of them
# (pandas aggregation name -> suffix used in the output column name).
columns_to_calculate = ['Price', 'Quantity']
stat_labels = {'min': 'Min', 'max': 'Max', 'mean': 'Avg'}

# Value of the sales 'Status' column -> prefix used in the output column names.
status_prefixes = {'QUOTATO': 'Quoted', 'ORDINATO': 'Ordered'}

for status, prefix in status_prefixes.items():
    # Aggregate all sales rows of this status in a single grouped pass
    # instead of re-filtering df_sales once per item code.
    stats_by_item = (
        df_sales[df_sales['Status'] == status]
        .groupby('Item Code')[columns_to_calculate]
        .agg(list(stat_labels))
    )
    for column in columns_to_calculate:
        for stat, label in stat_labels.items():
            # Item codes without any sales row of this status are missing
            # from the grouped index, so map() leaves NaN for them.
            df[f'{prefix} {column} {label}'] = df['Item Code'].map(
                stats_by_item[(column, stat)]
            )

# Mean price over every sales row of the item, whatever its status
# (NaN prices are skipped by mean(); items without sales get NaN).
df['Average Price'] = df['Item Code'].map(
    df_sales.groupby('Item Code')['Price'].mean()
)

# Save the resulting df to an Excel file
df.to_excel('L1 - MASTER_DATA.xlsx', index=False)

# Product parameters used as model inputs, the regression target, and the
# columns reported back to the user for matching products.
parameters = ['Layout', 'Sensing Element', 'Case Material', 'Cable Material', 'Cable Length', 'Terminal']
target_price = 'Average Price'
target_columns = [
    'Item Code', 'Quoted Price Min', 'Quoted Price Max', 'Quoted Price Avg',
    'Quoted Quantity Min', 'Quoted Quantity Max', 'Quoted Quantity Avg',
    'Ordered Price Min', 'Ordered Price Max', 'Ordered Price Avg',
    'Ordered Quantity Min', 'Ordered Quantity Max', 'Ordered Quantity Avg'
]

# Scale the single numeric parameter and one-hot encode the categorical ones.
# The transformer output order is: 'Cable Length', then the one-hot columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Cable Length']),
        ('cat', OneHotEncoder(), ['Layout', 'Sensing Element', 'Case Material', 'Cable Material', 'Terminal'])
    ]
)

# Generate pairwise interaction terms on top of the preprocessed features.
poly = PolynomialFeatures(interaction_only=True, include_bias=False)

# Pipeline grouping preprocessing, polynomial features, and the regressor.
# Its steps are the very objects defined above, so fitting them individually
# below leaves every step of the pipeline fitted.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', poly),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

X = df[parameters]
y_price = df[target_price]

# Fit the feature transformers on every product so the nearest-neighbour
# search below covers the full catalogue.
X_preprocessed = preprocessor.fit_transform(X)
# ColumnTransformer returns a sparse matrix only when the stacked output is
# sparse enough (see its sparse_threshold); otherwise it is already dense and
# has no .toarray() method.
if hasattr(X_preprocessed, 'toarray'):
    X_preprocessed = X_preprocessed.toarray()
X_poly = poly.fit_transform(X_preprocessed)

# Train the model for average_price. The regressor rejects NaN targets, so it
# is trained only on rows whose target is known.
has_price = y_price.notna()
regressor = pipeline.named_steps['regressor']
regressor.fit(X_poly[has_price.to_numpy()], y_price[has_price])
feature_importance_price = regressor.feature_importances_

# Get the feature names after preprocessing and polynomial features
one_hot_features = preprocessor.named_transformers_['cat'].get_feature_names_out(
    ['Layout', 'Sensing Element', 'Case Material', 'Cable Material', 'Terminal'])
feature_names = np.append(['Cable Length'], one_hot_features)
poly_features = poly.get_feature_names_out(feature_names)

# Create a DataFrame for feature importance
feature_importance_df = pd.DataFrame({
    'Feature': poly_features,
    'Importance': feature_importance_price
})

# Normalize the importance to get weights
feature_importance_df['Weight'] = feature_importance_df['Importance'] / feature_importance_df['Importance'].sum()

# Save as excel because too long
feature_importance_df.to_excel('L1_feature_importance.xlsx', index=False)

# Use the weights for the nearest neighbor model
weights = feature_importance_df.set_index('Feature')['Weight'].to_dict()

# Per-column weight vector, aligned with the columns of X_poly.
weighted_features = np.array([weights[feature] for feature in poly_features])
# Scaling each column by its weight makes distances in the neighbour search
# depend more on the features the regressor found important.
X_preprocessed_weighted = X_poly * weighted_features

# Fit the NearestNeighbors model
nbrs = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(X_preprocessed_weighted)

# Function to get user input
def get_user_input(df):
    """Prompt on stdin for one value per entry of the module-level ``parameters``.

    'Cable Length' is read as a float; the prompt is repeated until the
    entry parses as a number. For every other parameter the distinct values
    found in ``df[param]`` are printed as a hint and the entered text is
    stored unchanged (it is not validated against those options).

    Args:
        df: DataFrame holding one column per name in ``parameters``.

    Returns:
        dict mapping each parameter name to the entered value (float for
        'Cable Length', str otherwise).
    """
    user_input = {}
    for param in parameters:
        if param == 'Cable Length':
            while True:
                raw_value = input(f"Enter {param}: ")
                try:
                    user_input[param] = float(raw_value)
                except ValueError:
                    print(f"{param} must be a number, please try again.")
                else:
                    break
        else:
            unique_values = df[param].unique()
            # str() each option: the column may hold non-string values
            # (e.g. NaN or numbers), which str.join() would reject.
            options = ', '.join(str(value) for value in unique_values)
            print(f"Select {param} from the following options: {options}")
            user_input[param] = input(f"Enter {param}: ")
    return user_input

# Ask the user for the parameters of the product to look up.
user_input = get_user_input(df)

# Single-row DataFrame so the query can go through the fitted transformers.
input_df = pd.DataFrame([user_input])

# Check if the identical product exists in the dataset: a row matches only
# when every parameter equals the value entered by the user.
identical_product = df[
    (df['Layout'] == user_input['Layout']) &
    (df['Sensing Element'] == user_input['Sensing Element']) &
    (df['Case Material'] == user_input['Case Material']) &
    (df['Cable Material'] == user_input['Cable Material']) &
    (df['Cable Length'] == user_input['Cable Length']) &
    (df['Terminal'] == user_input['Terminal'])
]

if not identical_product.empty:
    print("Identical product found:")
    print(identical_product[target_columns])
else:
    # Push the query through the same preprocessing, interaction terms and
    # feature weights that were used to build the neighbour index.
    input_preprocessed = preprocessor.transform(input_df)
    # The transformer output is sparse only when the fitted output was sparse
    # enough; a dense ndarray has no .toarray() method.
    if hasattr(input_preprocessed, 'toarray'):
        input_preprocessed = input_preprocessed.toarray()
    input_poly = poly.transform(input_preprocessed)
    input_preprocessed_weighted = input_poly * weighted_features

    # Find the nearest neighbors
    distances, indices = nbrs.kneighbors(input_preprocessed_weighted)

    # Similarity is relative to the farthest returned neighbour: the closest
    # possible match scores 1 and the farthest neighbour scores 0.
    max_distance = np.max(distances)
    if max_distance > 0:
        similarity_scores = 1 - (distances / max_distance)
    else:
        # Every neighbour coincides with the query in the weighted feature
        # space; avoid 0/0 and report them all as fully similar.
        similarity_scores = np.ones_like(distances)

    # indices are row positions in the matrix the index was fitted on, which
    # was built from df row by row, hence iloc.
    closest_products = df.iloc[indices[0]][target_columns].copy()
    closest_products['Similarity Score'] = similarity_scores[0]

    # Display the results
    print("Closest 5 products with similarity scores:")
    print(closest_products)