In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load dataset
data = pd.read_csv('broadband_with_no_business_deals_for_2024-12-01_len_240.csv')
#mobile users, recommendation for the customers, good deal: 
# Benchmark section: same benefit with lower price, lower price and more benefit, higher price and more benefit, same price and more benefits (3 deals more)
# recommendation through all of the network, customer-based, driven by the price and benefits 
data.head()

Unnamed: 0,lp_deal_id,deal_id,name,download_speed,monthly_price,new_line_price,total_first_year_cost,full_contract_cost,min_contract_length,limit,supplier_id,supplier_name
0,1934770059986577994,30459,Be150,150.0,30.0,0,288.0,432.0,18,99999,255,BeFibre
1,4102140181543798600,33530,Be150 (12m),150.0,30.0,0,288.0,288.0,12,99999,255,BeFibre
2,9840303560243366198,30460,Be500,500.0,35.0,0,336.0,504.0,18,99999,255,BeFibre
3,4410473646066635655,30461,Be900,900.0,32.0,0,336.0,528.0,18,99999,255,BeFibre
4,4163365288842897212,33531,Be500 (12m),500.0,35.0,0,336.0,336.0,12,99999,255,BeFibre


In [3]:
def preprocess_data(data):

    data = data.fillna({'download_speed': 0, 'monthly_price': 0, 'min_contract_length': 0})
    data['download_speed'] = data['download_speed'].astype(float)
    data['monthly_price'] = data['monthly_price'].astype(float)
    data['min_contract_length'] = data['min_contract_length'].astype(int)

    # Derived Features
    data['speed_per_dollar'] = data['download_speed'] / data['monthly_price']
    data['speed_per_month'] = data['download_speed'] / data['min_contract_length']
    data['value_score'] = data['speed_per_dollar'] * data['speed_per_month']
    return data

In [4]:
data = preprocess_data(data)
data.head()

Unnamed: 0,lp_deal_id,deal_id,name,download_speed,monthly_price,new_line_price,total_first_year_cost,full_contract_cost,min_contract_length,limit,supplier_id,supplier_name,speed_per_dollar,speed_per_month,value_score
0,1934770059986577994,30459,Be150,150.0,30.0,0,288.0,432.0,18,99999,255,BeFibre,5.0,8.333333,41.666667
1,4102140181543798600,33530,Be150 (12m),150.0,30.0,0,288.0,288.0,12,99999,255,BeFibre,5.0,12.5,62.5
2,9840303560243366198,30460,Be500,500.0,35.0,0,336.0,504.0,18,99999,255,BeFibre,14.285714,27.777778,396.825397
3,4410473646066635655,30461,Be900,900.0,32.0,0,336.0,528.0,18,99999,255,BeFibre,28.125,50.0,1406.25
4,4163365288842897212,33531,Be500 (12m),500.0,35.0,0,336.0,336.0,12,99999,255,BeFibre,14.285714,41.666667,595.238095


In [5]:
def prepare_knn_model(data, features, k):
    X = data[features].values
    knn = NearestNeighbors(n_neighbors=k, metric='euclidean')
    knn.fit(X)
    return knn, X
    
def find_better_deals_by_id(data, knn, X, deal_id, features, k=3):
    """
    Find better deals for a given deal ID.
    """
    # Get the query point based on the deal ID
    query_row = data[data['deal_id'] == deal_id]
    if query_row.empty:
        print(f"Deal ID {deal_id} not found in the dataset.")
        return None

    query_point = query_row[features].values.reshape(1, -1)

    # Find the nearest neighbors
    distances, indices = knn.kneighbors(query_point)
    nearest_deals = data.iloc[indices[0]].copy()
    nearest_deals['distance'] = distances[0]

    # Exclude the deal itself and select better deals
    nearest_deals = nearest_deals[nearest_deals['deal_id'] != deal_id]

    # Apply better deal conditions
    nearest_deals['is_better'] = (
        (nearest_deals['monthly_price'] < query_row['monthly_price'].values[0]) &
        (nearest_deals['download_speed'] >= query_row['download_speed'].values[0]) |
        (nearest_deals['monthly_price'] <= query_row['monthly_price'].values[0]) &
        (nearest_deals['download_speed'] > query_row['download_speed'].values[0]) |
        (nearest_deals['monthly_price'] > query_row['monthly_price'].values[0]) &
        (nearest_deals['download_speed'] > query_row['download_speed'].values[0]) |
        (nearest_deals['monthly_price'] == query_row['monthly_price'].values[0]) &
        (nearest_deals['download_speed'] > query_row['download_speed'].values[0])
    )

    # Filter to show only better deals and limit to top k
    better_deals = nearest_deals[nearest_deals['is_better']].sort_values(by=['distance']).head(k)
    return better_deals

In [6]:
# Section 4: Main Pipeline
def main_knn_pipeline(file_path, deal_id, features, k=3):
    data = preprocess_data(load_data(file_path))
    knn, X = prepare_knn_model(data, features, k=k + 1)  # +1 to account for excluding the deal itself
    better_deals = find_better_deals_by_id(data, knn, X, deal_id, features, k)

    if better_deals is not None:
        print(f"Better deals for Deal ID {deal_id}:")
        print(better_deals[['deal_id', 'name', 'download_speed', 'monthly_price', 'distance', 'is_better']])
    return better_deals

In [7]:
features = ['download_speed', 'monthly_price', 'value_score']
knn, X = prepare_knn_model(data, features, k=5)
better_deals = find_better_deals_by_id(data, knn, X, deal_id=30459, features=features, k=3)
if better_deals is not None:
    print(better_deals)

             lp_deal_id  deal_id                      name  download_speed  \
73  7231569570318685899    28922  Full Fibre 150 CityFibre           152.0   
72  2708783181533761909    28906            Full Fibre 150           152.0   
69   825483088786176995    35226             Lit 100 - 24m           150.0   

    monthly_price  new_line_price  total_first_year_cost  full_contract_cost  \
73           29.0              60                  348.0               576.0   
72           33.0              60                  348.0               576.0   
69           28.0               0                  336.0               672.0   

    min_contract_length  limit  supplier_id supplier_name  speed_per_dollar  \
73                   18  99999          148      TalkTalk          5.241379   
72                   18  99999          148      TalkTalk          4.606061   
69                   24  99999          288     Lit Fibre          5.357143   

    speed_per_month  value_score  distance  is_be

In [8]:
# Section 4: Main Pipeline
def main_knn_pipeline(data, deal_id, features, k=3):
    """
    Main pipeline to find better deals for a given deal ID.
    """
    # Preprocess the data
    data = preprocess_data(data)
    
    # Prepare the KNN model
    knn, X = prepare_knn_model(data, features, k=k + 1)  # +1 to account for excluding the deal itself
    
    # Find better deals for the specified deal ID
    better_deals = find_better_deals_by_id(data, knn, X, deal_id, features, k)

    # Print and return the better deals
    if better_deals is not None:
        print(f"Better deals for Deal ID {deal_id}:")
        print(better_deals[['deal_id', 'name', 'download_speed', 'monthly_price', 'distance', 'is_better']])
    return better_deals

In [28]:
if __name__ == "__main__":
    features = ['download_speed', 'monthly_price', 'value_score']
    data = preprocess_data(data)

    deal_id = 30459  # Replace with the deal ID you want to check

    # Run the pipeline to find better deals
    better_deals = main_knn_pipeline(data, deal_id, features, k=3)

    # Print the results
    if better_deals is not None:
        print("Top 3 better deals:")
        print(better_deals[['deal_id', 'name', 'download_speed', 'monthly_price', 'distance', 'is_better']])


Better deals for Deal ID 30459:
    deal_id                      name  download_speed  monthly_price  \
73    28922  Full Fibre 150 CityFibre           152.0           29.0   
72    28906            Full Fibre 150           152.0           33.0   

    distance  is_better  
73  3.424640       True  
72  4.547382       True  
Top 3 better deals:
    deal_id                      name  download_speed  monthly_price  \
73    28922  Full Fibre 150 CityFibre           152.0           29.0   
72    28906            Full Fibre 150           152.0           33.0   

    distance  is_better  
73  3.424640       True  
72  4.547382       True  
