In [21]:
sample_percentage = 1.00  # % of the data
beta = 1.1  # Distance decay parameter
customer_value = 300  # Household dental spending per year


In [22]:
import pandas as pd
import pickle


In [None]:
import math

def ratings_to_size(rating):
    return (math.sqrt(rating) + 20) / 10

In [23]:

# Load the property coordinates from the Parquet file
property_data = pd.read_parquet('data/nz_property_lat_lon.parquet')
property_data = property_data.drop(columns=['WKT'])

# Load the deduplicated dentists
with open('data/deduplicated_dentists.pkl', 'rb') as f:
    deduplicated_dentists = pickle.load(f)

# Convert deduplicated_dentists to DataFrame for easier manipulation
dentists_df = pd.DataFrame(deduplicated_dentists)
dentists_df['size'] = dentists_df['reviews'].apply(ratings_to_size)

# Display a few rows to verify
display(property_data.head())
display(dentists_df.head())


                                          WKT   longitude   latitude
0  POINT (172.682427990989 -43.5691006197534)  172.682428 -43.569101
1   POINT (174.76605808985 -36.8505358230548)  174.766058 -36.850536
2  POINT (174.766289274903 -36.8480077210557)  174.766289 -36.848008
3   POINT (174.77064284915 -36.8474174525228)  174.770643 -36.847417
4  POINT (174.753390890909 -36.8601424547084)  174.753391 -36.860142
                      place_id                    name        lat         lon  \
0  ChIJn2iTgIGqEm0R3tcUFgH1VDY             Mint Dental -37.191588  174.903652   
1  ChIJZ1N2dWKqEm0RZV_x3mljNxU  Pukekohe Orthodontists -37.074346  174.922603   
2  ChIJPx17MdWscm0RaYxnKNKfAcg   Vanessa Wright Dental -37.062908  174.940487   
3  ChIJiQWVRdSscm0RgB5b7NnhmNM         The Denture Man -37.064653  174.943538   
4  ChIJz13Dp9Wscm0Ru9TUvS4DslY   Dental World Papakura -37.063301  174.943527   

   reviews  
0       39  
1        3  
2       11  
3        9  
4        7  


In [24]:
sample_size = int(len(property_data) * sample_percentage)  # Sample 10% of the rows

# Define property_for_huff which can be set to either the full data or a sample
def set_property_for_huff(full_data, sample_size=None):
    if sample_size:
        return full_data.sample(n=sample_size, random_state=42)  # Create a random sample
    return full_data  # Use the full dataset if no sample_size is provided

# Example usage
# For full data (use this in the main run)
property_for_huff = set_property_for_huff(property_data,sample_size=100000)

# For testing with a random sample of, say, 1000 rows (use this for faster testing)
# property_for_huff = set_property_for_huff(property_data, sample_size=1000)

# Display the first few rows to verify
print(property_for_huff.head())


                                           WKT   longitude   latitude
1440305     POINT (173.9377888167 -41.5075379)  173.937789 -41.507538
1921502  POINT (174.3626960667 -35.7519257167)  174.362696 -35.751926
2140332  POINT (175.0096198333 -37.2376353167)  175.009620 -37.237635
246221      POINT (172.5985556167 -43.4973139)  172.598556 -43.497314
1196567  POINT (175.6296664333 -40.3469563333)  175.629666 -40.346956


In [25]:
from math import sqrt

# Ensure 'reviews' column exists and calculate the size (proxy) as sqrt of reviews
dentists_df['size'] = dentists_df['reviews'].apply(lambda x: sqrt(x)) + 20

# Display to check
print(dentists_df[['name', 'reviews', 'size']].head())


                     name  reviews       size
0             Mint Dental       39  26.244998
1  Pukekohe Orthodontists        3  21.732051
2   Vanessa Wright Dental       11  23.316625
3         The Denture Man        9  23.000000
4   Dental World Papakura        7  22.645751


In [26]:
import numpy as np

# Function to calculate distance between two points (Haversine formula)
def haversine(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return c * r  # Distance in kilometers

# Test the function
print(haversine(-36.8485, 174.7633, -36.8484, 174.7634))  # Test distance in Auckland


0.014241458474257587


In [None]:
from tqdm import tqdm


# Convert property data and dentists data to NumPy arrays for fast operations
property_lats = property_for_huff['latitude'].values
property_lons = property_for_huff['longitude'].values

dentist_lats = dentists_df['lat'].values
dentist_lons = dentists_df['lon'].values
dentist_sizes = dentists_df['size'].values

# Initialize an array to store the total estimated revenue for each dentist
revenue_per_practice = np.zeros(len(dentist_lats))

# Vectorized calculation of distances from all properties to all dentists
for i in tqdm(range(len(property_lats)), desc="Processing Properties", total=len(property_lats)):
    # Compute distances from the current property to all dentists at once
    distances = haversine(property_lats[i], property_lons[i], dentist_lats, dentist_lons)

    # Compute size over distance^beta (for all dentists)
    size_over_distance = dentist_sizes / distances**beta
    
    # Compute the probabilities (for all dentists)
    probabilities = size_over_distance / size_over_distance.sum()

    # Distribute customer value to each dental practice based on the probabilities
    revenue_per_practice += probabilities * customer_value



In [19]:
# Scale the revenue by the sampling ratio (to account for partial data)
revenue_per_practice /= sample_percentage

# Add the revenue estimate to the dentists DataFrame
dentists_df['estimated_revenue'] = revenue_per_practice



In [20]:
from IPython.display import display
pd.set_option('display.max_rows', None)

# Sort the DataFrame by 'estimated_revenue' first (while it's still numeric)
sorted_dentists_df = dentists_df.sort_values(by='estimated_revenue', ascending=False)

# Truncate the name at 30 characters and create a new column for the formatted revenue
sorted_dentists_df['name_truncated'] = sorted_dentists_df['name'].apply(lambda x: x[:30])
sorted_dentists_df['formatted_revenue'] = sorted_dentists_df['estimated_revenue'].map(lambda x: f"${x:,.0f}")

# Select relevant columns for display
formatted_output = sorted_dentists_df[['name_truncated', 'formatted_revenue']]

# Display the formatted DataFrame
display(formatted_output)


Unnamed: 0,name_truncated,formatted_revenue
161,Smile Dental - Queen Street -,"$325,189"
35,Maxcare Dental - Otahuhu,"$286,087"
72,Donna Lim Orthodontics,"$235,568"
12,Smile Dental - Manukau,"$228,518"
0,Mint Dental,"$225,835"
69,Stoddard Dental Square ! Emerg,"$218,777"
134,Pt Chevalier Family Dentist,"$214,568"
41,DentalCare West,"$211,217"
45,Blockhouse Bay Dental Centre,"$208,300"
129,CM Dental Ltd.,"$207,944"


In [11]:
dentists_df.head()

Unnamed: 0,place_id,name,lat,lon,reviews,size,estimated_revenue
0,ChIJn2iTgIGqEm0R3tcUFgH1VDY,Mint Dental,-37.191588,174.903652,39,16.244998,2258345.0
1,ChIJZ1N2dWKqEm0RZV_x3mljNxU,Pukekohe Orthodontists,-37.074346,174.922603,3,11.732051,1181371.0
2,ChIJPx17MdWscm0RaYxnKNKfAcg,Vanessa Wright Dental,-37.062908,174.940487,11,13.316625,1391686.0
3,ChIJiQWVRdSscm0RgB5b7NnhmNM,The Denture Man,-37.064653,174.943538,9,13.0,1387053.0
4,ChIJz13Dp9Wscm0Ru9TUvS4DslY,Dental World Papakura,-37.063301,174.943527,7,12.645751,1330511.0
