In [27]:
sample_percentage = 1.00  # Fraction of the property rows to use (1.00 = 100% of the data)
beta = 1.2  # Distance decay parameter (exponent applied to distance in the Huff model)
customer_value = 150  # Household dental spending per year


In [28]:
import pandas as pd
import pickle


In [29]:
import math

def ratings_to_size(rating):
    """Map a practice's review count to the "size" value used downstream.

    The square root dampens the influence of large counts, and the +20 offset
    keeps the result in a narrow band that starts at 2.0 for a count of zero.

    Args:
        rating: Non-negative number (the notebook passes the review count).

    Returns:
        float: ``(sqrt(rating) + 20) / 10``.
    """
    dampened = math.sqrt(rating)
    return (dampened + 20) / 10

In [30]:

# Property coordinates come from a Parquet file; the WKT column is discarded on load
property_data = pd.read_parquet('data/nz_property_lat_lon.parquet').drop(columns=['WKT'])

# The deduplicated dentists were saved as a pickle
with open('data/deduplicated_dentists.pkl', 'rb') as f:
    deduplicated_dentists = pickle.load(f)

# Wrap the dentists in a DataFrame and derive each practice's size from its review count
dentists_df = pd.DataFrame(deduplicated_dentists)
dentists_df['size'] = dentists_df['reviews'].map(ratings_to_size)

# Preview both tables
display(property_data.head(), dentists_df.head())


Unnamed: 0,longitude,latitude
0,172.682428,-43.569101
1,174.766058,-36.850536
2,174.766289,-36.848008
3,174.770643,-36.847417
4,174.753391,-36.860142


Unnamed: 0,place_id,name,lat,lon,reviews,size
0,ChIJn2iTgIGqEm0R3tcUFgH1VDY,Mint Dental,-37.191588,174.903652,39,2.6245
1,ChIJZ1N2dWKqEm0RZV_x3mljNxU,Pukekohe Orthodontists,-37.074346,174.922603,3,2.173205
2,ChIJZ6Hm7Aqtcm0R7hsARw807lE,Papakura Dental,-37.063744,174.942566,7,2.264575
3,ChIJiQWVRdSscm0RgB5b7NnhmNM,The Denture Man,-37.064653,174.943538,9,2.3
4,ChIJz13Dp9Wscm0Ru9TUvS4DslY,Dental World Papakura,-37.063301,174.943527,7,2.264575


In [31]:
# Number of property rows to keep: the sample_percentage fraction of all rows, truncated to an int
sample_size = int(len(property_data) * sample_percentage)  # Sample % of the rows

# Define property_for_huff which can be set to either the full data or a sample
def set_property_for_huff(full_data, sample_size=None):
    """Return the property rows to feed into the Huff model.

    Args:
        full_data: DataFrame of all properties.
        sample_size: Number of rows to draw at random. ``None`` (the default)
            means no sampling.

    Returns:
        ``full_data`` itself when ``sample_size`` is None; otherwise a
        reproducible random sample (``random_state=42``) of ``sample_size`` rows.
    """
    # Compare against None explicitly: a computed sample_size of 0 (e.g. a tiny
    # sampling fraction truncated by int()) must yield an empty sample rather
    # than silently falling back to the full dataset.
    if sample_size is None:
        return full_data
    return full_data.sample(n=sample_size, random_state=42)

# Choose the rows used by the Huff model.
# Main run: sample_size is derived from sample_percentage, so set that to 1.00 to cover every row
property_for_huff = set_property_for_huff(property_data, sample_size)

# Faster testing: use a fixed random sample of, say, 1000 rows instead
# property_for_huff = set_property_for_huff(property_data, sample_size=1000)

# Display the first few rows to verify
display(property_for_huff.head())


Unnamed: 0,longitude,latitude
1440305,173.937789,-41.507538
1921502,174.362696,-35.751926
2140332,175.00962,-37.237635
246221,172.598556,-43.497314
1196567,175.629666,-40.346956


In [None]:
import numpy as np

# Function to calculate distance between two points (Haversine formula)
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points, using the haversine formula.

    All four arguments are in degrees. Scalars and NumPy arrays can be mixed;
    the usual NumPy broadcasting applies.

    Returns:
        Distance in kilometers, on a sphere of radius 6371 km.
    """
    earth_radius_km = 6371

    # Work in radians from here on
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine of the central angle between the two points
    hav = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))

    return central_angle * earth_radius_km

# TODO: add a sanity check for haversine (e.g. identical points should give 0 km)


In [21]:
from tqdm import tqdm


# Convert property data and dentists data to NumPy arrays for fast operations
property_lats = property_for_huff['latitude'].values
property_lons = property_for_huff['longitude'].values

dentist_lats = dentists_df['lat'].values
dentist_lons = dentists_df['lon'].values
dentist_sizes = dentists_df['size'].values

# Floor applied to every distance (in km, i.e. 1 metre). A property that sits exactly on a
# practice has distance 0, which would make size / distance**beta infinite and the
# normalised probability NaN; that NaN would then stick to the practice's revenue total.
min_distance_km = 0.001

# Initialize an array to store the total estimated revenue for each dentist
revenue_per_practice = np.zeros(len(dentist_lats))

# Loop over properties; for each one, the work across all dentists is vectorized
for i in tqdm(range(len(property_lats)), desc="Processing Properties", total=len(property_lats)):
    # Distances (km) from the current property to all dentists at once, floored at the minimum
    distances = np.maximum(
        haversine(property_lats[i], property_lons[i], dentist_lats, dentist_lons),
        min_distance_km,
    )

    # Compute size over distance^beta (for all dentists)
    size_over_distance = dentist_sizes / distances**beta

    # Normalise so the probabilities for this property sum to 1
    probabilities = size_over_distance / size_over_distance.sum()

    # Distribute customer value to each dental practice based on the probabilities
    revenue_per_practice += probabilities * customer_value



Processing Properties: 100%|██████████| 2360762/2360762 [01:17<00:00, 30400.84it/s]


In [22]:
# Sanity check: number of property rows fed into the loop
property_lats.shape

(2360762,)

In [23]:
# Scale the revenue by the sampling ratio (to account for partial data).
# The scaled values go into a new array instead of dividing revenue_per_practice in place,
# so re-running this cell does not apply the scaling a second time.
scaled_revenue_per_practice = revenue_per_practice / sample_percentage

# Add the revenue estimate to the dentists DataFrame
dentists_df['estimated_revenue'] = scaled_revenue_per_practice



In [24]:
from IPython.display import display
pd.set_option('display.max_rows', None)

# Rank practices by revenue while the column is still numeric, then add two
# presentation columns: the name cut to 30 characters and the revenue as "$1,234"
sorted_dentists_df = (
    dentists_df
    .sort_values(by='estimated_revenue', ascending=False)
    .assign(
        name_truncated=lambda frame: frame['name'].apply(lambda x: x[:30]),
        formatted_revenue=lambda frame: frame['estimated_revenue'].map(lambda x: f"${x:,.0f}"),
    )
)

# Keep only the presentation columns and show them
formatted_output = sorted_dentists_df[['name_truncated', 'formatted_revenue']]
display(formatted_output)


Unnamed: 0,name_truncated,formatted_revenue
161,Smile Dental - Queen Street -,"$3,158,736"
35,Maxcare Dental - Otahuhu,"$2,757,974"
72,Donna Lim Orthodontics,"$2,445,740"
69,Stoddard Dental Square ! Emerg,"$2,295,505"
134,Pt Chevalier Family Dentist,"$2,252,393"
12,Smile Dental - Manukau,"$2,233,162"
142,Dental Artistry - Auckland Den,"$2,204,358"
154,Accent Dentists - Cosmetic Den,"$2,193,310"
183,Northmed Dental,"$2,188,944"
45,Blockhouse Bay Dental Centre,"$2,175,671"


In [25]:
# Show the dentists table, now including the estimated_revenue column
display(dentists_df)

Unnamed: 0,place_id,name,lat,lon,reviews,size,estimated_revenue
0,ChIJn2iTgIGqEm0R3tcUFgH1VDY,Mint Dental,-37.191588,174.903652,39,2.6245,1738140.0
1,ChIJZ1N2dWKqEm0RZV_x3mljNxU,Pukekohe Orthodontists,-37.074346,174.922603,3,2.173205,1323660.0
2,ChIJPx17MdWscm0RaYxnKNKfAcg,Vanessa Wright Dental,-37.062908,174.940487,11,2.331662,1490731.0
3,ChIJiQWVRdSscm0RgB5b7NnhmNM,The Denture Man,-37.064653,174.943538,9,2.3,1475807.0
4,ChIJz13Dp9Wscm0Ru9TUvS4DslY,Dental World Papakura,-37.063301,174.943527,7,2.264575,1449864.0


In [26]:
# Row and column count of the full property table
property_data.shape

(2360762, 2)

In [26]:
# Leftover scratch expression; nothing in the analysis uses its result
1+1