# Clustering Exploration 

Experimenting with clustering Galveston Indoor/Outdoor points to get routes for each day. 

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns; sns.set()
import csv

In [2]:
trucks = pd.read_csv('../data/truck_service.csv')
trucks.head()

Unnamed: 0,Name,Longitude,Latitude,Daily_Pickup_Totes,Weekly_Dropoff_Totes,user_ratings_total,category,rating,weekly_high,weekly_low,best_weekly_estimate,Address,geometry,Adoption,Adoption Guess,Visited,Influence,location_type,id,pickup_type
0,Moody Gardens,-94.8523,29.2736,0.0,0.0,,,,,,,,,,,,,depot,,
1,Sugar Bean,-94.870418,29.261029,1.0,1.0,476.0,cafe,4.7,200.0,50.0,200.0,"11 Evia Main, Galveston",POINT (-94.8704183 29.2610286),Curious,High,No,High,indoor,,
2,MAC-IES,-94.791385,29.298912,1.0,1.0,73.0,restaurant,5.0,1500.0,1050.0,1500.0,"1110 23rd Street, Galveston",POINT (-94.7913851 29.2989118),Curious,High,No,Low,indoor,,
3,Good Dough,-94.808531,29.290612,1.0,1.0,68.0,bakery,4.6,70.0,35.0,70.0,"1508 39th Street, Galveston",POINT (-94.808531 29.2906119),Curious,High,No,Med,indoor,,
4,Porch Cafe,-94.750192,29.319581,1.0,1.0,332.0,restaurant,4.1,300.0,,100.0,"1625 East Beach Drive #2A, Galveston",POINT (-94.7501916 29.3195806),Curious,High,No,Low,indoor,,


In [3]:
X = trucks[['Name', 'Longitude', 'Latitude']]

kmeans = KMeans(n_clusters = 5, init ='k-means++')
kmeans.fit(X[X.columns[1:3]]) # Compute k-means clustering.
X['cluster_number'] = kmeans.fit_predict(X[X.columns[1:3]])
centers = kmeans.cluster_centers_ # Coordinates of cluster centers.
labels = kmeans.predict(X[X.columns[1:3]]) # Labels of each point
X.head(10)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['cluster_number'] = kmeans.fit_predict(X[X.columns[1:3]])


Unnamed: 0,Name,Longitude,Latitude,cluster_number
0,Moody Gardens,-94.8523,29.2736,2
1,Sugar Bean,-94.870418,29.261029,2
2,MAC-IES,-94.791385,29.298912,0
3,Good Dough,-94.808531,29.290612,3
4,Porch Cafe,-94.750192,29.319581,0
5,Coastal Grill,-94.789455,29.308158,0
6,Texas A&M University at Galveston,-94.818287,29.315497,0
7,Rudy & Paco Restaurant and Bar,-94.791081,29.305336,0
8,Vargas Cut and Catch (listed above),-94.791385,29.305227,0
9,MarMo Cafe & Lounge,-94.792103,29.3056,0


In [4]:
X.cluster_number.unique()

array([2, 0, 3, 1, 4], dtype=int32)

In [5]:
X['cluster_number'].value_counts()

0    119
3     75
2     35
1      9
4      5
Name: cluster_number, dtype: int64

In [6]:
def redistribute_clusters(X, kmeans, max_iterations=100):
    for _ in range(max_iterations):
        cluster_counts = X['cluster_number'].value_counts().sort_index()
        min_cluster_size = cluster_counts.min()
        max_cluster_size = cluster_counts.max()
        
        # Find the cluster with the maximum size and the one with the minimum size
        max_cluster = cluster_counts.idxmax()
        min_cluster = cluster_counts.idxmin()
        
        # Get the points from the maximum cluster
        max_cluster_points = X[X['cluster_number'] == max_cluster]
        
        # Find the point in the maximum cluster that is closest to the centroid of the minimum cluster
        min_cluster_center = kmeans.cluster_centers_[min_cluster]
        closest_point = max_cluster_points.apply(lambda row: np.linalg.norm(row[['Longitude', 'Latitude']] - min_cluster_center), axis=1).idxmin()
        
        # Reassign the closest point from the maximum cluster to the minimum cluster
        X.at[closest_point, 'cluster_number'] = min_cluster
        
    return X

clusteredPoints = redistribute_clusters(X, kmeans, max_iterations=100)


In [41]:
clusteredPoints['cluster_number'].value_counts()
clustered_data = trucks.merge(clusteredPoints, left_on='Name', right_on='Name')
clustered_data = clustered_data.drop(['Longitude_y', 'Latitude_y'], axis=1)

In [42]:
clustered_data

Unnamed: 0,Name,Longitude_x,Latitude_x,Daily_Pickup_Totes,Weekly_Dropoff_Totes,user_ratings_total,category,rating,weekly_high,weekly_low,...,Address,geometry,Adoption,Adoption Guess,Visited,Influence,location_type,id,pickup_type,cluster_number
0,Moody Gardens,-94.852300,29.273600,0.0,0.0,,,,,,...,,,,,,,depot,,,2
1,Sugar Bean,-94.870418,29.261029,1.0,1.0,476.0,cafe,4.7,200,50,...,"11 Evia Main, Galveston",POINT (-94.8704183 29.2610286),Curious,High,No,High,indoor,,,2
2,MAC-IES,-94.791385,29.298912,1.0,1.0,73.0,restaurant,5.0,1500,1050,...,"1110 23rd Street, Galveston",POINT (-94.7913851 29.2989118),Curious,High,No,Low,indoor,,,3
3,Good Dough,-94.808531,29.290612,1.0,1.0,68.0,bakery,4.6,70,35,...,"1508 39th Street, Galveston",POINT (-94.808531 29.2906119),Curious,High,No,Med,indoor,,,4
4,Porch Cafe,-94.750192,29.319581,1.0,1.0,332.0,restaurant,4.1,300,,...,"1625 East Beach Drive #2A, Galveston",POINT (-94.7501916 29.3195806),Curious,High,No,Low,indoor,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5350,,-94.956086,29.196348,1.0,0.0,,,,,,...,,,,,,,outdoor,3034.0,Truck,2
5351,,-94.956086,29.196348,1.0,0.0,,,,,,...,,,,,,,outdoor,3034.0,Truck,2
5352,,-94.956086,29.196348,1.0,0.0,,,,,,...,,,,,,,outdoor,3034.0,Truck,1
5353,,-94.956086,29.196348,1.0,0.0,,,,,,...,,,,,,,outdoor,3034.0,Truck,1


In [43]:
#clustered_data.to_csv ('../data/clustered_data.csv', index=None, header = True)

In [12]:
trucks['Weekly_Dropoff_Totes'].sum()/5
#each cluster needs to have around 40 totes to drop off

40.6

In [13]:
trucks['Daily_Pickup_Totes'].sum()/5
#each truck needs to have at least 62 totes they're picking up. 

62.2

In [21]:
# Number of bins is 5, one for each day 
num_bins = 5 

# Sort the DataFrame by the specific column in descending order
trucks = trucks.sort_values(by='Daily_Pickup_Totes', ascending=False)

# Calculate the desired sum for each bin
total_sum = trucks['Daily_Pickup_Totes'].sum()
desired_bin_sum = total_sum / num_bins

# Create bins and assign bin labels
bins = []
current_sum = 0
bin_label = 1

for index, row in trucks.iterrows():
    current_sum += row['Daily_Pickup_Totes']
    bins.append(bin_label)
    if current_sum >= desired_bin_sum:
        current_sum = 0
        bin_label += 1

# Add the 'bin' column to the DataFrame
trucks['bin'] = bins

# Sort the DataFrame back to its original order
trucks = trucks.sort_index()

In [18]:
trucks['bin'].value_counts()

4    63
3    63
5    60
2    39
1    18
Name: bin, dtype: int64

In [19]:
trucks.groupby('bin')['Daily_Pickup_Totes'].sum()

bin
1    63.0
2    63.0
3    63.0
4    63.0
5    59.0
Name: Daily_Pickup_Totes, dtype: float64

In [20]:
trucks.groupby('bin')['Weekly_Dropoff_Totes'].sum()

bin
1    45.0
2    37.0
3    16.0
4    46.0
5    59.0
Name: Weekly_Dropoff_Totes, dtype: float64

In [30]:
trucks.to_csv ('../data/pickup_totes_clusters.csv', index=None, header = True)

### Same process from above but for dropoff

In [22]:
# Number of bins is 5, one for each day 
num_bins = 5 

# Sort the DataFrame by the specific column in descending order
trucks2 = trucks.sort_values(by='Weekly_Dropoff_Totes', ascending=False)

# Calculate the desired sum for each bin
total_sum2 = trucks2['Weekly_Dropoff_Totes'].sum()
desired_bin_sum2 = total_sum2 / num_bins

# Create bins and assign bin labels
bins2 = []
current_sum2 = 0
bin_label = 1

for index, row in trucks2.iterrows():
    current_sum2 += row['Weekly_Dropoff_Totes']
    bins2.append(bin_label)
    if current_sum2 >= desired_bin_sum2:
        current_sum2 = 0
        bin_label += 1

# Add the 'bin' column to the DataFrame
trucks2['bin'] = bins2

# Sort the DataFrame back to its original order
trucks2 = trucks2.sort_index()

In [23]:
trucks2['bin'].value_counts()

5    112
4     41
3     41
2     37
1     12
Name: bin, dtype: int64

In [24]:
trucks2.groupby('bin')['Daily_Pickup_Totes'].sum()

bin
1     51.0
2     49.0
3     50.0
4     44.0
5    117.0
Name: Daily_Pickup_Totes, dtype: float64

In [26]:
trucks2.groupby('bin')['Weekly_Dropoff_Totes'].sum()

bin
1    41.0
2    41.0
3    41.0
4    41.0
5    39.0
Name: Weekly_Dropoff_Totes, dtype: float64

In [31]:
trucks2.to_csv ('../data/dropoff_totes_clusters.csv', index=None, header = True)

In [27]:
trucks

Unnamed: 0,Name,Longitude,Latitude,Daily_Pickup_Totes,Weekly_Dropoff_Totes,user_ratings_total,category,rating,weekly_high,weekly_low,...,Address,geometry,Adoption,Adoption Guess,Visited,Influence,location_type,id,pickup_type,bin
0,Moody Gardens,-94.852300,29.273600,0.0,0.0,,,,,,...,,,,,,,depot,,,5
1,Sugar Bean,-94.870418,29.261029,1.0,1.0,476.0,cafe,4.7,200,50,...,"11 Evia Main, Galveston",POINT (-94.8704183 29.2610286),Curious,High,No,High,indoor,,,5
2,MAC-IES,-94.791385,29.298912,1.0,1.0,73.0,restaurant,5.0,1500,1050,...,"1110 23rd Street, Galveston",POINT (-94.7913851 29.2989118),Curious,High,No,Low,indoor,,,4
3,Good Dough,-94.808531,29.290612,1.0,1.0,68.0,bakery,4.6,70,35,...,"1508 39th Street, Galveston",POINT (-94.808531 29.2906119),Curious,High,No,Med,indoor,,,4
4,Porch Cafe,-94.750192,29.319581,1.0,1.0,332.0,restaurant,4.1,300,,...,"1625 East Beach Drive #2A, Galveston",POINT (-94.7501916 29.3195806),Curious,High,No,Low,indoor,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,,-94.878118,29.238725,1.0,0.0,,,,,,...,,,,,,,outdoor,3030.0,Truck,3
239,,-94.896043,29.227723,1.0,0.0,,,,,,...,,,,,,,outdoor,3031.0,Truck,3
240,,-94.927823,29.210431,1.0,0.0,,,,,,...,,,,,,,outdoor,3032.0,Truck,3
241,,-94.954036,29.193525,1.0,0.0,,,,,,...,,,,,,,outdoor,3033.0,Truck,3


In [28]:
trucks2

Unnamed: 0,Name,Longitude,Latitude,Daily_Pickup_Totes,Weekly_Dropoff_Totes,user_ratings_total,category,rating,weekly_high,weekly_low,...,Address,geometry,Adoption,Adoption Guess,Visited,Influence,location_type,id,pickup_type,bin
0,Moody Gardens,-94.852300,29.273600,0.0,0.0,,,,,,...,,,,,,,depot,,,5
1,Sugar Bean,-94.870418,29.261029,1.0,1.0,476.0,cafe,4.7,200,50,...,"11 Evia Main, Galveston",POINT (-94.8704183 29.2610286),Curious,High,No,High,indoor,,,2
2,MAC-IES,-94.791385,29.298912,1.0,1.0,73.0,restaurant,5.0,1500,1050,...,"1110 23rd Street, Galveston",POINT (-94.7913851 29.2989118),Curious,High,No,Low,indoor,,,4
3,Good Dough,-94.808531,29.290612,1.0,1.0,68.0,bakery,4.6,70,35,...,"1508 39th Street, Galveston",POINT (-94.808531 29.2906119),Curious,High,No,Med,indoor,,,4
4,Porch Cafe,-94.750192,29.319581,1.0,1.0,332.0,restaurant,4.1,300,,...,"1625 East Beach Drive #2A, Galveston",POINT (-94.7501916 29.3195806),Curious,High,No,Low,indoor,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,,-94.878118,29.238725,1.0,0.0,,,,,,...,,,,,,,outdoor,3030.0,Truck,5
239,,-94.896043,29.227723,1.0,0.0,,,,,,...,,,,,,,outdoor,3031.0,Truck,5
240,,-94.927823,29.210431,1.0,0.0,,,,,,...,,,,,,,outdoor,3032.0,Truck,5
241,,-94.954036,29.193525,1.0,0.0,,,,,,...,,,,,,,outdoor,3033.0,Truck,5
