In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from geopy.distance import geodesic
import numpy as np
import pandas as pd

# World-cities table: one row per city with its country, population ('pop'),
# latitude/longitude ('lat'/'long') and a capital flag.
CITIES_CSV_URL = 'https://raw.githubusercontent.com/gsdavis1959/Data/master/world-cities-lat-long.csv'

df_uploaded = pd.read_csv(CITIES_CSV_URL)

# Preview the first rows to confirm the columns loaded as expected.
df_uploaded.head()


Unnamed: 0,name,country.etc,pop,lat,long,capital
0,'Abasan al-Jadidah,Palestine,5629,31.31,34.34,0
1,'Abasan al-Kabirah,Palestine,18999,31.32,34.35,0
2,'Abdul Hakim,Pakistan,47788,30.55,72.11,0
3,'Abdullah-as-Salam,Kuwait,21817,29.36,47.98,0
4,'Abud,Palestine,2456,32.03,35.07,0


In [None]:
# Helper: point-to-point distance in miles between two lat/long pairs.
def latlong_to_miles(lat1, lon1, lat2, lon2):
    """Return the distance in miles between two latitude/longitude points.

    The distance is computed with geopy's ``geodesic`` and read back through
    its ``.miles`` attribute.

    Args:
        lat1: Latitude of the first point.
        lon1: Longitude of the first point.
        lat2: Latitude of the second point.
        lon2: Longitude of the second point.

    Returns:
        The distance between the two points, in miles.
    """
    # geodesic() takes each point as a (latitude, longitude) pair.
    start_point = (lat1, lon1)
    end_point = (lat2, lon2)
    return geodesic(start_point, end_point).miles

# Approximate radius of Earth in miles.
# Dividing a distance in miles by this value gives the corresponding angle in
# radians, which is the unit the haversine-based DBSCAN `eps` is expressed in.
EARTH_RADIUS_MILES = 3958.8

In [None]:
# The haversine metric works on [lat, long] pairs expressed in radians,
# so convert the coordinate columns before clustering.
coords_radians = np.radians(df_uploaded[['lat', 'long']].to_numpy())

# With haversine, eps is an angle in radians rather than a distance:
# 50 miles divided by the Earth's radius in miles gives that angle.
eps_in_radians = 50 / EARTH_RADIUS_MILES

# min_samples=1 makes every city a core point, so no row is labelled as noise
# and each cluster is a chain of cities linked by hops of at most eps.
dbscan = DBSCAN(metric='haversine', eps=eps_in_radians, min_samples=1)
df_uploaded['cluster'] = dbscan.fit(coords_radians).labels_

# Row index of the most populous city inside each cluster
# (a Series indexed by cluster label whose values are row labels of df_uploaded).
cluster_labels = df_uploaded['pop'].groupby(df_uploaded['cluster']).idxmax()

# Pull those rows out, keeping only the columns needed for the summary.
summary_columns = ['cluster', 'name', 'country.etc', 'pop']
largest_cities = df_uploaded.loc[cluster_labels, summary_columns]

# Order by cluster label and renumber the rows 0..N-1 for readability.
largest_cities_sorted = largest_cities.sort_values(by='cluster', ignore_index=True)

# Show a preview together with the (rows, columns) shape.
(largest_cities_sorted.head(), largest_cities_sorted.shape)


(   cluster      name   country.etc       pop
 0        0    Moscow        Russia  10472629
 1        1   Karachi      Pakistan  11969284
 2        2  Hargeysa       Somalia    500710
 3        3    'Adale       Somalia      5492
 4        4     'Afif  Saudi Arabia     41731,
 (1850, 4))

In [None]:
# prompt: save largest_cities_sorted as csv

# Write the per-cluster largest-city table to the working directory.
# index=False: the frame's index is just a 0..N-1 row counter (it was reset with
# drop=True when the table was sorted), so writing it would only add an unnamed
# leading column that shows up as 'Unnamed: 0' when the file is read back.
largest_cities_sorted.to_csv('largest_cities_sorted.csv', index=False)


cluster
0    34722
Name: pop, dtype: int64