# Clean up the KML file

In [None]:
# Load the whole input KML document into memory as a single UTF-8 string.
with open("1.kml", encoding="utf-8") as kml_source:
    kml_content = kml_source.read()

In [None]:
# import the regular expression module
import re

# Collect the text of every <coordinates> element in the KML content.
# re.DOTALL lets "." match line breaks as well: some KML writers place the
# coordinate text on its own line(s) between the tags, and without the flag
# such elements would be skipped silently and keep their altitude.
coords = re.findall(
    r"<coordinates>(.*?)</coordinates>", kml_content, flags=re.DOTALL
)

In [None]:
# Reduce every <coordinates> element to "longitude,latitude".
#
# Each element is rewritten where it stands with a single re.sub pass.  A
# document-wide str.replace() of the coordinate text would also rewrite any
# other place where the same text happens to appear (a name, a description,
# or a longer coordinate string that contains it) and would rescan the whole
# document once per coordinate.
def _drop_altitude(match):
    """Return the matched <coordinates> element keeping only lon and lat.

    Only the first two comma-separated values of the element text are kept
    (surrounding whitespace removed); everything after them is dropped.
    NOTE(review): an element holding several coordinate tuples (e.g. a
    LineString) is therefore reduced to its first point — confirm the input
    only contains Point placemarks.
    """
    lon_lat = [value.strip() for value in match.group(1).split(",")[:2]]
    return f"<coordinates>{','.join(lon_lat)}</coordinates>"


# re.DOTALL so that coordinate text spread over several lines is matched too.
kml_content = re.sub(
    r"<coordinates>(.*?)</coordinates>",
    _drop_altitude,
    kml_content,
    flags=re.DOTALL,
)

In [None]:
# Persist the modified KML content to a separate output file (UTF-8).
with open("2.kml", mode="w", encoding="utf-8") as kml_out:
    kml_out.write(kml_content)

# Clustering by proximity with K-Means-Constrained

In [None]:
#!pip install k-means-constrained

Defaulting to user installation because normal site-packages is not writeable


In [None]:
# import the libraries
import numpy as np
from xml.etree import ElementTree as ET
from sklearn.cluster import KMeans
import pandas as pd
from pykml.factory import KML_ElementMaker as KML
from lxml import etree

# Namespace prefix (Clark notation) shared by every KML 2.2 element lookup.
KML_NS = "{http://www.opengis.net/kml/2.2}"

# read the KML file and parse it
tree = ET.parse("2.kml")
root = tree.getroot()

# every Placemark element, at any depth of the document
placemarks = root.findall(f".//{KML_NS}Placemark")

# one row per placemark: [name, latitude, longitude, serialized placemark XML]
data = []

for placemark in placemarks:
    # first <name> / <coordinates> descendant of the placemark (None if absent)
    name = placemark.findtext(f".//{KML_NS}name")
    coordinates = placemark.findtext(f".//{KML_NS}coordinates")
    if coordinates is None or not coordinates.strip():
        # fail with a message that identifies the offending placemark instead
        # of an AttributeError on None / a bare float() conversion error
        raise ValueError(f"Placemark {name!r} has no coordinates")
    # the text is "lon,lat" optionally followed by an altitude; only the
    # first two fields are used, so a remaining altitude no longer breaks
    # the unpacking
    lon, lat = map(float, coordinates.split(",")[:2])
    # keep the serialized element so the original placemark can be re-emitted
    data.append([name, lat, lon, ET.tostring(placemark)])

# convert the data list to a pandas dataframe
df = pd.DataFrame(data, columns=["name", "latitude", "longitude", "placemark"])

# KMeansConstrained is used a few lines below, so it has to be imported
# before this point; otherwise a top-to-bottom run stops with a NameError.
from k_means_constrained import KMeansConstrained

# feature matrix for clustering: one (latitude, longitude) row per placemark
X = df[["latitude", "longitude"]].to_numpy()

# size-constrained k-means with 7 clusters of 100 to 150 members each
# NOTE(review): 7 clusters of 100..150 members can only be satisfied when
# 700 <= len(X) <= 1050 — confirm the number of placemarks in the input.
kmeans = KMeansConstrained(n_clusters=7, size_min=100, size_max=150, random_state=0)
kmeans.fit(X)

# cluster id assigned to each row of X
labels = kmeans.labels_

# store the cluster id next to each placemark (rows of X and df line up 1:1)
df["cluster"] = labels

# create a dictionary of dataframes, one for each cluster id
dict_dataframes = dict(tuple(df.groupby("cluster")))

# create a new, empty KML document; it receives one folder per cluster
kml_file = KML.kml(
    KML.Document()
)

# The loop variable is deliberately not called `df`: rebinding `df` here would
# leave it pointing at the last cluster's sub-frame for any code that runs
# after this loop and still expects the full dataframe.
for cluster, cluster_df in dict_dataframes.items():
    # create a folder for the cluster
    folder = KML.Folder(
        KML.name(f"Cluster {cluster}")
    )
    # re-parse each stored placemark so the original element is carried over
    for placemark_xml in cluster_df["placemark"]:
        folder.append(etree.fromstring(placemark_xml))
    # add the folder to the document
    kml_file.Document.append(folder)

# write the KML file to disk with an explicit encoding rather than the
# platform default
with open("clusters.kml", "w", encoding="utf-8") as f:
    f.write(etree.tostring(kml_file, pretty_print=True).decode())


from k_means_constrained import KMeansConstrained

# apply the size-constrained k-means algorithm with 7 clusters of 100 to 150
# members each
kmeans = KMeansConstrained(n_clusters=7, size_min=100, size_max=150, random_state=0)
kmeans.fit(X)

# cluster id assigned to each row of X
labels = kmeans.labels_

# `labels` has one entry per row of X.  If `df` no longer has that many rows
# (for example because it was rebound to a single cluster's sub-frame by an
# earlier loop), rebuild the full frame from the per-cluster groups, restoring
# the original row order, so the assignment below lines up row-for-row.
if len(df) != len(labels):
    df = pd.concat(list(dict_dataframes.values())).sort_index()

# assign the labels to the dataframe
df["cluster"] = labels

# create a dictionary of dataframes, one for each cluster id
dict_dataframes = dict(tuple(df.groupby("cluster")))

# create a new, empty KML document; it receives one folder per cluster
# NOTE(review): nothing in this block writes `kml_file` to disk — confirm it
# is saved by code that follows.
kml_file = KML.kml(
    KML.Document()
)

# The loop variable is deliberately not called `df`, so the full dataframe
# stays available under that name after the loop.
for cluster, cluster_df in dict_dataframes.items():
    # create a folder for the cluster
    folder = KML.Folder(
        KML.name(f"Cluster {cluster}")
    )
    # build a fresh, minimal placemark (name + point) for every row
    rows = zip(cluster_df["name"], cluster_df["longitude"], cluster_df["latitude"])
    for name, lon, lat in rows:
        placemark = KML.Placemark(
            KML.name(name),
            KML.Point(
                # KML coordinate order is longitude first, then latitude
                KML.coordinates(f"{lon},{lat}")
            )
        )
        # add the placemark to the folder
        folder.append(placemark)
    # add the folder to the document
    kml_file.Document.append(folder)