# Import Packages and Load Clustering Data

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import folium as fl
from folium.plugins import MarkerCluster
import numpy as np
import pandas as pd
import random

In [14]:
# Read the four future-scenario CSV files (bioclim, prec, tmin, tmax) in one pass;
# the frames are unpacked in the same order as the paths are listed.
(
    plant_data_future_bioclim,
    plant_data_future_prec,
    plant_data_future_tmin,
    plant_data_future_tmax,
) = map(
    pd.read_csv,
    (
        "../raw_data/plant_data_inkl_bioclim/plant_data_future_bioclim.csv",
        "../raw_data/plant_data_inkl_bioclim/plant_data_future_prec.csv",
        "../raw_data/plant_data_inkl_bioclim/plant_data_future_tmin.csv",
        "../raw_data/plant_data_inkl_bioclim/plant_data_future_tmax.csv",
    ),
)

In [15]:
# Column-name lists used to select feature columns:
# "bioc1".."bioc19", and "prec1".."prec12", "tmin1".."tmin12", "tmax1".."tmax12".
bclim_feature_list = [f"bioc{idx}" for idx in range(1, 20)]
prec_list = [f"prec{idx}" for idx in range(1, 13)]
tmin_list = [f"tmin{idx}" for idx in range(1, 13)]
tmax_list = [f"tmax{idx}" for idx in range(1, 13)]

In [16]:
# Standardize the tmin feature columns before clustering
tmin_features = plant_data_future_tmin[tmin_list]
scaler = StandardScaler()
scaled_plant_data_future_tmin = scaler.fit_transform(tmin_features)

# Configure (but do not yet fit) the K-Means estimator
num_clusters = 10
kmeans_settings = dict(
    init="random",
    n_clusters=num_clusters,
    n_init=10,
    max_iter=300,
    random_state=42,  # fixed seed so the clustering is repeatable
)
kmeans = KMeans(**kmeans_settings)

In [17]:
%%time
kmeans.fit(scaled_plant_data_future_tmin)
plant_data_future_tmin['Cluster_No'] = kmeans.labels_
# plant_data_future_tmin.head(3)

# Get the cluster partition numbers in descending order
cluster_count = plant_data_future_tmin['Cluster_No'].value_counts()
cluster_count = cluster_count.rename_axis('Cluster_No').reset_index(name='Counts')

CPU times: user 6.22 s, sys: 163 ms, total: 6.39 s
Wall time: 1.68 s


In [18]:
# Cluster colors
# Draw one random "#RRGGBB" color per cluster from a dedicated, seeded generator,
# so the map colors are identical on every run (the clustering itself is already
# pinned with random_state=42). The global `random` state is left untouched.
color_rng = random.Random(42)
rand_colors = np.array([
    "#" + "".join(color_rng.choices("ABCDEF0123456789", k=6))
    for _ in range(num_clusters)
])

# Set the colors for different clusters.
# cluster_count is ordered by descending cluster size, so the first color
# corresponds to the largest group, ..., the last one to the smallest group.
cluster_count["Color"] = rand_colors

# Getting the color vector arranged by cluster number: with every cluster number
# 0..num_clusters-1 present, colors_vec[k] is the color assigned to cluster k.
colors_vec = cluster_count.sort_values("Cluster_No")["Color"].tolist()
# colors_vec

In [23]:
%%time
# Create a map for the clustered data
# Map center
center = [plant_data_future_tmin['decimalLatitude'].mean(),
          plant_data_future_tmin['decimalLongitude'].mean()]

# Create a map
map_clustered_data = fl.Map(location = center, zoom_start = 1.7)

plant_data_future_tmin.apply(lambda row:fl.CircleMarker(
    location=[row["decimalLatitude"],row["decimalLongitude"]],
    radius = 0.8,fill=True,
    color = colors_vec[row["Cluster_No"]],
    popup = fl.Popup("tmin1 = "+str(row["tmin1"])+' C',max_width=115)).add_to(map_clustered_data),axis=1)

CPU times: user 2min 22s, sys: 4.28 s, total: 2min 27s
Wall time: 2min 26s


0        <folium.vector_layers.CircleMarker object at 0...
1        <folium.vector_layers.CircleMarker object at 0...
2        <folium.vector_layers.CircleMarker object at 0...
3        <folium.vector_layers.CircleMarker object at 0...
4        <folium.vector_layers.CircleMarker object at 0...
                               ...                        
74372    <folium.vector_layers.CircleMarker object at 0...
74373    <folium.vector_layers.CircleMarker object at 0...
74374    <folium.vector_layers.CircleMarker object at 0...
74375    <folium.vector_layers.CircleMarker object at 0...
74376    <folium.vector_layers.CircleMarker object at 0...
Length: 74377, dtype: object

In [24]:
%%time
map_clustered_data.save("../raw_data/plant_data_inkl_bioclim/tmin_map.html")

CPU times: user 3min 16s, sys: 3.46 s, total: 3min 19s
Wall time: 3min 19s
