In [20]:
import pickle
import numpy as np
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import csv
import json

def csv_to_geojson(filename, output_geojson, n_clusters=10):
    """Convert a cluster CSV into a GeoJSON FeatureCollection of points.

    Every CSV row becomes one ``Point`` feature located at (lon, lat) whose
    properties carry the row's ``label`` and integer ``cluster``.

    Args:
        filename: Path of the input CSV; it must provide the columns
            ``label``, ``cluster``, ``lat`` and ``lon``.
        output_geojson: Path of the GeoJSON file to write (UTF-8, indent 4).
        n_clusters: Accepted for call compatibility; not used by the
            conversion.

    Raises:
        KeyError: If a required column is missing from a row.
        ValueError: If ``lat``/``lon``/``cluster`` cannot be parsed as numbers.
    """
    def _row_to_feature(record):
        # GeoJSON positions are ordered [longitude, latitude].
        return {
            "type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": [float(record['lon']), float(record['lat'])],
            },
            "properties": {
                "label": record['label'],
                "cluster": int(record['cluster']),
            },
        }

    # Read and convert every row before the output file is touched, so a
    # malformed row never leaves a partially written GeoJSON behind.
    with open(filename, newline='', encoding='utf-8') as source:
        features = [_row_to_feature(record) for record in csv.DictReader(source)]

    collection = {"type": "FeatureCollection", "features": features}

    with open(output_geojson, 'w', encoding='utf-8') as sink:
        json.dump(collection, sink, ensure_ascii=False, indent=4)

#### PCA (if needed)

In [2]:
def apply_pca(embeddings, n_components=20, start_component=5, random_state=0):
    """Project embeddings onto their leading principal components.

    Prints the explained-variance ratio of every fitted component, followed
    by the summed ratio of the components from index ``start_component``
    onward (0.0 when ``start_component >= n_components``).

    Args:
        embeddings: 2-D array-like passed to ``PCA.fit_transform``.
        n_components: Number of principal components to fit.
        start_component: Index of the first component included in the
            printed variance sum.
        random_state: Seed forwarded to ``PCA``. scikit-learn's automatic
            solver selection may use a randomized SVD, so without a seed
            repeated runs can give slightly different projections. Pass
            ``None`` for unseeded behaviour.

    Returns:
        The transformed embeddings as returned by ``PCA.fit_transform``.
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    reduced_embeddings = pca.fit_transform(embeddings)

    variance_ratio = pca.explained_variance_ratio_
    print(f'Variance {variance_ratio}')
    print(f'Variance {variance_ratio[start_component:].sum()}')

    return reduced_embeddings

### Clustering algorithm (K-Means)

In [3]:
def apply_clustering(embeddings, labels, lat, lon, n_clusters, filename,
                     random_state=0, n_init=10):
    """Cluster embeddings with K-Means and write one CSV row per embedding.

    The CSV has the columns ``label``, ``cluster``, ``lat`` and ``lon``.
    ``labels``, ``lat`` and ``lon`` are paired with the embeddings strictly by
    position: row ``i`` of the output describes ``embeddings[i]``.

    Args:
        embeddings: 2-D array-like with one row per sample.
        labels: Sequence of sample identifiers, one per embedding row.
        lat: Sequence of latitudes, one per embedding row.
        lon: Sequence of longitudes, one per embedding row.
        n_clusters: Number of K-Means clusters.
        filename: Path of the CSV file to write.
        random_state: Seed for K-Means initialisation so repeated runs give
            the same assignment. Pass ``None`` for unseeded behaviour.
        n_init: Number of K-Means initialisations. Set explicitly because
            scikit-learn's default for this argument differs across releases.

    Raises:
        ValueError: If ``labels``, ``lat`` or ``lon`` do not have exactly one
            entry per embedding row.
    """
    embeddings = np.asarray(embeddings)
    print(embeddings.shape)

    # Strip any pandas index: when a Series is mixed with plain arrays in the
    # DataFrame constructor, Series are index-aligned while arrays are
    # positional, which can silently mis-pair rows.
    columns = {
        'label': np.asarray(labels),
        'lat': np.asarray(lat),
        'lon': np.asarray(lon),
    }

    # Validate before the (comparatively expensive) fit.
    n_rows = embeddings.shape[0]
    for name, values in columns.items():
        if len(values) != n_rows:
            raise ValueError(
                f"'{name}' has {len(values)} entries but embeddings has {n_rows} rows"
            )

    print("K-Mean in process")
    kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=random_state)
    clusters = kmeans.fit_predict(embeddings)

    df = pd.DataFrame({
        'label': columns['label'],
        'cluster': clusters,
        'lat': columns['lat'],
        'lon': columns['lon'],
    })

    df.to_csv(filename, index=False)

In [33]:
import ast

def convert_to_list(embedding_str):
    """Parse the textual form of an embedding back into a Python object.

    Uses ``ast.literal_eval``, which only evaluates Python literals (lists,
    numbers, strings, ...) and never executes arbitrary code.

    Args:
        embedding_str: String holding a Python literal, e.g. ``"[0.1, 0.2]"``.

    Returns:
        The evaluated literal.

    Raises:
        ValueError, SyntaxError: If the string is not a valid Python literal.
    """
    parsed = ast.literal_eval(embedding_str)
    return parsed


# --- Weather embeddings: this table defines the reference set and order of cells ---
filename1 = 'data/weather-embedding.csv'
data1 = pd.read_csv(filename1)
# The 'embedding' column is parsed from its string form with convert_to_list.
data1['embedding'] = data1['embedding'].apply(convert_to_list)
embeddings1 = np.array(data1['embedding'].tolist())
labels = data1['cell-id'].values
print(len(labels), (embeddings1.shape))


# --- Elevation/crops embeddings ---
filename2 = 'data/elevation-crops-embedding.csv'
data2 = pd.read_csv(filename2)
# Select rows by cell-id *in the order of `labels`*. A plain `isin` filter keeps
# this file's own row order, and the row-wise concatenation below would then
# pair embeddings of different cells whenever the files are ordered differently.
# `.loc` raises KeyError if a cell-id from `labels` is missing here.
data2_fil = data2.set_index('cell-id').loc[labels].reset_index()
assert len(data2_fil) == len(labels), "duplicate cell-id values in " + filename2

data2_fil['embedding'] = data2_fil['embedding'].apply(convert_to_list)
embeddings2 = np.array(data2_fil['embedding'].tolist())
print(len(data2), (embeddings2.shape))

# --- Third embedding source ---
# NOTE(review): 'data/.csv' looks like a placeholder with the file name
# missing -- confirm the intended path before running this cell.
filename3 = 'data/.csv'
data3 = pd.read_csv(filename3)
data3_fil = data3.set_index('cell-id').loc[labels].reset_index()
assert len(data3_fil) == len(labels), "duplicate cell-id values in " + filename3

data3_fil['embedding'] = data3_fil['embedding'].apply(convert_to_list)
embeddings3 = np.array(data3_fil['embedding'].tolist())
print(len(data3), (embeddings3.shape))


# Concatenate the three sources feature-wise. The previous version passed
# `embeddings2` twice, so `embeddings3` was loaded but never used.
concat_embeddings = np.concatenate((embeddings1, embeddings2, embeddings3), axis=1)
print(concat_embeddings.shape)

4864 (4864, 32)
4890 (4864, 512)
(4864, 544)


##### Apply PCA if needed

In [38]:
# Number of principal components to keep.
n_components = 16
# NOTE(review): `reduced_embeddings` is not what the clustering call further
# down receives (it is given `concat_embeddings`); swap the argument there if
# the PCA-reduced features are meant to be clustered.
reduced_embeddings = apply_pca(concat_embeddings, n_components)


Variance [0.52566341 0.09806223 0.08015789 0.03286504 0.02897634 0.02729958
 0.02600662 0.02173407 0.01989275 0.01695706 0.01484541 0.01280482
 0.01256415 0.01088958 0.00901266 0.00758065]
Variance 0.17958734211385247


In [39]:
coord_data = pd.read_csv('data/location.csv')
# Look coordinates up by cell-id in the order of `labels`. An `isin` filter
# would keep location.csv's own row order, so lat/lon would only line up with
# `labels` (and with the embeddings) if both files happened to be sorted alike.
# `.loc` raises KeyError if a cell-id from `labels` has no coordinates.
coord_data_sub = coord_data.set_index('cell-id').loc[labels].reset_index()
assert len(coord_data_sub) == len(labels), "duplicate cell-id values in data/location.csv"

lat = coord_data_sub['lat']
lon = coord_data_sub['lon']

In [40]:

# Number of K-Means clusters.
n_clusters = 7
# Destination of the GeoJSON export.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
output_geojson = '/home/savvas/SUPER-NAS/USERS/Chirag/TEMP-Folder/emb-cluster/cumm-cluter.geojson'

# Cluster the concatenated embeddings and write the assignments to a CSV.
apply_clustering(concat_embeddings, labels, lat, lon, n_clusters, 'cumm-cluster.csv')


(4864, 544)
K-Mean in process




In [41]:
# CSV produced by the clustering step; convert it to GeoJSON.
filename = 'cumm-cluster.csv'
csv_to_geojson(filename=filename, output_geojson=output_geojson, n_clusters=n_clusters)
