# Clustering

### Setup data

In [7]:
from db.connection import get_connection
import pandas as pd

query = f"""
    SELECT 
    a.report_id as rid, 
    a.occurred as occ, 
    b.embedding as emb, 
    c.latitude as lat, 
    c.longitude as long
    FROM 
    ufo_reports_transform a
    JOIN 
    description_averaged_embeddings b
    ON a.report_id = b.report_id
    JOIN 
    city_county_lat_lon c
    ON 
        TRIM(SPLIT_PART(a.location, ',', 1)) = TRIM(c.city) AND 
        TRIM(SPLIT_PART(a.location, ',', 2)) = TRIM(c.state)
    WHERE occurred is not null
    ;
    """

conn = get_connection()
    
try:
    df = pd.read_sql(query, conn)
except Exception as e:
    print(f"Error executing query: {e}")
    df = None
finally:
    conn.close()


print(df.head)

2024-11-22 23:41:14,527 [INFO] db.connection: Database connection established.
  df = pd.read_sql(query, conn)


<bound method NDFrame.head of             rid         occ  \
0       S161267  1606962600   
1       S144647  1548320400   
2        S98221  1369822500   
3        S41177   337932000   
4        S33851   337924800   
...         ...         ...   
105018   S31615   336889500   
105019  S145401   203745600   
105020   S47792   -47448000   
105021   S35366   337404600   
105022  S144682  1548601800   

                                                      emb        lat  \
0       [0.016444052,0.06212723,0.049611054,0.07362398...  40.837518   
1       [0.033369765,-0.022478051,0.020196505,0.001629...  38.802758   
2       [0.04027589,-0.015154102,0.007161611,0.0472713...  33.908275   
3       [0.028260397,0.04181325,0.024015397,0.04231337...  41.180778   
4       [0.04406667,0.040626522,0.0037232835,0.0134104...  44.225891   
...                                                   ...        ...   
105018  [0.025904214,0.029731477,0.010769772,0.0461099...  39.047133   
105019  [0.031506382,

### DBSCAN

In [None]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def prepare_clustering_data(df):
    embedding_array = np.stack(df['embedding_vector'].apply(eval).to_numpy())
    occurred = df['occurred'].values.reshape(-1, 1)
    lat_lon = df[['latitude', 'longitude']].values
    combined_features = np.hstack([occurred, embedding_array, lat_lon])
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(combined_features)
    return scaled_features

def cluster_with_dbscan(data, eps=0.5, min_samples=10):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
    cluster_labels = dbscan.fit_predict(data)
    return cluster_labels

def visualize_clusters(df, cluster_labels):
    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(
        df['longitude'], 
        df['latitude'], 
        c=cluster_labels, 
        cmap='tab20', 
        alpha=0.7, 
        s=5
    )
    plt.colorbar(scatter, label="Cluster Label")
    plt.title("Clustering by Latitude and Longitude")
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.show()


scaled_features = prepare_clustering_data(df)

cluster_labels = cluster_with_dbscan(scaled_features, eps=2.0, min_samples=50)

df['cluster'] = cluster_labels

visualize_clusters(df, cluster_labels)


KeyError: 'embedding'