# **Import Library**

In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from itertools import combinations
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score




# **Load Data**

In [2]:
# Location of the cleaned innovation dataset, relative to the notebook's working directory.
file_path = "datasets/data_inovasi_clean.csv"

# Read the whole CSV into memory; the default RangeIndex is kept.
df = pd.read_csv(file_path)

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,no,judul_inovasi,pemda,admin_opd,inisiator,nama_inisiator,bentuk_inovasi,jenis,asta_cipta,urusan_utama,...,kematangan,tahapan_inovasi,tanggal_input,tanggal_penerapan,tanggal_pengembangan,video,link_video,label_kematangan,lat,lon
0,1,POP SURGA (Penghantaran Obat Pasien Sumberglagah),Provinsi Jawa Timur,Dinas Kesehatan UPT. RSK Sumberglagah Mojokert...,Kepala Daerah,"drg. SHINTA SAWITRI, M.Kes",Inovasi Pelayanan Publik,Digital,Memperkuat pembangunan sumber daya manusia (SD...,Kesehatan,...,85.0,Penerapan,2022-07-22,2023-11-29,,Ada,https://www.youtube.com/watch?v=o_TedznOu3U,Sangat Inovatif,-7.607391,112.540787
1,2,PHEC (Pre Hospital Emergency Care),Provinsi Jawa Timur,Dinas Kesehatan UPT. RSK Sumberglagah Mojokert...,OPD,"drg. SHINTA SAWITRI, M.Kes",Inovasi Pelayanan Publik,Non Digital,Memperkuat pembangunan sumber daya manusia (SD...,Pendidikan,...,92.0,Penerapan,2022-07-22,2023-06-24,,Ada,https://youtu.be/TJaII4_0UkI,Sangat Inovatif,-7.607311,112.540723
2,3,Naskah Dinas Elektronik (NADINE),Provinsi Jawa Timur,Badan Koordinasi Wilayah Pamekasan Provinsi Ja...,OPD,"Dra. SUFI AGUSTINI, M.Si",Inovasi Tata Kelola Pemerintahan Daerah,Teknologi,"Memperkuat reformasi politik, hukum, dan birok...",Kearsipan,...,110.0,Penerapan,2023-06-20,2024-12-31,2024-12-31,Ada,https://drive.google.com/file/d/192dhJGWtC4lDc...,Sangat Inovatif,-7.158554,113.482726
3,4,PERMATA ( Pertanian Ramah Lingkungan menuju Ma...,Provinsi Jawa Timur,Dinas Pertanian dan Ketahanan Pangan (jatimpro...,OPD,Tidak Disebutkan,Inovasi Daerah Lainnya Sesuai Dengan Urusan Pe...,Digital,Tidak Diisi,Pertanian,...,60.0,Uji Coba,2023-06-26,2023-09-25,,Tidak Ada,-,Inovatif,,
4,5,SIGALON,Provinsi Jawa Timur,Dinas Kesehatan UPT. RS Mata Masyarakat (jatim...,OPD,Tidak Disebutkan,Inovasi Pelayanan Publik,Digital,Tidak Diisi,Kesehatan,...,52.0,Penerapan,2023-07-10,2023-07-10,,Ada,https://drive.google.com/file/d/1JKCIqN4TRVL-O...,Inovatif,,


# **Text Embedding**

## Menggabungkan Fitur

In [3]:
# Build one descriptive sentence per row by labelling four columns and joining
# the labelled pieces with ". ", e.g.
#   "Judul: <judul>. Urusan: <urusan>. Tahapan: <tahapan>. Kematangan: <label>"
# A missing value in any of the four columns makes the whole sentence missing.
bagian_teks = [
    "Judul: " + df["judul_inovasi"],
    "Urusan: " + df["urusan_utama"],
    "Tahapan: " + df["tahapan_inovasi"],
    "Kematangan: " + df["label_kematangan"],
]
df["teks_fitur"] = bagian_teks[0].str.cat(bagian_teks[1:], sep=". ")

## Embedding Vektor

In [4]:
# Load the pretrained multilingual sentence-embedding model by name.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Encode every combined feature sentence; one embedding row per DataFrame row,
# in the same order as df.
kalimat = df["teks_fitur"].tolist()
embeddings = model.encode(kalimat, show_progress_bar=True)

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

## **Encoding**

In [5]:
# One independent encoder per categorical column, so each keeps its own classes_.
le_urusan, le_tahapan, le_kematangan = LabelEncoder(), LabelEncoder(), LabelEncoder()

# (encoder, source column, encoded column) — fitted and applied in this order.
rencana_encoding = (
    (le_urusan, "urusan_utama", "urusan_enc"),
    (le_tahapan, "tahapan_inovasi", "tahapan_enc"),
    (le_kematangan, "label_kematangan", "kematangan_enc"),
)
for encoder, kolom_asal, kolom_enc in rencana_encoding:
    df[kolom_enc] = encoder.fit_transform(df[kolom_asal])

# Integer-coded categorical features as a 2-D array (rows x 3).
fitur_tambahan = df[["urusan_enc", "tahapan_enc", "kematangan_enc"]].to_numpy()

In [6]:
# Append the three encoded categorical columns to the right of the text embeddings.
X = np.concatenate([embeddings, fitur_tambahan], axis=1)

# Standardise every column to zero mean and unit variance.
# NOTE(review): none of the cells visible in this notebook consume X_scaled — the
# clustering and similarity steps all run on the raw `embeddings`; confirm whether
# that is intentional.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# **Model Clustering**

## Mencari Jumlah Cluster Terbaik (Benchmark)

In [7]:
def _evaluate_clustering(name, data, labels):
    """Return one benchmark row with the three internal validity metrics.

    Parameters
    ----------
    name : str
        Display name stored under "Model" (later parsed to rebuild the model).
    data : array-like
        Feature matrix the labels were fitted on.
    labels : array-like
        Cluster label per row of ``data``.

    Returns
    -------
    dict
        Keys "Model", "Silhouette", "Calinski-Harabasz", "Davies-Bouldin".
    """
    return {
        "Model": name,
        "Silhouette": silhouette_score(data, labels),
        "Calinski-Harabasz": calinski_harabasz_score(data, labels),
        "Davies-Bouldin": davies_bouldin_score(data, labels),
    }


results = []

# Cluster counts tried for the algorithms that need k up front.
k_candidates = range(2, 8)

# KMEANS
# The estimator is bound to `clusterer`, not `model`, so the SentenceTransformer
# stored in `model` by the embedding cell is not overwritten.
for k in k_candidates:
    clusterer = KMeans(n_clusters=k, random_state=42)
    labels = clusterer.fit_predict(embeddings)
    results.append(_evaluate_clustering(f"KMeans k={k}", embeddings, labels))

# AGGLOMERATIVE
for k in k_candidates:
    clusterer = AgglomerativeClustering(n_clusters=k)
    labels = clusterer.fit_predict(embeddings)
    results.append(_evaluate_clustering(f"Agglomerative k={k}", embeddings, labels))

# DBSCAN
dbscan = DBSCAN(eps=0.7, min_samples=5)
labels = dbscan.fit_predict(embeddings)

# DBSCAN only enters the benchmark when it finds more than one label and marks
# no point as noise (-1); otherwise it is silently left out of the table.
unique_labels = set(labels)
if len(unique_labels) > 1 and -1 not in unique_labels:
    results.append(_evaluate_clustering("DBSCAN", embeddings, labels))

# Rank by silhouette (higher is better); the top row decides the final model.
df_results = pd.DataFrame(results).sort_values(by="Silhouette", ascending=False)
best_row = df_results.iloc[0]
best_model_name = best_row["Model"]
df_results

Unnamed: 0,Model,Silhouette,Calinski-Harabasz,Davies-Bouldin
8,Agglomerative k=4,0.084817,29.451706,3.052642
0,KMeans k=2,0.083766,53.860971,3.352194
7,Agglomerative k=3,0.082502,34.256228,3.277395
6,Agglomerative k=2,0.081338,42.331249,3.744928
3,KMeans k=5,0.077171,32.411157,3.080403
10,Agglomerative k=6,0.070416,24.323352,3.288002
9,Agglomerative k=5,0.065817,26.71931,3.500689
2,KMeans k=4,0.065099,35.943797,3.336734
1,KMeans k=3,0.060522,38.409453,3.337063
11,Agglomerative k=7,0.046166,22.536179,3.288368


In [8]:
def parse_model_info(model_name):
    """Split a benchmark label into (model type, number of clusters).

    Parameters
    ----------
    model_name : str
        Label such as "KMeans k=4", "Agglomerative k=3" or "DBSCAN".

    Returns
    -------
    tuple
        ("kmeans", k) or ("agglomerative", k) with k as int, or ("dbscan", None).

    Raises
    ------
    ValueError
        If the name matches no known model, or a k-based model has no "k=<int>".
    """
    # Checked in this order so precedence among the keywords stays fixed.
    k_based_models = (("KMeans", "kmeans"), ("Agglomerative", "agglomerative"))

    for keyword, model_type in k_based_models:
        if keyword in model_name:
            match = re.search(r'k=(\d+)', model_name)
            if match is None:
                raise ValueError(f"No 'k=<int>' found in model name: {model_name!r}")
            return model_type, int(match.group(1))

    if "DBSCAN" in model_name:
        return "dbscan", None

    # Fail loudly instead of returning None, which would only surface later as an
    # unpacking error at the call site.
    raise ValueError(f"Unrecognized model name: {model_name!r}")

# Recover the winning algorithm and its cluster count from the benchmark label.
model_type, best_k = parse_model_info(best_model_name)

# Lazy constructors keyed by model type; only the selected one is instantiated.
# Hyperparameters mirror the ones used during benchmarking.
model_builders = {
    "kmeans": lambda: KMeans(n_clusters=best_k, random_state=42),
    "agglomerative": lambda: AgglomerativeClustering(n_clusters=best_k),
    "dbscan": lambda: DBSCAN(eps=0.7, min_samples=5),
}
final_model = model_builders[model_type]()

# Refit on the raw embeddings and store the cluster label of every row.
df["cluster"] = final_model.fit_predict(embeddings)

In [9]:
# NOTE(review): dead cell — a commented-out manual override that would fit
# AgglomerativeClustering(n_clusters=4) on `embeddings` and overwrite df["cluster"].
# It does nothing when run; consider deleting it.
# from sklearn.cluster import AgglomerativeClustering

# cluster_model = AgglomerativeClustering(n_clusters=4)
# df["cluster"] = cluster_model.fit_predict(embeddings)

# **Similarity**

In [10]:
# Pairwise cosine similarity between all rows of `embeddings`;
# sim_matrix[i, j] compares embedding row i with embedding row j.
sim_matrix = cosine_similarity(embeddings)

## Skor Kolaborasi

In [11]:
def skor_pasangan(i, j, w_sim=0.5, w_urusan=0.25, w_tahapan=0.25, skor_tahapan_beda=0.5):
    """Collaboration score for a pair of innovations.

    The score is a weighted sum of three components read from the notebook-level
    ``sim_matrix`` and ``df``:

    * cosine similarity ``sim_matrix[i, j]``;
    * 1 if both rows share the same ``urusan_utama``, else 0;
    * 1 if both rows share the same ``tahapan_inovasi``, else ``skor_tahapan_beda``.

    Parameters
    ----------
    i, j : int
        Row identifiers, used both as positions in ``sim_matrix`` and as index
        labels in ``df`` (so ``df`` is expected to keep its default RangeIndex).
    w_sim, w_urusan, w_tahapan : float, optional
        Weights of the three components. Defaults reproduce the original
        0.5 / 0.25 / 0.25 split.
    skor_tahapan_beda : float, optional
        Partial credit given when the two stages differ (default 0.5).

    Returns
    -------
    float
        Weighted collaboration score.
    """
    sim_score = sim_matrix[i, j]

    # .at is the scalar label-based accessor; same lookup as .loc for one cell.
    urusan_sama = df.at[i, "urusan_utama"] == df.at[j, "urusan_utama"]
    tahapan_sama = df.at[i, "tahapan_inovasi"] == df.at[j, "tahapan_inovasi"]

    urusan_score = 1 if urusan_sama else 0
    # A different stage is not penalised fully: it still earns partial credit.
    tahapan_score = 1 if tahapan_sama else skor_tahapan_beda

    return (w_sim * sim_score) + (w_urusan * urusan_score) + (w_tahapan * tahapan_score)

## Skor untuk 2-3 Inovasi

In [12]:
def skor_grup(indices):
    """Average pairwise collaboration score of a group.

    Every unordered pair drawn from ``indices`` is scored with ``skor_pasangan``
    and the arithmetic mean of those scores is returned.
    """
    pair_scores = []
    for kiri, kanan in combinations(indices, 2):
        pair_scores.append(skor_pasangan(kiri, kanan))
    return np.mean(pair_scores)

## Top Kolaborasi

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

hasil_kolaborasi = []

# How many of the most representative innovations are kept per cluster.
top_n = 10

for cluster_id in df["cluster"].unique():
    # Row labels of the cluster members, reused as positions into `embeddings`.
    anggota_mask = (df["cluster"] == cluster_id).to_numpy()
    anggota_idx = df.index[anggota_mask].tolist()
    emb_cluster = embeddings[anggota_idx]

    # Cluster centroid: mean embedding, kept 2-D with shape (1, dim).
    centroid = emb_cluster.mean(axis=0, keepdims=True)

    # Cosine similarity of every member to the centroid (higher = closer).
    sim_ke_centroid = cosine_similarity(emb_cluster, centroid).ravel()

    # Keep the top_n members most similar to the centroid, in ascending order
    # of similarity; smaller clusters keep all of their members.
    urutan_naik = np.argsort(sim_ke_centroid)
    anggota_terpilih = [anggota_idx[pos] for pos in urutan_naik[-top_n:]]

    # Score every pair, then every triple, of the selected members.
    hasil_kolaborasi.extend(
        {
            "cluster": cluster_id,
            "anggota": combo,
            "jumlah_inovasi": r,
            "skor": skor_grup(combo),
        }
        for r in [2, 3]
        for combo in combinations(anggota_terpilih, r)
    )

kolaborasi_df = pd.DataFrame(hasil_kolaborasi)

## Filter Top per Cluster

In [14]:
# Order all candidate groups from highest to lowest score ...
kolaborasi_terurut = kolaborasi_df.sort_values(by="skor", ascending=False)

# ... then keep the two best-scoring groups of each cluster.
top_per_cluster = kolaborasi_terurut.groupby("cluster").head(2)

In [15]:
# Filter agar inovasi yang sama tidak muncul terus
used_inovasi = set()
final_rekomendasi = []

for _, row in top_per_cluster.sort_values("skor", ascending=False).iterrows():
    anggota_set = set(row["anggota"])

    if len(used_inovasi.intersection(anggota_set)) <= 1:
        final_rekomendasi.append(row)
        used_inovasi.update(anggota_set)

final_df = pd.DataFrame(final_rekomendasi).sort_values("skor", ascending=False).head(5)

In [16]:
# Columns shown for every member of a recommended group, in display order.
kolom_tampil = ["judul_inovasi", "urusan_utama", "tahapan_inovasi", "label_kematangan"]

# Print one report section per recommended collaboration group.
for _, rekomendasi in final_df.iterrows():
    print("\n==============================")
    print(f"Cluster {rekomendasi['cluster']} | Skor Kolaborasi: {rekomendasi['skor']:.3f}")
    print(f"Jumlah Inovasi: {rekomendasi['jumlah_inovasi']}")
    print("------------------------------")

    # One bullet line per member: title | affair | stage | maturity label.
    for idx in rekomendasi["anggota"]:
        nilai = [str(df.loc[idx, kolom]) for kolom in kolom_tampil]
        print("•", " | ".join(nilai))


Cluster 0 | Skor Kolaborasi: 0.991
Jumlah Inovasi: 2
------------------------------
• APLIKASI VOGASI | Pendidikan | Penerapan | Sangat Inovatif
• BUPERTIWI | Pendidikan | Penerapan | Sangat Inovatif

Cluster 3 | Skor Kolaborasi: 0.965
Jumlah Inovasi: 2
------------------------------
• SIGALON | Kesehatan | Penerapan | Inovatif
• SIAP ANTAR OBAT TANPA BIAYA (SIAPA) | Kesehatan | Penerapan | Sangat Inovatif

Cluster 3 | Skor Kolaborasi: 0.964
Jumlah Inovasi: 2
------------------------------
• SIGALON | Kesehatan | Penerapan | Inovatif
• GERIATRI PRIORITAS (GETAS) | Kesehatan | Penerapan | Sangat Inovatif

Cluster 1 | Skor Kolaborasi: 0.954
Jumlah Inovasi: 2
------------------------------
• ECODAYA | Pendidikan | Penerapan | Sangat Inovatif
• BESTIRAMLI MELALUI ECOBRICK | Pendidikan | Penerapan | Sangat Inovatif

Cluster 2 | Skor Kolaborasi: 0.945
Jumlah Inovasi: 2
------------------------------
• Sistem Informasi Kendaraan Dinas (SIKENDIS) | Kearsipan | Penerapan | Sangat Inovatif
• SI