In [212]:
import rdflib
import nltk
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import numpy as np

In [213]:
file_path = 'kaggle_data.ttl'
graph = rdflib.Graph()

In [214]:
graph.parse(file_path, format='ttl')

<Graph identifier=Nde24043195e44d8b9acf8c8ea34bfac8 (<class 'rdflib.graph.Graph'>)>

In [216]:
descriptions = []
for subject in graph.subjects(rdflib.RDF.type, rdflib.URIRef("http://www.w3.org/ns/dcat#Dataset")):
    description = graph.value(subject, rdflib.URIRef("http://www.w3.org/ns/dcat#description"))
    if description:
        descriptions.append(str(description))

In [217]:
def get_stopwords(language):
    if language == 'en':
        return set(stopwords.words('english'))
    elif language == 'id':
        return set(stopwords.words('indonesian'))
    else:
        return set(stopwords.words('english'))

In [218]:
def preprocess_text(text):
    # Deteksi bahasa dari teks
    language = detect(text)
    
    # Pilih stopwords yang sesuai berdasarkan bahasa
    stop_words = get_stopwords(language)
    
    # Tokenize teks dan hapus stopwords serta tanda baca
    tokens = word_tokenize(text.lower())  # Mengubah teks menjadi lowercase
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    
    return tokens

In [219]:
model = Word2Vec(sentences=processed_descriptions, vector_size=100, window=5, min_count=1, workers=4)

In [220]:
print(f"Vektor untuk deskripsi pertama:")
print(model.wv[processed_descriptions[0][0]]) 

Vektor untuk deskripsi pertama:
[-0.00536277 -0.00655301 -0.00772686  0.00848864 -0.00198664 -0.0069392
 -0.00395922  0.00527567 -0.00302814 -0.00387022  0.00175496 -0.0028354
 -0.00155322  0.00104516 -0.00290165  0.0085095   0.00400042 -0.01001861
  0.00606488 -0.00692831  0.00081072  0.00450551 -0.00480644 -0.00210955
  0.00809131 -0.00422534 -0.00766526  0.00929787 -0.00225674 -0.00475391
  0.00869525  0.00420321  0.00437402  0.00924373 -0.00848911  0.00540764
  0.00215635  0.00420983  0.00166616  0.00444793  0.00457372  0.00614367
 -0.00330293 -0.00464487 -0.00032022  0.00254502 -0.00333482  0.00616129
  0.0041869   0.00778726  0.0025661   0.0081052  -0.0014254   0.00800724
  0.00367634 -0.008121   -0.00389475 -0.00251249  0.00492651 -0.00093869
 -0.0028437   0.00783066  0.00953491 -0.00170527 -0.00523883 -0.00467714
 -0.00491853 -0.00943604  0.0011914  -0.00431092  0.00253706  0.00572932
 -0.0039924  -0.00950438  0.00150195 -0.00678263  0.00244961 -0.00378511
  0.00700449  0.00054

In [221]:
from sklearn.metrics.pairwise import cosine_similarity

vector_1 = np.mean([model.wv[word] for word in processed_descriptions[0] if word in model.wv], axis=0)
vector_2 = np.mean([model.wv[word] for word in processed_descriptions[1] if word in model.wv], axis=0)

In [222]:
similarity = cosine_similarity([vector_1], [vector_2])
print(f"Cosine similarity antara deskripsi pertama dan kedua: {similarity[0][0]}")

Cosine similarity antara deskripsi pertama dan kedua: 1.0


In [223]:
model.save("word2vec_model.model")

In [224]:
import numpy as np 

def get_semantic_features(description, model):
    tokens = preprocess_text(description)
    vector = np.mean([model.wv[word] for word in tokens if word in model.wv], axis=0)
    return vector if vector is not None else np.zeros(model.vector_size)

In [225]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    return tokens

In [226]:
def combine_features(semantic_feature, structural_features):
    # Gabungkan fitur semantik dan struktural menjadi satu vektor
    combined_feature = np.concatenate([semantic_feature, structural_features], axis=0)
    return combined_feature

In [227]:
model = Word2Vec.load("word2vec_model.model")

In [228]:
def get_semantic_features(description, model):
    tokens = preprocess_text(description)
    vector = np.mean([model.wv[word] for word in tokens if word in model.wv], axis=0)
    return vector if vector is not None else np.zeros(model.vector_size)

In [229]:
categories = []
formats = []
creators = []

In [230]:
for subject in graph.subjects(rdflib.RDF.type, rdflib.URIRef("http://www.w3.org/ns/dcat#Dataset")):
    category = graph.value(subject, rdflib.URIRef("http://www.w3.org/ns/dcat#theme"))
    format_ = graph.value(subject, rdflib.URIRef("http://www.w3.org/ns/dcat#mediaType"))

    if category: categories.append(str(category))  
    if format_: formats.append(str(format_))

In [231]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoded_categories = encoder.fit_transform(categories) 
encoded_formats = encoder.fit_transform(formats)

In [232]:
print(f"Panjang descriptions: {len(descriptions)}")
print(f"Panjang encoded_categories: {len(encoded_categories)}")
print(f"Panjang encoded_formats: {len(encoded_formats)}")

Panjang descriptions: 20
Panjang encoded_categories: 18
Panjang encoded_formats: 20


In [233]:
default_category = -1 
default_format = "N/A"

In [234]:
all_features = []
for i in range(len(descriptions)):
    # Cek apakah kategori dan format ada
    if i < len(encoded_categories) and i < len(encoded_formats):
        category = encoded_categories[i]
        format_ = encoded_formats[i]
    else:
        category = default_category  # Jika tidak ada kategori, gunakan nilai default
        format_ = default_format  # Jika tidak ada format, gunakan nilai default
    
    # Ambil fitur semantik (vektor 1D)
    semantic_feature = get_semantic_features(descriptions[i], model)  
    
    # Gabungkan fitur semantik dan struktural menjadi satu vektor (pastikan keduanya 1D)
    structural_features = np.array([category, format_])  # Gunakan kategori dan format
    combined_feature = np.concatenate([semantic_feature.flatten(), structural_features.flatten()], axis=0)  
    
    # Simpan fitur gabungan
    all_features.append(combined_feature)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [235]:
import pandas as pd
combined_features_df = pd.DataFrame(all_features, columns=[f'feature_{i+1}' for i in range(len(all_features[0]))])

In [236]:
print(combined_features_df)

       feature_1     feature_2    feature_3    feature_4    feature_5  \
0      -0.006503      0.007063      0.00054     0.005804     0.006954   
1            NaN           2.0          1.0         None         None   
2       0.000007      0.003125    -0.006785    -0.001309     0.007658   
3      -0.005363     -0.006553    -0.007727     0.008489    -0.001987   
4      -0.008807      0.002166    -0.000856     -0.00925    -0.009446   
5      -0.003091      0.001294     0.000469     0.003388    -0.000438   
6            NaN           6.0          1.0         None         None   
7      -0.008807      0.002166    -0.000856     -0.00925    -0.009446   
8            NaN           1.0          0.0         None         None   
9            NaN           8.0          3.0         None         None   
10     -0.009758      0.009032     0.004203      0.00936     0.006601   
11      0.000007      0.003125    -0.006785    -0.001309     0.007658   
12     -0.002678     -0.001714    -0.007256      0.

In [237]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [238]:
from sklearn.metrics.pairwise import cosine_similarity

# Ganti nilai 'N/A' dengan 0 di seluruh DataFrame
combined_features_df = combined_features_df.replace("N/A", 0)

# Pastikan DataFrame hanya berisi nilai numerik
combined_features_df = combined_features_df.apply(pd.to_numeric, errors='coerce').fillna(0)

# Sekarang, kita dapat menghitung similarity matrix
similarity_matrix = cosine_similarity(combined_features_df)

# Tampilkan similarity matrix
print(similarity_matrix)


[[ 1.00000000e+00  1.21795676e-03  9.18567266e-01  3.70797933e-01
   9.58240993e-01  9.98242913e-01  1.31021466e-03  9.60773835e-01
   1.31157264e-03  1.26327749e-03  9.67368055e-01  9.63787914e-01
   9.98206883e-01  9.90825412e-01  1.31257365e-03  9.56067307e-01
   9.54183428e-01  9.96217172e-01 -9.26449512e-01 -9.26449512e-01]
 [ 1.21795676e-03  1.00000000e+00 -1.68998306e-04 -9.30178436e-03
   1.40707557e-04  4.32242409e-04  9.55779009e-01  1.54644746e-04
   8.94427191e-01  9.94505453e-01  1.23514424e-03 -2.64139826e-05
  -1.51080829e-03  1.17854691e-03  9.48683298e-01  1.29066778e-04
   1.19199626e-04 -1.94946087e-03  9.94178586e-03  9.94178586e-03]
 [ 9.18567266e-01 -1.68998306e-04  1.00000000e+00  7.05334964e-01
   7.67602016e-01  8.93781374e-01  1.38985988e-03  7.73332223e-01
   2.20801983e-03  3.84171713e-04  7.88751503e-01  7.80473819e-01
   8.93984591e-01  8.57083996e-01  1.50786384e-03  7.62771694e-01
   7.58645845e-01  9.47771810e-01 -7.04886110e-01 -7.04886110e-01]
 [ 3.70

In [239]:
similarity_matrix = cosine_similarity(combined_features_df)

In [240]:
print(similarity_matrix)

[[ 1.00000000e+00  1.21795676e-03  9.18567266e-01  3.70797933e-01
   9.58240993e-01  9.98242913e-01  1.31021466e-03  9.60773835e-01
   1.31157264e-03  1.26327749e-03  9.67368055e-01  9.63787914e-01
   9.98206883e-01  9.90825412e-01  1.31257365e-03  9.56067307e-01
   9.54183428e-01  9.96217172e-01 -9.26449512e-01 -9.26449512e-01]
 [ 1.21795676e-03  1.00000000e+00 -1.68998306e-04 -9.30178436e-03
   1.40707557e-04  4.32242409e-04  9.55779009e-01  1.54644746e-04
   8.94427191e-01  9.94505453e-01  1.23514424e-03 -2.64139826e-05
  -1.51080829e-03  1.17854691e-03  9.48683298e-01  1.29066778e-04
   1.19199626e-04 -1.94946087e-03  9.94178586e-03  9.94178586e-03]
 [ 9.18567266e-01 -1.68998306e-04  1.00000000e+00  7.05334964e-01
   7.67602016e-01  8.93781374e-01  1.38985988e-03  7.73332223e-01
   2.20801983e-03  3.84171713e-04  7.88751503e-01  7.80473819e-01
   8.93984591e-01  8.57083996e-01  1.50786384e-03  7.62771694e-01
   7.58645845e-01  9.47771810e-01 -7.04886110e-01 -7.04886110e-01]
 [ 3.70

In [241]:
knn = NearestNeighbors(n_neighbors=5, metric='cosine') 

In [242]:
knn.fit(combined_features_df)

In [414]:
dataset_id = 0 
distances, indices = knn.kneighbors([combined_features_df.iloc[dataset_id]])



In [415]:
print(f"Rekomendasi untuk dataset '{descriptions[dataset_id]}':")
for i in range(1, len(indices[0])):  # Mulai dari 1 untuk menghindari merekomendasikan dirinya sendiri
    recommended_dataset = indices[0][i]
    print(f"- {descriptions[recommended_dataset]} (Jarak: {distances[0][i]:.4f})")

Rekomendasi untuk dataset 'Bahasa Isyarat Indonesia (BISINDO) is a sign language that applies in Indonesia.':
- The Indonesia Election News Dataset 2024 (Berita Pemilu 2024) in detik.com (Jarak: 0.0018)
- Contains all the stocks in Jakarta Composite Index/IHSG (minutes, hourly, daily) (Jarak: 0.0018)
- Aggregated from 3 news sources (Jarak: 0.0038)
- 📊 Hospital Data in Indonesia 🏥 (Complete with Class, Ownership and Capacity)🏥 (Jarak: 0.0092)


In [416]:
dataset_i = 1
distances, indices = knn.kneighbors([combined_features_df.iloc[dataset_i]])



In [417]:
print(f"Rekomendasi untuk dataset '{descriptions[dataset_i]}':")
for i in range(1, len(indices[0])):  # Mulai dari 1 untuk menghindari merekomendasikan dirinya sendiri
    recommended_dataset = indices[0][i]
    print(f"- {descriptions[recommended_dataset]} (Jarak: {distances[0][i]:.1f})")

Rekomendasi untuk dataset 'Hourly measurement of air pollution':
- Plate number of Indramayu / Cirebon Area (E) (Jarak: 0.0)
- Klasifikasi Tingkat Kemiskinan Menggunakan Regresi Logistik (Jarak: 0.0)
- Scraped from rumah123.com (20/03/2024 - 21/05/2024) (Jarak: 0.1)
- Multi-Labeled Hate Speech and Abusive Indonesian Twitter Text by okkyibrohim (Jarak: 0.1)


In [247]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [248]:
titles = []

In [249]:
for subject in graph.subjects(rdflib.RDF.type, rdflib.URIRef("http://www.w3.org/ns/dcat#Dataset")):
    title = graph.value(subject, rdflib.URIRef("http://www.w3.org/ns/dcat#title"))
    if title and description:
        titles.append(str(title))

In [384]:
from googletrans import Translator

def recommend_datasets_by_keyword(keyword, titles, descriptions, top_n=5):
    keyword_lower = keyword.lower()  # Menggunakan keyword dalam lowercase
    relevant_datasets = []
    
    # Menggunakan Google Translate untuk terjemahan otomatis
    translator = Translator()
    keyword_en = translator.translate(keyword, src='id', dest='en').text.lower()  # Terjemahkan ke bahasa Inggris
    
    for idx, description in enumerate(descriptions):
        # Cek apakah keyword dalam bahasa Indonesia atau terjemahannya dalam bahasa Inggris ada di deskripsi atau judul
        if keyword_lower in description.lower() or keyword_en in description.lower() or \
            keyword_lower in titles[idx].lower() or keyword_en in titles[idx].lower():
            relevant_datasets.append((titles[idx], descriptions[idx]))  # Menambahkan title dan description
    
    # Mengembalikan daftar yang relevan
    return relevant_datasets[:top_n]

In [385]:
keyword = "saham" 
recommend_datasets_by_keyword(keyword, titles, descriptions)

[('Dataset Saham Indonesia / Indonesia Stock Dataset',
  'Contains all the stocks in Jakarta Composite Index/IHSG (minutes, hourly, daily)'),
 ('Pergerakan Saham di Bursa Efek Indonesia',
  'Pergerakan saham-saham di Bursa Efek Indonesia')]

In [386]:
keyword = "polusi" 
recommend_datasets_by_keyword(keyword, titles, descriptions)

[('Air Quality in Yogyakarta, Indonesia (2021)',
  'Hourly measurement of air pollution')]

In [405]:
keyword = "indonesia" 
recommend_datasets_by_keyword(keyword, titles, descriptions)

[('Bahasa Isyarat Indonesia (BISINDO) Alphabets',
  'Bahasa Isyarat Indonesia (BISINDO) is a sign language that applies in Indonesia.'),
 ('Air Quality in Yogyakarta, Indonesia (2021)',
  'Hourly measurement of air pollution'),
 ('Population of Indonesia (2050-1955)',
  'Historical Population. Population Forecast, Population in Major Cities'),
 ('Data Wilayah Republic Indonesia',
  'Kaggle is the world’s largest data science community with powerful tools and resources to help you achieve your data science goals.'),
 ('Dataset Tanaman Padi Sumatera, Indonesia',
  'Dataset Hasil Produksi dari tahun 1993-2020')]

In [None]:
def precision_at_k(recommended, relevant, k=5):
    recommended_at_k = recommended[:k]
    relevant_set = set(relevant)
    recommended_set_at_k = set(recommended_at_k)
    
    # Menghitung presisi
    return len(recommended_set_at_k.intersection(relevant_set)) / len(recommended_set_at_k) if recommended_set_at_k else 0

In [388]:
# Fungsi untuk menghitung recall
def recall_at_k(recommended, relevant, k=5):
    recommended_at_k = recommended[:k]  # Ambil k rekomendasi pertama
    relevant_set = set(relevant)  # Set dari ground truth relevan
    
    # Menghitung recall
    return len(set(recommended_at_k).intersection(relevant_set)) / len(relevant_set) if relevant_set else 0

In [389]:
def ndcg_at_k(recommended, relevant, k=5):
    recommended_at_k = recommended[:k]
    
    # Menghitung NDCG
    dcg = sum(1 / np.log2(i + 2) for i in range(len(recommended_at_k)) if recommended_at_k[i] in relevant)
    idcg = sum(1 / np.log2(i + 2) for i in range(min(len(relevant), k)) if relevant[i] in recommended_at_k)
    
    return dcg / idcg if idcg > 0 else 0

In [409]:
def evaluate_recommendations(keyword, ground_truth, titles, descriptions, top_n=5):
    # Mendapatkan rekomendasi dari kata kunci
    recommended_datasets = recommend_datasets_by_keyword(keyword, titles, descriptions, top_n)  # Ambil rekomendasi
    
    # Menghitung evaluasi
    recommended_titles = [r[0] for r in recommended_datasets]  # Ambil hanya judul rekomendasi
    
    # Evaluasi
    precision = precision_at_k(recommended_titles, ground_truth, k=top_n)
    recall = recall_at_k(recommended_titles, ground_truth, k=top_n)
    ndcg = ndcg_at_k(recommended_titles, ground_truth, k=top_n)
    
    # Menampilkan hasil evaluasi
    print(f"Presisi: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"NDCG: {ndcg:.4f}")

In [None]:
ground_truth = [
    "Air Quality in Yogyakarta, Indonesia (2021)"
]

In [397]:
evaluate_recommendations("pollution", ground_truth, titles, descriptions, top_n=5)

Presisi: 1.0000
Recall: 1.0000
NDCG: 1.0000


In [411]:
ground_truth_saham = [
    "Dataset Saham Indonesia / Indonesia Stock Dataset",  # Referensi: muamkh/ihsgstockdata
    "Saham IDX"
]

In [412]:
evaluate_recommendations("saham", ground_truth_saham, titles, descriptions, top_n=5)

Presisi: 0.5000
Recall: 0.5000
NDCG: 1.0000


In [408]:
ground_truth_indonesia = [
    "Bahasa Isyarat Indonesia (BISINDO) Alphabets",
    "Air Quality in Yogyakarta, Indonesia (2021)",
    "Population of Indonesia (2050-1955)",
    "Data Wilayah Republic Indonesia",
    "Dataset Tanaman Padi Sumatera, Indonesia"
]

In [413]:
evaluate_recommendations("indonesia", ground_truth_indonesia, titles, descriptions, top_n=5)

Presisi: 1.0000
Recall: 1.0000
NDCG: 1.0000


In [418]:
import pickle

# Simpan model Word2Vec
with open("word2vec_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Simpan hasil rekomendasi (jika diinginkan)
with open("recommendations.pkl", "wb") as f:
    pickle.dump(recommended_datasets, f)