In [1]:
sparql_endpoint = "http://localhost:9999/blazegraph/namespace/final-test/sparql"

In [2]:
import os
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

In [7]:
def get_sparql_data(sparql_query):
    print("SPARQL ENDPOINT로부터 데이터를 불러옵니다.")
    endpoint = SPARQLWrapper(sparql_endpoint)
    endpoint.setQuery(sparql_query)
    endpoint.setReturnFormat(JSON)
    result = endpoint.query().convert()
    return pd.json_normalize(result["results"]["bindings"])

def refined_keywords(text):
    refined_list = []
    for n in text.split(','):
        if n != '':
            refined_list.append(n.strip())
    return refined_list

In [8]:
def kistib_df():
    df = get_sparql_data(
        """
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        PREFIX dct: <http://purl.org/dc/terms/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX dcat: <http://www.w3.org/ns/dcat#>

        SELECT DISTINCT ?uri ?title ?desc ?theme ?keyword ?publisher ?contact ?url ?format ?issuedDate ?modifiedDate
        WHERE { 
            ?uri a dcat:Dataset ;
                dct:title ?title ;
                dcat:keyword ?keyword .
        
            OPTIONAL { ?uri dct:description ?desc . }
            OPTIONAL { ?uri dcat:theme/rdfs:label ?theme . }
            OPTIONAL { ?uri dcat:accessURL ?url . }
            OPTIONAL { ?uri dct:publisher ?publisher . }
            OPTIONAL { ?uri dcat:contactPoint ?contact . }
            OPTIONAL { ?uri dct:format/rdfs:label ?format . }
            OPTIONAL { ?uri dct:issued ?issuedDate . }
            OPTIONAL { ?uri dct:modified ?modifiedDate . }
        }
        """
    )

    # value 컬럼만 넣기
    columns = [n for n in df.columns if n.find(".value") != -1]
    df = df[columns]

    # 컬럼명에서 .value 삭제
    df = df.rename(columns=lambda col: col.replace(".value", ""))
    df = df.fillna("None")
    #print(df.head())
    print(df.columns)

    df['keyword'] = df['keyword'].apply(refined_keywords)

    df.to_csv("final-data/csv/disaster-dataset.csv", encoding="utf-8", index=False)
    
    return df

In [9]:
dis_df = kistib_df()
len(dis_df)

SPARQL ENDPOINT로부터 데이터를 불러옵니다.
Index(['uri', 'title', 'desc', 'theme', 'keyword', 'publisher', 'contact',
       'url', 'issuedDate', 'modifiedDate'],
      dtype='object')


826