In [4]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.run_query import run_query

In [5]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)

### Total number of triples

In [4]:
sparql_query = """
SELECT COUNT(*) as ?totalTriples
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
}
"""
run_query(sparql_query, as_dataframe=True)

Unnamed: 0,totalTriples.value
0,244081830


### Unique subjects

In [3]:
sparql_query = """
SELECT (COUNT(DISTINCT ?s) as ?numSubjects)
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
}
"""
run_query(sparql_query, as_dataframe=True)

Unnamed: 0,numSubjects.value
0,10486690


### Unique predicates

In [2]:
sparql_query = """
SELECT (COUNT(DISTINCT ?p) as ?numPredicates)
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
}
"""
run_query(sparql_query, as_dataframe=True)

Unnamed: 0,numPredicates.value
0,144


### Unique objects

In [3]:
sparql_query = """
SELECT COUNT(DISTINCT ?o) as ?numObjects
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
}
"""
run_query(sparql_query, as_dataframe=True)

Unnamed: 0,numObjects.value
0,61449476


### Most prevalent predicate 

In [14]:
sparql_query = """
SELECT ?p COUNT(?p) as ?predicateCount
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
}
GROUP BY ?p
ORDER BY DESC(?predicateCount)
LIMIT 2
"""
run_query(sparql_query, as_dataframe=True)


Unnamed: 0,p.value,predicateCount.value
0,https://schema.org/knows,105225474
1,https://schema.org/checkinTime,13353332


### Most prevalent class

In [15]:
sparql_query = """
SELECT ?class (COUNT(DISTINCT ?s) as ?numSubjects)
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s rdfs:Class ?class .
}
GROUP BY ?class
ORDER BY DESC(?numSubjects)
LIMIT 2
"""

run_query(sparql_query, as_dataframe=True)

Unnamed: 0,class.value,numSubjects.value
0,https://schema.org/UserReview,6990280
1,https://schema.org/Person,1987897


### Average in-degree

https://stackoverflow.com/questions/24270532/how-to-calculate-maximum-degree-of-a-directed-graph-using-sparql

In [6]:
sparql_query = """SELECT (AVG(?indegree) AS ?avgIndegree) (AVG(?outdegree) AS ?avgOutdegree)
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  {
    SELECT ?node (COUNT(?in) AS ?indegree)
    FROM <http://www.yelpkg.com/yelp_kg>
    WHERE {
      ?in ?p ?node .
    }
    GROUP BY ?node
  }
  {
    SELECT ?node (COUNT(?out) AS ?outdegree)
    FROM <http://www.yelpkg.com/yelp_kg>
    WHERE {
      ?node ?p ?out .
    }
    GROUP BY ?node
  }
}"""
run_query(sparql_query, as_dataframe=True)

Unnamed: 0,avgIndegree.value,avgOutdegree.value
0,3.805331513937019,6.337879517583192


In [14]:
sparql_query = """
SELECT ?o (*) as ?indegree
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o
}
GROUP BY ?o
ORDER BY DESC(?indegree)
LIMIT 10
"""

run_query(sparql_query, as_dataframe=True)

Unnamed: 0,avgIndegree.value
0,2.562649956151269


### Average out-degree

In [23]:
sparql_query = """
SELECT ?s COUNT(DISTINCT ?o) as ?outdegree
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
}
GROUP BY ?s
ORDER BY DESC(?outdegree)
LIMIT 10
"""

outdeg = run_query(sparql_query, as_dataframe=True)
outdeg

Unnamed: 0,s.value,outdegree.value
0,https://purl.archive.org/purl/yelp/yelp_entities#business_id/-QI8Qi8XWH3D8y8ethnajA,52148
1,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,40123
2,https://purl.archive.org/purl/yelp/yelp_entities#business_id/Eb1XmmLWyt_way5NNZ7-Pw,37573
3,https://purl.archive.org/purl/yelp/yelp_entities#business_id/c_4c5rJECZSfNgFj7frwHQ,37535
4,https://purl.archive.org/purl/yelp/yelp_entities#business_id/4i4kmYm9wgSNyF1b6gKphg,31183
5,https://purl.archive.org/purl/yelp/yelp_entities#business_id/8O35ji_yOMVJmZ6bl96yhQ,29616
6,https://purl.archive.org/purl/yelp/yelp_entities#business_id/VQcCL9PiNL_wkGf-uF3fjg,28944
7,https://purl.archive.org/purl/yelp/yelp_entities#business_id/ac1AeYqs8Z4_e2X5M3if2A,21554
8,https://purl.archive.org/purl/yelp/yelp_entities#business_id/QTbahs-GVuWYL5yfdjH34A,21499
9,https://purl.archive.org/purl/yelp/yelp_entities#business_id/ytynqOUb3hjKeJfRj5Tshw,18663


In [22]:
sparql_query = """
SELECT AVG(?outdegree) as ?avgOutdegree
WHERE{
  SELECT ?s COUNT(DISTINCT ?o) as ?outdegree
  FROM <http://www.yelpkg.com/yelp_kg>
  WHERE {
    ?s ?p ?o .
  }
  GROUP BY ?s
}
"""

run_query(sparql_query, as_dataframe=True)

Unnamed: 0,avgOutdegree.value
0,19.899062049130848


In [13]:
sparql_query = """
SELECT COUNT(DISTINCT ?p) as ?numPredicates COUNT(DISTINCT ?o) as ?numObjects
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
  VALUES ?s {<https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg>}
}
"""

run_query(sparql_query, as_dataframe=True)

Unnamed: 0,numPredicates.value,numObjects.value
0,37,40123


In [31]:
sparql_query = """
SELECT ?s ?p COUNT(?o) as ?count
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
  FILTER (?s = <https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg>)
}
GROUP BY ?s ?p
ORDER BY DESC(?count)
"""

run_query(sparql_query, as_dataframe=True)

Unnamed: 0,s.value,p.value,count.value
0,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://schema.org/checkinTime,40092
1,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://schema.org/category,9
2,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://schema.org/aggregateRating,1
3,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://purl.archive.org/purl/yelp/ontology#WiFi,1
4,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://purl.archive.org/purl/yelp/ontology#hasAmbience,1
5,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://purl.archive.org/purl/yelp/ontology#BikeParking,1
6,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://purl.archive.org/purl/yelp/ontology#hasBusinessParking,1
7,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://purl.archive.org/purl/yelp/ontology#hasGoodForMeal,1
8,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://schema.org/publicAccess,1
9,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://purl.archive.org/purl/yelp/ontology#RestaurantsAttire,1


### Hop Diagram

In [2]:
sparql_query = """
SELECT ?s COUNT(?o1) as ?FirstHop COUNT(?o2) as ?SecondHop
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o1 .
  OPTIONAL { ?o1 ?p2 ?o2 } . 
}
GROUP BY ?s
"""

hops = run_query(sparql_query, as_dataframe=True)
hops

Unnamed: 0,s.value,FirstHop.value,SecondHop.value
0,https://purl.archive.org/purl/yelp/ontology#6O...,4,4
1,https://purl.archive.org/purl/yelp/ontology#op...,4,4
2,https://purl.archive.org/purl/yelp/ontology#KP...,4,4
3,https://purl.archive.org/purl/yelp/yelp_entiti...,19,19
4,Nb4920c3cc8d04c97960c915fe271fc51,2,2
...,...,...,...
1048571,https://purl.archive.org/purl/yelp/yelp_entiti...,21,21
1048572,https://purl.archive.org/purl/yelp/yelp_entiti...,25,25
1048573,https://purl.archive.org/purl/yelp/yelp_entiti...,20,20
1048574,https://purl.archive.org/purl/yelp/yelp_entiti...,19,19


In [None]:
fig = go.Figure()

### Extra queries

In [19]:
sparql_query = """
SELECT ?p (COUNT(DISTINCT ?s) as ?numSubjects) (COUNT(?o) as ?numObjects)
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
}
GROUP BY ?p
"""
num_sub_obj_group_pred = run_query(sparql_query, as_dataframe=True)

In [None]:
?p1/?p2/?p3

In [30]:
pd.set_option('display.max_rows', 500)
num_sub_obj_group_pred.sort_values(by=['p.value'], ascending=False)

Unnamed: 0,p.value,numSubjects.value,numObjects.value
37,https://www.w3.org/2004/02/skos/core#narrowMatch,41,55
115,https://www.w3.org/2004/02/skos/core#exactMatch,1304,1580
25,https://schema.org/url,9128523,9128523
108,https://schema.org/reviewCount,2138243,2138243
83,https://schema.org/publicAccess,150346,150346
92,https://schema.org/postalCode,150346,150346
102,https://schema.org/longitude,150346,150346
94,https://schema.org/location,141799,141799
112,https://schema.org/legalName,2138211,2138211
84,https://schema.org/latitude,150346,150346


In [8]:
sparql_query = """
SELECT ?class (COUNT(DISTINCT ?s) as ?numSubjects)
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s rdfs:Class ?class .
}
GROUP BY ?class
"""
num_sub_group_class = run_query(sparql_query, as_dataframe=True)

In [22]:
pd.set_option('display.max_colwidth', 500)
num_sub_group_class.sort_values(by=['class.value'], ascending=False)

Unnamed: 0,class.value,numSubjects.value
5,https://schema.org/UserReview,6990280
1,https://schema.org/Person,1987897
3,https://schema.org/ParkingFacility,88814
8,https://schema.org/OpeningHoursSpecification,127123
11,https://schema.org/LocationFeatureSpecification,57962
4,https://schema.org/LocalBusiness,150346
9,https://schema.org/FoodService,28784
2,https://purl.archive.org/purl/yelp/ontology#yelpCategory,1318
7,https://purl.archive.org/purl/yelp/ontology#schemaCategory,229
6,https://purl.archive.org/purl/yelp/ontology#datasetCategory,1311


In [7]:
sparql_query = """
SELECT ?p ?class (COUNT(DISTINCT ?s) as ?numSubjects)
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s rdfs:Class ?class .
  ?s ?p ?o .
}
GROUP BY ?p ?class
"""
num_sub_group_pred_class = run_query(sparql_query, as_dataframe=True)

In [34]:
num_sub_group_pred_class.sort_values(by=['class.value'], ascending=False)

Unnamed: 0,p.value,class.value,numSubjects.value
40,https://schema.org/dateCreated,https://schema.org/UserReview,6990280
34,https://schema.org/about,https://schema.org/UserReview,6990280
17,https://schema.org/author,https://schema.org/UserReview,6990280
13,https://schema.org/url,https://schema.org/UserReview,6990280
14,http://www.w3.org/2000/01/rdf-schema#Class,https://schema.org/UserReview,6990280
3,https://schema.org/description,https://schema.org/UserReview,6990280
4,https://schema.org/aggregateRating,https://schema.org/UserReview,6990280
18,https://schema.org/url,https://schema.org/Person,1987897
15,https://schema.org/dateCreated,https://schema.org/Person,1987897
21,http://www.w3.org/2000/01/rdf-schema#Class,https://schema.org/Person,1987897


In [37]:
sparql_query = """
SELECT (COUNT(DISTINCT ?s) as ?numSubjects) (COUNT(DISTINCT ?checkin) as ?numcheckins)
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s schema:checkinTime ?checkin .
}
"""
checkin = run_query(sparql_query, as_dataframe=True)

In [38]:
checkin

Unnamed: 0,numSubjects.value,numcheckins.value
0,131930,12887702


In [15]:
sparql_query = """
SELECT DISTINCT ?p ?category
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
   ?category rdfs:Class <https://purl.archive.org/purl/yelp/ontology#yelpCategory> .
   ?var ?p ?category .
}
"""
categories = run_query(sparql_query, as_dataframe=True)

In [16]:
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 500)
categories

Unnamed: 0,p.value,category.value
0,https://www.w3.org/2004/02/skos/core#narrowMatch,https://purl.archive.org/purl/yelp/business_categories#Himalayan
1,https://www.w3.org/2004/02/skos/core#narrowMatch,https://purl.archive.org/purl/yelp/business_categories#Hypnosis
2,https://www.w3.org/2004/02/skos/core#narrowMatch,https://purl.archive.org/purl/yelp/business_categories#Hypnotherapy
3,https://www.w3.org/2004/02/skos/core#narrowMatch,https://purl.archive.org/purl/yelp/business_categories#Nepalese
4,https://www.w3.org/2004/02/skos/core#narrowMatch,https://purl.archive.org/purl/yelp/business_categories#Cajun
...,...,...
1313,https://www.w3.org/2004/02/skos/core#exactMatch,https://purl.archive.org/purl/yelp/business_categories#fencing_club
1314,https://www.w3.org/2004/02/skos/core#exactMatch,https://purl.archive.org/purl/yelp/business_categories#dialysis_clinic
1315,https://www.w3.org/2004/02/skos/core#exactMatch,https://purl.archive.org/purl/yelp/business_categories#hospitalist
1316,https://www.w3.org/2004/02/skos/core#exactMatch,https://purl.archive.org/purl/yelp/business_categories#natural_gas_supplier


In [18]:
sparql_query = """
SELECT DISTINCT ?category
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
   ?category rdfs:Class <https://purl.archive.org/purl/yelp/ontology#schemaCategory> .
}
"""
run_query(sparql_query, as_dataframe=True)

Unnamed: 0,category.value
0,https://schema.org/LocalBusiness
1,https://schema.org/MovieTheater
2,https://schema.org/InfectiousDisease
3,https://schema.org/ProductModel
4,https://schema.org/FireStation
5,https://schema.org/Apartment
6,https://schema.org/AutomotiveBusiness
7,https://schema.org/BowlingAlley
8,https://schema.org/Campground
9,https://schema.org/Courthouse
