In [58]:
import pandas as pd
import numpy as np

from UtilityFunctions.get_data_path import get_path
from UtilityFunctions.flatten_dict import flatten_dictionary
from UtilityFunctions.run_query import run_query

In [15]:
from rdflib import Namespace, Graph, URIRef, Literal, BNode
from rdflib.namespace import RDFS, XSD
import gzip

triple_file = gzip.open(filename=f"test.nt.gz", mode="at", encoding="utf-8")

G = Graph()
G.add(triple=(URIRef("Hello"), URIRef("hasRating"), Literal(1.5, datatype=XSD.float)))
G.add(triple=(URIRef("Hello"), URIRef("hasRating"), Literal(1.5, datatype=XSD.decimal)))

triple_file.write(G.serialize(format='nt'))
triple_file.close()

In [None]:
businesses = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)
users = pd.read_json(get_path("yelp_academic_dataset_user.json"), lines=True)

### How many businesses have been reviewed?

In [2]:
sparql_query = """

SELECT COUNT(?business)
WHERE {
    ?business rdfs:Class schema:LocalBusiness .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.type,callret-0.datatype,callret-0.value
0,typed-literal,http://www.w3.org/2001/XMLSchema#integer,150346


In [3]:
len(businesses["business_id"].unique())

150346

### How many businesses have, on average, a rating of 4.5?

In [148]:
sparql_query = """

SELECT ?business ?rating STR(?rating)
FROM <http://www.yelpkg.com/business>
WHERE {
    ?business rdfs:Class schema:LocalBusiness .
    ?business schema:starRating ?rating .
}
LIMIT 10

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,business.type,business.value,rating.type,rating.datatype,rating.value,callret-2.type,callret-2.value
0,uri,https://example.org/business_id/-a7VXX0-V9LgWMFrq90iNA,typed-literal,http://www.w3.org/2001/XMLSchema#float,3.5,literal,3.5
1,uri,https://example.org/business_id/0UqeZTDBdV0uY3wesbLvYQ,typed-literal,http://www.w3.org/2001/XMLSchema#float,4.0,literal,4.0
2,uri,https://example.org/business_id/1jx1sfgjgVg0nM6n3p0xWA,typed-literal,http://www.w3.org/2001/XMLSchema#float,4.5,literal,4.5
3,uri,https://example.org/business_id/2MAQeAqmD8enCT2ZYqUgIQ,typed-literal,http://www.w3.org/2001/XMLSchema#float,4.0,literal,4.0
4,uri,https://example.org/business_id/2O2K6SXPWv56amqxCECd4w,typed-literal,http://www.w3.org/2001/XMLSchema#float,4.5,literal,4.5
5,uri,https://example.org/business_id/5Z8iBpJMmOMz6G_7oVnzRA,typed-literal,http://www.w3.org/2001/XMLSchema#float,2.5,literal,2.5
6,uri,https://example.org/business_id/7omkeqEv-kKMIn9kmOR6Lw,typed-literal,http://www.w3.org/2001/XMLSchema#float,5.0,literal,5.0
7,uri,https://example.org/business_id/7xc84taj12pt-RtZhAfSkQ,typed-literal,http://www.w3.org/2001/XMLSchema#float,5.0,literal,5.0
8,uri,https://example.org/business_id/8n93L-ilMAsvwUatarykSg,typed-literal,http://www.w3.org/2001/XMLSchema#float,3.0,literal,3.0
9,uri,https://example.org/business_id/9RGR4_r4PJLTooNmscUE_A,typed-literal,http://www.w3.org/2001/XMLSchema#float,4.0,literal,4.0


In [19]:
sparql_query = """

select ?x, (str(?x) as ?sx) {
  values ?x { 
    "1.11111"^^xsd:decimal
    "1.11115"^^xsd:decimal
    "1.11119"^^xsd:decimal
  }
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,x.value,sx.value
0,1.11111,1.11111
1,1.11115,1.11115
2,1.11119,1.11119


In [24]:
sparql_query = """

SELECT ?rating
FROM <http://www.yelpkg.com/business>
WHERE {
    ?business rdfs:Class schema:LocalBusiness .
    ?business schema:starRating ?rating .
    FILTER (?rating > 0 && ?rating <= 5)
}
GROUP BY xsd:decimal(?rating)
ORDER BY ?rating
LIMIT 10

"""

run_query(query=sparql_query, as_dataframe=True)

QueryBadFormed: QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b'Virtuoso 37000 Error SP031: SPARQL compiler: The XPATH function http://www.w3.org/2001/XMLSchema#decimal() can handle only 1 arguments but the call provides 2\n\nSPARQL query:\ndefine sql:big-data-const 0\n#output-format:application/sparql-results+json\n\n    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> \n    PREFIX schema: <https://schema.org/> \n    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> \n    \n\nSELECT ?rating\nFROM <http://www.yelpkg.com/business>\nWHERE {\n    ?business rdfs:Class schema:LocalBusiness .\n    ?business schema:starRating ?rating .\n    FILTER (?rating > 0 && ?rating <= 5)\n}\nGROUP BY xsd:decimal(?rating, 2)\nORDER BY ?rating\nLIMIT 10\n\n'

In [95]:
sparql_query = """

SELECT ROUND(?rating * 10) / 10 AS ?round_rating COUNT(?rating) AS ?count_businesses
FROM <http://www.yelpkg.com/business>
WHERE {
    ?business rdfs:Class schema:LocalBusiness .
    ?business schema:starRating ?rating .
}
GROUP BY (ROUND(?rating * 10) / 10)
ORDER BY (ROUND(?rating * 10) / 10)

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,round_rating.type,round_rating.datatype,round_rating.value,count_businesses.type,count_businesses.datatype,count_businesses.value
0,typed-literal,http://www.w3.org/2001/XMLSchema#float,1.0,typed-literal,http://www.w3.org/2001/XMLSchema#integer,1986
1,typed-literal,http://www.w3.org/2001/XMLSchema#float,1.5,typed-literal,http://www.w3.org/2001/XMLSchema#integer,4932
2,typed-literal,http://www.w3.org/2001/XMLSchema#float,2.0,typed-literal,http://www.w3.org/2001/XMLSchema#integer,9527
3,typed-literal,http://www.w3.org/2001/XMLSchema#float,2.5,typed-literal,http://www.w3.org/2001/XMLSchema#integer,14316
4,typed-literal,http://www.w3.org/2001/XMLSchema#float,3.0,typed-literal,http://www.w3.org/2001/XMLSchema#integer,18453
5,typed-literal,http://www.w3.org/2001/XMLSchema#float,3.5,typed-literal,http://www.w3.org/2001/XMLSchema#integer,26519
6,typed-literal,http://www.w3.org/2001/XMLSchema#float,4.0,typed-literal,http://www.w3.org/2001/XMLSchema#integer,31125
7,typed-literal,http://www.w3.org/2001/XMLSchema#float,4.5,typed-literal,http://www.w3.org/2001/XMLSchema#integer,27181
8,typed-literal,http://www.w3.org/2001/XMLSchema#float,5.0,typed-literal,http://www.w3.org/2001/XMLSchema#integer,16307


In [47]:
businesses.groupby('stars')['stars'].count()

stars
1.0     1986
1.5     4932
2.0     9527
2.5    14316
3.0    18453
3.5    26519
4.0    31125
4.5    27181
5.0    16307
Name: stars, dtype: int64

### What is the average rating across businesses?

In [11]:
sparql_query = """

SELECT AVG(?rating) as ?averagerating
FROM <http://www.yelpkg.com/business>
WHERE {
    ?business rdfs:Class schema:LocalBusiness .
    ?business schema:starRating ?rating .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,averagerating.type,averagerating.datatype,averagerating.value
0,typed-literal,http://www.w3.org/2001/XMLSchema#decimal,3.59672355766033


In [12]:
businesses['stars'].mean()

3.5967235576603303

### How many businesses have been reviewed in New York, NY?

In [129]:
sparql_query = """

SELECT ?state ?city COUNT(?business) AS ?count_business
FROM <http://www.yelpkg.com/business>
WHERE {
    ?business rdfs:Class schema:LocalBusiness .
    ?business schema:addressRegion ?state .
    ?business schema:location ?city .
}
GROUP BY ?state ?city
ORDER BY DESC(?count_business)
LIMIT 10

"""

run_query(query=sparql_query, as_dataframe=True)[["state.value", "city.value", "count_business.value"]]

Unnamed: 0,state.value,city.value,count_business.value
0,PA,Philadelphia,14567
1,AZ,Tucson,9249
2,FL,Tampa,9048
3,IN,Indianapolis,7540
4,TN,Nashville,6968
5,LA,New Orleans,6208
6,NV,Reno,5932
7,AB,Edmonton,5054
8,MO,Saint Louis,4827
9,CA,Santa Barbara,3829


In [126]:
businesses.groupby(["state", "city"])["business_id"].count().sort_values(ascending=False)[0:10]

state  city         
PA     Philadelphia     14567
AZ     Tucson            9249
FL     Tampa             9048
IN     Indianapolis      7540
TN     Nashville         6968
LA     New Orleans       6208
NV     Reno              5932
AB     Edmonton          5054
MO     Saint Louis       4827
CA     Santa Barbara     3829
Name: business_id, dtype: int64

### How many restaurants have been reviewed?

In [145]:
sparql_query = """

SELECT ?category COUNT(?business) AS ?count_business
WHERE {
    ?business rdfs:Class schema:LocalBusiness .
    ?business schema:category ?category .
}
GROUP BY ?category
ORDER BY DESC(?count_business)
LIMIT 5
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,category.type,category.datatype,category.value,count_business.type,count_business.datatype,count_business.value
0,typed-literal,http://www.w3.org/2001/XMLSchema#string,Restaurants,typed-literal,http://www.w3.org/2001/XMLSchema#integer,52268
1,typed-literal,http://www.w3.org/2001/XMLSchema#string,Food,typed-literal,http://www.w3.org/2001/XMLSchema#integer,27781
2,typed-literal,http://www.w3.org/2001/XMLSchema#string,Shopping,typed-literal,http://www.w3.org/2001/XMLSchema#integer,24395
3,typed-literal,http://www.w3.org/2001/XMLSchema#string,Home Services,typed-literal,http://www.w3.org/2001/XMLSchema#integer,14356
4,typed-literal,http://www.w3.org/2001/XMLSchema#string,Beauty & Spas,typed-literal,http://www.w3.org/2001/XMLSchema#integer,14292


In [146]:
def str_split(string):
    if isinstance(string, str):
        return string.split(", ")
    else:
        return string


businesses["categories"] = businesses["categories"].apply(str_split)

category_biz_count = {}
for i in range(len(businesses)):
    if businesses["categories"][i]:
        for category in businesses["categories"][i]:
            category_biz_count[category] = category_biz_count.get(category, 0) + 1

category_biz_count["Beauty & Spas"]

14292

### During which hours of Monday is business X most visited?

### How many people have written a review on Yelp?

In [37]:
sparql_query = """

SELECT COUNT(?users)
FROM <http://www.yelpkg.com/user>
WHERE {
    ?user rdfs:Class schema:Person .
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.type,callret-0.datatype,callret-0.value
0,typed-literal,http://www.w3.org/2001/XMLSchema#integer,0


In [38]:
len(users["user_id"].unique())

1987897

### How many friends does a user have on average?

### How many reviews does a user make on average?