In [1]:
import pandas as pd
import numpy as np

from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.run_query import run_query

___
___
___
# **DO NOT RUN ALL LINES BELOW. SOME YELP ENTITIES ARE VERY LARGE, AND YOU MIGHT RUN OUT OF RAM.**

In [14]:
business = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)
business['categories'] = business['categories'].str.split(', ', expand=False)

In [3]:
users = pd.read_json(get_path("yelp_academic_dataset_user.json"), lines=True)

In [15]:
reviews = pd.read_json(get_path("yelp_academic_dataset_review.json"), lines=True)
reviews = pd.merge(left=reviews, right=business, how='left', left_on='business_id', right_on='business_id')
#reviews = pd.merge(left=reviews, right=users, how='left', left_on='user_id', right_on='user_id')

In [None]:
checkins = pd.read_json(get_path("yelp_academic_dataset_checkin.json"), lines=True)
checkins['date'] = checkins['date'].str.split(', ', expand=False)
checkins = checkins.explode('date')

In [3]:
tips = pd.read_json(get_path("yelp_academic_dataset_tip.json"), lines=True)

___
___
___
### How many different types of businesses are defined in Yelp?

In [2]:
sparql_query ="""

SELECT COUNT(DISTINCT(?category))
WHERE {
    ?business schema:category ?category .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.value
0,1311


In [5]:
unique_categories = {category for sublist in business['categories'] if sublist for category in sublist}
len(unique_categories)

1311

### How many businesses of type "Restaurants" exist?

In [3]:
sparql_query = """

SELECT COUNT(DISTINCT(?business)) AS ?numberRestaurants
WHERE {
    ?business schema:category yelpcat:Restaurants .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,numberRestaurants.value
0,52268


In [7]:
counter = 0
for row in business['categories']:
    if row is not None and "Restaurants" in row:
        counter += 1
print(counter)

52268


### How many businesses of type Restaurants have been reviewed?

In [12]:
sparql_query = """

SELECT COUNT(DISTINCT(?business))
WHERE {
    ?review schema:about ?business .
    ?business schema:category yelpcat:Restaurants .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.value
0,0


In [None]:
sparql_query = """

SELECT ?o
WHERE {
    ?review schema:about ?o .
}

"""

run_query(query=sparql_query, as_dataframe=True)

In [13]:
review_unique_business = reviews.drop_duplicates(subset=['business_id'])
mask = review_unique_business['categories'].apply(lambda x: "Restaurants" in x)
len(review_unique_business[mask])

NameError: name 'reviews' is not defined

### How many businesses have been reviewed?

In [3]:
sparql_query = """

SELECT COUNT(DISTINCT(?business))
WHERE {
    ?review schema:about ?business .
    ?review rdfs:Class schema:UserReview .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.value
0,150346


In [32]:
review_unique_business = reviews.drop_duplicates(subset=['business_id'])
len(review_unique_business)

150346

### How many businesses have, on average, a rating of 4.5?

In [23]:
sparql_query = """
SELECT COUNT(DISTINCT(?business))
WHERE {
    ?business schema:aggregateRating ?rating .
    FILTER (?rating = 4.5) .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.value
0,27181


In [24]:
business.groupby('stars')['stars'].count()

stars
1.0     1986
1.5     4932
2.0     9527
2.5    14316
3.0    18453
3.5    26519
4.0    31125
4.5    27181
5.0    16307
Name: stars, dtype: int64

### What is the average rating across businesses?

In [11]:
sparql_query = """
SELECT AVG(?rating) as ?averagerating
WHERE {
    ?business schema:aggregateRating ?rating .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,averagerating.type,averagerating.datatype,averagerating.value
0,typed-literal,http://www.w3.org/2001/XMLSchema#decimal,3.59672355766033


In [25]:
business['stars'].mean()

3.5967235576603303

### How many businesses have been reviewed in Santa Barbara, CA?
Svært, når vi ikke har byer med i Yelp længere.

In [8]:
sparql_query = """
SELECT ?state ?city COUNT(?business) AS ?count_business
WHERE {
    ?business schema:category ?category .
    ?business schema:addressRegion ?state .
    ?business schema:location ?city .
    FILTER(?city = "Santa Barbara")
}
GROUP BY ?state ?city
ORDER BY DESC(?count_business)
LIMIT 10

"""

run_query(query=sparql_query, as_dataframe=True)

ConnectionResetError: [Errno 104] Connection reset by peer

In [31]:
business_santa_barbara = business[business['city'] == "Santa Barbara"]
len(business_santa_barbara)

3829

Which business is the most visited on day X?

In [32]:
sparql_query = """
SELECT ?business ?year ?month ?day COUNT(?visit)
WHERE {
    ?business schema:checkinTime ?visit .
    BIND (day(?visit)  as ?day)
    BIND (month(?visit) as ?month)
    BIND (year(?visit) as ?year)
}
GROUP BY ?business ?year ?month ?day
ORDER BY DESC(COUNT(?visit))
LIMIT 5

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,business.value,year.value,month.value,day.value,callret-4.value
0,https://example.org/business_id/CySqUcNz8oPiQTu4EXTnig,2016,6,25,457
1,https://example.org/business_id/g50ImmCX3WY3koEDIzoKxg,2015,8,30,285
2,https://example.org/business_id/qfWWx0dVo1UuAhRfh03Dyw,2016,8,28,268
3,https://example.org/business_id/g50ImmCX3WY3koEDIzoKxg,2016,8,28,263
4,https://example.org/business_id/FBBeJO50xZiNIo3oFhAFRA,2017,7,29,251


In [17]:
checkins["Day"] = checkins["date"].apply(lambda x: x.split("-")[2][:2])
checkins["Month"] = checkins["date"].apply(lambda x: x.split("-")[1])
checkins["Year"] = checkins["date"].apply(lambda x: x.split("-")[0])

checkins.value_counts(subset=["business_id", "Day", "Month", "Year"], sort=True, ascending=False).head(5)

business_id             Day  Month  Year
CySqUcNz8oPiQTu4EXTnig  25   06     2016    465
g50ImmCX3WY3koEDIzoKxg  30   08     2015    287
qfWWx0dVo1UuAhRfh03Dyw  28   08     2016    270
g50ImmCX3WY3koEDIzoKxg  28   08     2016    264
FBBeJO50xZiNIo3oFhAFRA  29   07     2017    254
dtype: int64

Which are the top 10 most visisted businesses?

In [18]:
sparql_query = """
SELECT ?business COUNT(?visit) AS ?count_visits
WHERE {
    ?business schema:checkinTime ?visit .
}
GROUP BY ?business 
ORDER BY DESC(COUNT(?visit))
LIMIT 10

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,business.value,count_visits.value
0,https://example.org/business_id/-QI8Qi8XWH3D8y8ethnajA,52129
1,https://example.org/business_id/FEXhWNCMkv22qG04E83Qjg,40092
2,https://example.org/business_id/Eb1XmmLWyt_way5NNZ7-Pw,37553
3,https://example.org/business_id/c_4c5rJECZSfNgFj7frwHQ,37511
4,https://example.org/business_id/4i4kmYm9wgSNyF1b6gKphg,31163
5,https://example.org/business_id/8O35ji_yOMVJmZ6bl96yhQ,29599
6,https://example.org/business_id/VQcCL9PiNL_wkGf-uF3fjg,28917
7,https://example.org/business_id/ac1AeYqs8Z4_e2X5M3if2A,21527
8,https://example.org/business_id/QTbahs-GVuWYL5yfdjH34A,21478
9,https://example.org/business_id/ytynqOUb3hjKeJfRj5Tshw,18604


In [20]:
checkins.value_counts(subset=["business_id"], sort=True, ascending=False).head(10)

business_id           
-QI8Qi8XWH3D8y8ethnajA    52144
FEXhWNCMkv22qG04E83Qjg    40109
Eb1XmmLWyt_way5NNZ7-Pw    37562
c_4c5rJECZSfNgFj7frwHQ    37518
4i4kmYm9wgSNyF1b6gKphg    31168
8O35ji_yOMVJmZ6bl96yhQ    29606
VQcCL9PiNL_wkGf-uF3fjg    28927
ac1AeYqs8Z4_e2X5M3if2A    21542
QTbahs-GVuWYL5yfdjH34A    21487
ytynqOUb3hjKeJfRj5Tshw    18615
dtype: int64

How many people have written a review or tip on Yelp? TO DO: review author user, user author tip giver ikke mening

In [25]:
sparql_query = """
SELECT DISTINCT COUNT(?users) AS ?count_users
WHERE {
    ?user rdfs:Class schema:Person .
    ?review schema:author ?user .
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.value
0,0


In [7]:
reviews.drop_duplicates(subset=['user_id']).shape[0]

1987929

How many users have 10 friends?

In [19]:
sparql_query = """
SELECT ?user (COUNT(?friend) as ?numberOfFriends)
WHERE {
    ?user rdfs:Class schema:Person .
    ?user schema:knows ?friend .
}
HAVING { ?numberOfFriends = 10 }
GROUP BY ?user
"""

run_query(query=sparql_query, as_dataframe=True)

QueryBadFormed: QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b"Virtuoso 37000 Error SP030: SPARQL compiler, line 14: syntax error at '{' before '?numberOfFriends'\n\nSPARQL query:\ndefine sql:big-data-const 0\n#output-format:application/sparql-results+json\n\n    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> \n    PREFIX schema: <https://schema.org/> \n    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> \n    PREFIX yelp_category: <https://www.yelp.com/category/>\n    \nSELECT ?user (COUNT(?friend) as ?numberOfFriends)\nWHERE {\n    ?user rdfs:Class schema:Person .\n    ?user schema:knows ?friend .\n}\nHAVING { ?numberOfFriends = 10 }\nGROUP BY ?user\n"

In [11]:
users["amountFriends"] = users["friends"].apply(lambda x: len(x.split(",")))
users[users["amountFriends"] == 10].shape[0]

13579

How many friends does a user have on average?

In [None]:
sparql_query = """
SELECT (AVG(?friend) as ?averageFriends)
WHERE {
    ?user rdfs:Class schema:Person .
    ?user schema:knows ?friend .
}
"""

run_query(query=sparql_query, as_dataframe=True)

In [12]:
# How many friends does a user have on average?
users["amountFriends"].mean()

53.375011381374385

How many users have authored 10 reviews or tips?

In [None]:
sparql_query = """
SELECT ?user (COUNT(?reviews) as ?numberOfReviews)
WHERE {
    ?user rdfs:Class schema:Person .
    ?user schema:knows ?friend .
}
HAVING { ?numberOfFriends = 10 }
GROUP BY ?user
"""

run_query(query=sparql_query, as_dataframe=True)

In [17]:
# Count how many users have authored 10 reviews
reviews.groupby("user_id").size().reset_index(name="count").query("count == 10").shape[0]

14119

How many reviews or tips does a user make on average every month?

In [2]:
sparql_query = """
SELECT ?month (AVG(?review) as ?averageReviews)
WHERE {
    ?user schema:author ?review .
    ?review rdfs:Class schema:UserReview .
    ?review schema:dateCreated ?date .
    BIND (month(?visit) as ?month) .
}
GROUP BY ?month
"""

run_query(query=sparql_query, as_dataframe=True)

URLError: <urlopen error [Errno 111] Connection refused>

In [None]:
reviews