In [2]:
import pandas as pd
import numpy as np
import json
from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.run_query import run_query
from Code.UtilityFunctions.dictionary_functions import flatten_dictionary

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)

___
___
___
# **DO NOT RUN ALL LINES BELOW. SOME YELP ENTITIES ARE VERY LARGE, AND YOU MIGHT RUN OUT OF RAM.**

In [4]:
business = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)
business['categories'] = business['categories'].str.split(', ', expand=False)

In [None]:
users = pd.read_json(get_path("yelp_academic_dataset_user.json"), lines=True)

In [2]:
reviews = pd.read_json(get_path("yelp_academic_dataset_review.json"), lines=True)
#reviews = pd.merge(left=reviews, right=business, how='left', left_on='business_id', right_on='business_id')
#reviews = pd.merge(left=reviews, right=users, how='left', left_on='user_id', right_on='user_id')

In [None]:
reviews.head(10)

In [None]:
checkins = pd.read_json(get_path("yelp_academic_dataset_checkin.json"), lines=True)
checkins['date'] = checkins['date'].str.split(', ', expand=False)
checkins = checkins.explode('date')

In [3]:
tips = pd.read_json(get_path("yelp_academic_dataset_tip.json"), lines=True)

___
___
___
### How many different types of businesses are defined in Yelp?

In [2]:
sparql_query ="""

SELECT COUNT(DISTINCT(?category))
WHERE {
    ?business schema:category ?category .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.value
0,1311


In [5]:
unique_categories = {category for sublist in business['categories'] if sublist for category in sublist}
len(unique_categories)

1311

### How many businesses of type "Restaurants" exist?

In [4]:
sparql_query = """

SELECT COUNT(DISTINCT(?business)) AS ?numberRestaurants
WHERE {
    ?business schema:category yelpcat:Restaurants .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,numberRestaurants.value
0,52268


In [7]:
counter = 0
for row in business['categories']:
    if row is not None and "Restaurants" in row:
        counter += 1
print(counter)

52268


### How many businesses of type Restaurants have been reviewed?

In [6]:
sparql_query = """
SELECT ?business
WHERE {
    ?review schema:about ?business .
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,business.value
0,https://purl.archive.org/purl/yelp/yelp_entities#-02xFuruu85XmDn2xiynJw
1,https://purl.archive.org/purl/yelp/yelp_entities#-02xFuruu85XmDn2xiynJw
2,https://purl.archive.org/purl/yelp/yelp_entities#-02xFuruu85XmDn2xiynJw
3,https://purl.archive.org/purl/yelp/yelp_entities#-02xFuruu85XmDn2xiynJw
4,https://purl.archive.org/purl/yelp/yelp_entities#-02xFuruu85XmDn2xiynJw
...,...
1048571,https://purl.archive.org/purl/yelp/yelp_entities#ar-SQg2423FSVxRcaxorVg
1048572,https://purl.archive.org/purl/yelp/yelp_entities#ar-SQg2423FSVxRcaxorVg
1048573,https://purl.archive.org/purl/yelp/yelp_entities#ar-SQg2423FSVxRcaxorVg
1048574,https://purl.archive.org/purl/yelp/yelp_entities#ar-SQg2423FSVxRcaxorVg


In [None]:
sparql_query = """

SELECT ?business
WHERE {
    ?business schema:category yelpcat:Restaurants .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,business.value
0,https://purl.archive.org/purl/yelp/yelp_entities#business_id/-0jzoPt3UeXn6FUXVQvyPg
1,https://purl.archive.org/purl/yelp/yelp_entities#business_id/-1MhPXk1FglglUAmuPLIGg
2,https://purl.archive.org/purl/yelp/yelp_entities#business_id/-3IOd5YntpkbK6RwT3HYtA
3,https://purl.archive.org/purl/yelp/yelp_entities#business_id/-6OjnX3ZdDOhHxWR60wysg
4,https://purl.archive.org/purl/yelp/yelp_entities#business_id/-ATiAtTikuGuqvaW2O6tNA
...,...
52263,https://purl.archive.org/purl/yelp/yelp_entities#business_id/znqFzYiKDdjwR13maABX1A
52264,https://purl.archive.org/purl/yelp/yelp_entities#business_id/zrQAj03aHI7kpmAiyKcKhA
52265,https://purl.archive.org/purl/yelp/yelp_entities#business_id/zs9xTZA8D-PQHcEp1FwIkg
52266,https://purl.archive.org/purl/yelp/yelp_entities#business_id/zvVDXVV9Ib8ZjYhPCl0r4Q


In [7]:
review_unique_business = reviews.drop_duplicates(subset=['business_id'])
mask = review_unique_business['categories'].apply(lambda x: x is not None and "Restaurants" in x)
len(review_unique_business[mask])

52268

### How many businesses have been reviewed?

In [7]:
sparql_query = """

SELECT COUNT(DISTINCT(?business))
WHERE {
    ?review schema:about ?business .
    ?review rdfs:Class schema:UserReview .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.value
0,150346


In [32]:
review_unique_business = reviews.drop_duplicates(subset=['business_id'])
len(review_unique_business)

150346

### How many businesses have, on average, a rating of 4.5?

In [23]:
sparql_query = """
SELECT COUNT(DISTINCT(?business))
WHERE {
    ?business schema:aggregateRating ?rating .
    FILTER (?rating = 4.5) .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.value
0,27181


In [49]:
business[business['stars'] == 4.5].groupby('stars')['stars'].count()

stars
4.5    27181
Name: stars, dtype: int64

### What is the average rating across businesses?

In [11]:
sparql_query = """
SELECT AVG(?rating) as ?averagerating
WHERE {
    ?business schema:aggregateRating ?rating .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,averagerating.type,averagerating.datatype,averagerating.value
0,typed-literal,http://www.w3.org/2001/XMLSchema#decimal,3.59672355766033


In [25]:
business['stars'].mean()

3.5967235576603303

### How many businesses have been reviewed in Santa Barbara, CA?

TO DO: RUN AGAIN

In [3]:
sparql_query = """
SELECT ?state ?city COUNT(?business) AS ?count_business
WHERE {
    ?business rdfs:Class schema:LocalBusiness .
    ?business schema:location ?city .
    ?city rdfs:label ?city_name .
    ?city yelpont:location ?state .
    VALUES ?city_name{'Santa Barbara'} .
}
GROUP BY ?state ?city
ORDER BY DESC(?count_business)
LIMIT 10

"""

run_query(query=sparql_query, as_dataframe=True)

Empty resultset


In [31]:
business_santa_barbara = business[business['city'] == "Santa Barbara"]
len(business_santa_barbara)

3829

### What are the five most busy days, and for what business?

In [6]:
sparql_query = """
SELECT ?business ?year ?month ?day COUNT(?visit) as ?numberOfVisits
WHERE {
    ?business schema:checkinTime ?visit .
    BIND (day(?visit)  as ?day)
    BIND (month(?visit) as ?month)
    BIND (year(?visit) as ?year)
}
GROUP BY ?business ?year ?month ?day
ORDER BY DESC(COUNT(?visit))
LIMIT 5

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,business.value,year.value,month.value,day.value,numberOfVisits.value
0,https://purl.archive.org/purl/yelp/yelp_entities#business_id/CySqUcNz8oPiQTu4EXTnig,2016,6,25,457
1,https://purl.archive.org/purl/yelp/yelp_entities#business_id/g50ImmCX3WY3koEDIzoKxg,2015,8,30,285
2,https://purl.archive.org/purl/yelp/yelp_entities#business_id/qfWWx0dVo1UuAhRfh03Dyw,2016,8,28,268
3,https://purl.archive.org/purl/yelp/yelp_entities#business_id/g50ImmCX3WY3koEDIzoKxg,2016,8,28,263
4,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FBBeJO50xZiNIo3oFhAFRA,2017,7,29,251


In [17]:
checkins["Day"] = checkins["date"].apply(lambda x: x.split("-")[2][:2])
checkins["Month"] = checkins["date"].apply(lambda x: x.split("-")[1])
checkins["Year"] = checkins["date"].apply(lambda x: x.split("-")[0])

checkins.value_counts(subset=["business_id", "Day", "Month", "Year"], sort=True, ascending=False).head(5)

business_id             Day  Month  Year
CySqUcNz8oPiQTu4EXTnig  25   06     2016    465
g50ImmCX3WY3koEDIzoKxg  30   08     2015    287
qfWWx0dVo1UuAhRfh03Dyw  28   08     2016    270
g50ImmCX3WY3koEDIzoKxg  28   08     2016    264
FBBeJO50xZiNIo3oFhAFRA  29   07     2017    254
dtype: int64

### Which are the top 10 most visisted businesses?

In [8]:
sparql_query = """
SELECT ?business COUNT(?visit) AS ?count_visits
WHERE {
    ?business schema:checkinTime ?visit .
}
GROUP BY ?business 
ORDER BY DESC(COUNT(?visit))
LIMIT 10

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,business.value,count_visits.value
0,https://purl.archive.org/purl/yelp/yelp_entities#business_id/-QI8Qi8XWH3D8y8ethnajA,52129
1,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,40092
2,https://purl.archive.org/purl/yelp/yelp_entities#business_id/Eb1XmmLWyt_way5NNZ7-Pw,37553
3,https://purl.archive.org/purl/yelp/yelp_entities#business_id/c_4c5rJECZSfNgFj7frwHQ,37511
4,https://purl.archive.org/purl/yelp/yelp_entities#business_id/4i4kmYm9wgSNyF1b6gKphg,31163
5,https://purl.archive.org/purl/yelp/yelp_entities#business_id/8O35ji_yOMVJmZ6bl96yhQ,29599
6,https://purl.archive.org/purl/yelp/yelp_entities#business_id/VQcCL9PiNL_wkGf-uF3fjg,28917
7,https://purl.archive.org/purl/yelp/yelp_entities#business_id/ac1AeYqs8Z4_e2X5M3if2A,21527
8,https://purl.archive.org/purl/yelp/yelp_entities#business_id/QTbahs-GVuWYL5yfdjH34A,21478
9,https://purl.archive.org/purl/yelp/yelp_entities#business_id/ytynqOUb3hjKeJfRj5Tshw,18604


In [20]:
checkins.value_counts(subset=["business_id"], sort=True, ascending=False).head(10)

business_id           
-QI8Qi8XWH3D8y8ethnajA    52144
FEXhWNCMkv22qG04E83Qjg    40109
Eb1XmmLWyt_way5NNZ7-Pw    37562
c_4c5rJECZSfNgFj7frwHQ    37518
4i4kmYm9wgSNyF1b6gKphg    31168
8O35ji_yOMVJmZ6bl96yhQ    29606
VQcCL9PiNL_wkGf-uF3fjg    28927
ac1AeYqs8Z4_e2X5M3if2A    21542
QTbahs-GVuWYL5yfdjH34A    21487
ytynqOUb3hjKeJfRj5Tshw    18615
dtype: int64

### How many people have written a review or tip on Yelp?
TO DO: Not correct result, may be because reviews DF can contain "..." reviews which are NOT added to the KG.

In [7]:
sparql_query = """
SELECT COUNT(DISTINCT(?user)) AS ?countUsers
WHERE {
    ?user rdfs:Class schema:Person .
    ?review schema:author ?user .
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,countUsers.value
0,1987897


In [7]:
reviews.drop_duplicates(subset=['user_id']).shape[0]

1987929

### How many users have 10 friends?

In [5]:
sparql_query = """
SELECT COUNT(*) as ?usersWith10Friends
WHERE {
    SELECT ?user COUNT(?friend) AS ?countUsers
    WHERE {
        ?user rdfs:Class schema:Person .
        ?user schema:knows ?friend .
    }
    GROUP BY ?user
    HAVING (COUNT(?friend) = 10)
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.value
0,13579


In [11]:
users["amountFriends"] = users["friends"].apply(lambda x: len(x.split(",")))
users[users["amountFriends"] == 10].shape[0]

13579

### How many friends does a user have on average?
TO DO: Is not giving correct result

In [73]:
sparql_query = """
SELECT AVG(?numberOfFriends) as ?averageFriends
WHERE { 
    SELECT COUNT(?friend) as ?numberOfFriends
    WHERE {
        ?user rdfs:Class schema:Person .
        ?user schema:knows ?friend .
    }
    GROUP BY ?user
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,averageFriends.value
0,94.85361104650848


In [5]:
sparql_query = """
SELECT (?numberOfFriends / ?numberOfUsers) as ?averageFriends
WHERE {
    SELECT COUNT(?friend) as ?numberOfFriends 
    WHERE {
        ?user rdfs:Class schema:Person .
        ?user schema:knows ?friend .
    }
    SELECT COUNT(?user) as ?numberOfUsers
    WHERE {
        ?user rdfs:Class schema:Person .
    }
}
"""

run_query(query=sparql_query, as_dataframe=True)

QueryBadFormed: QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b"Virtuoso 37000 Error SP030: SPARQL compiler, line 17: syntax error at 'SELECT' before '('\n\nSPARQL query:\n#output-format:application/sparql-results+json\n\n    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> \n    PREFIX schema: <https://schema.org/> \n    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> \n    PREFIX yelpcat: <https://purl.archive.org/purl/yelp/business_categories#>\n    PREFIX yelpont: <https://purl.archive.org/purl/yelp/ontology#>\n    PREFIX yelpent: <https://purl.archive.org/purl/yelp/yelp_entities#>\n    \nSELECT (?numberOfFriends / ?numberOfUsers) as ?averageFriends\nWHERE {\n    SELECT COUNT(?friend) as ?numberOfFriends \n    WHERE {\n        ?user rdfs:Class schema:Person .\n        ?user schema:knows ?friend .\n    }\n    SELECT COUNT(?user) as ?numberOfUsers\n    WHERE {\n        ?user rdfs:Class schema:Person .\n    }\n}\n"

In [26]:
sparql_query = """
SELECT ?numberOfFriends ?numberOfUsers
WHERE {
    SELECT COUNT(?friend) as ?numberOfFriends COUNT(?user) as ?numberOfUsers
    WHERE {
        ?user rdfs:Class schema:Person .
        ?user schema:knows ?friend .
        }
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,numberOfFriends.value,numberOfUsers.value
0,105225474,105225474


In [19]:

sparql_query = """
SELECT ?user (COUNT(?friend) AS ?friendCount)
WHERE {
    ?user schema:knows ?friend .
   }
   GROUP BY ?user

"""

x = run_query(query=sparql_query, as_dataframe=True)

In [21]:
x.sort_values(by=['friendCount.value'], ascending=False).head(100)

Unnamed: 0,user.value,friendCount.value
1024615,https://purl.archive.org/purl/yelp/yelp_entities#user_id/KXzNTIlLiqmnUa4_f4Oa4w,999
334198,https://purl.archive.org/purl/yelp/yelp_entities#user_id/JQIZhoQB-Tg9hdBEj90m4g,999
209148,https://purl.archive.org/purl/yelp/yelp_entities#user_id/DxsKifTKSc38ImA-ZYrd9A,999
452996,https://purl.archive.org/purl/yelp/yelp_entities#user_id/EbYTXKL-8CrJTh3HjkxBvA,999
331566,https://purl.archive.org/purl/yelp/yelp_entities#user_id/Qrtz_s7Z3vW_GCKDJVHVqw,999
25536,https://purl.archive.org/purl/yelp/yelp_entities#user_id/LJpp7toFiaHVrj811FSy5w,999
70684,https://purl.archive.org/purl/yelp/yelp_entities#user_id/sxEpkRflD3PxZhBgmcCyow,999
455998,https://purl.archive.org/purl/yelp/yelp_entities#user_id/0sl4I8R7znWXp_K4A0nZ_w,999
296214,https://purl.archive.org/purl/yelp/yelp_entities#user_id/FyaI7g6aIMBYeMQHLGqPXg,999
407598,https://purl.archive.org/purl/yelp/yelp_entities#user_id/tEy1MNP7tHJlZgP7xqF4yA,998


In [18]:
sparql_query="""SELECT (AVG(?friendCount) AS ?averageFriends)
WHERE {
  ?user schema:knows ?friend .
  {
    SELECT ?user (COUNT(?friend) AS ?friendCount)
    WHERE {
      ?user schema:knows ?friend .
    }
    GROUP BY ?user
  }
}"""

run_query(sparql_query, as_dataframe=True)

Unnamed: 0,averageFriends.value
0,458.95945171983726


In [12]:
# How many friends does a user have on average?
users["amountFriends"].mean()

53.375011381374385

### How many users have authored 10 reviews?

In [84]:
sparql_query = """
SELECT COUNT(DISTINCT(?user)) AS ?countUsers
WHERE {
    SELECT ?user COUNT(?review) as ?numberOfReviews
    WHERE {
        ?user rdfs:Class schema:Person .
        ?review schema:author ?user .
    }
    GROUP BY ?user
    HAVING (COUNT(?review) = 10)
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,countUsers.value
0,14119


In [17]:
# Count how many users have authored 10 reviews
reviews.groupby("user_id").size().reset_index(name="count").query("count == 10").shape[0]

14119

### How many reviews did users make in May 2018?

In [12]:
sparql_query = """
SELECT ?year ?month COUNT(?review) as ?countReviews
WHERE {
    ?review rdfs:Class schema:UserReview .
    ?review schema:dateCreated ?date .
    BIND (month(?date) as ?month) .
    BIND (year(?date) as ?year) .
    VALUES ?year {2018}
    VALUES ?month {5}
}
GROUP BY ?year ?month
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,year.value,month.value,countReviews.value
0,2018,5,79434


In [3]:
reviews['YEAR'] = reviews.date.dt.year
reviews['MONTH'] = reviews.date.dt.month
reviews.query("YEAR == 2018 & MONTH == 5").shape[0]

79434