In [1]:
import ast
import json
from collections import Counter

import numpy as np
import pandas as pd

from Code.UtilityFunctions.dictionary_functions import flatten_dictionary
from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.run_query import run_query

In [2]:
# Raise pandas' display limits so long result tables and long IRIs are not truncated.
for option_name, option_value in (('display.max_rows', 500), ('display.max_colwidth', 500)):
    pd.set_option(option_name, option_value)

___
___
___
# **DO NOT RUN ALL CELLS BELOW. SOME YELP ENTITIES ARE VERY LARGE, AND YOU MIGHT RUN OUT OF RAM.**

In [18]:
# Load the business entity (JSON Lines: one record per line) into a DataFrame.
business_json = get_path("yelp_academic_dataset_business.json")
business = pd.read_json(business_json, lines=True)
# 'categories' is a single ', '-separated string per business; split it into a list of names.
business['categories'] = business['categories'].str.split(pat=', ', expand=False)

In [None]:
# Load the user entity (JSON Lines) into a DataFrame. See the RAM warning above before running.
users = pd.read_json(get_path("yelp_academic_dataset_user.json"), lines=True)

In [2]:
# Load the review entity (JSON Lines) into a DataFrame. See the RAM warning above before running.
reviews = pd.read_json(get_path("yelp_academic_dataset_review.json"), lines=True)
# The two left-joins below are disabled, so `reviews` carries no columns from `business` or
# `users`. Enabling them widens every review row and presumably increases memory use a lot.
#reviews = pd.merge(left=reviews, right=business, how='left', left_on='business_id', right_on='business_id')
#reviews = pd.merge(left=reviews, right=users, how='left', left_on='user_id', right_on='user_id')

In [None]:
# Preview the first ten review rows.
reviews.head(10)

In [None]:
# Load the check-in entity; each row holds every check-in timestamp of one business as a single
# ', '-separated string.
checkins = pd.read_json(get_path("yelp_academic_dataset_checkin.json"), lines=True)
checkin_timestamps = checkins['date'].str.split(', ', expand=False)
# Reshape to one row per (business, timestamp).
checkins = checkins.assign(date=checkin_timestamps).explode('date')

In [3]:
# Load the tip entity (JSON Lines) into a DataFrame.
tips = pd.read_json(get_path("yelp_academic_dataset_tip.json"), lines=True)

___
___
___
### CQ 1: How many different types of businesses are defined in Yelp?

In [3]:
# CQ 1 (knowledge graph): count the distinct objects of schema:category over all subjects.
# The query declares no PREFIXes itself; run_query (project helper) presumably adds them and,
# with as_dataframe=True, returns the result as a DataFrame — confirm in run_query.
sparql_query ="""

SELECT COUNT(DISTINCT(?category))
WHERE {
    ?business schema:category ?category .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.value
0,1311


In [5]:
# CQ 1 (ground truth): union of all category names over businesses that have a category list.
unique_categories = set()
for category_list in business['categories']:
    # Skip businesses without categories (missing or empty list).
    if category_list:
        unique_categories.update(category_list)
len(unique_categories)

1311

### CQ 2: How many businesses of type "Restaurants" exist?

In [4]:
# CQ 2 (knowledge graph): count distinct subjects whose schema:category is yelpcat:Restaurants.
sparql_query = """

SELECT COUNT(DISTINCT(?business)) AS ?numberRestaurants
WHERE {
    ?business schema:category yelpcat:Restaurants .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,numberRestaurants.value
0,52268


In [7]:
# CQ 2 (ground truth): number of businesses whose category list contains "Restaurants".
counter = sum(
    1
    for category_list in business['categories']
    if category_list is not None and "Restaurants" in category_list
)
print(counter)

52268


### CQ 3: How many businesses of type Restaurants have been reviewed?

In [16]:
# CQ 3 (knowledge graph): count distinct restaurants that are the schema:about target of at
# least one subject. NOTE(review): unlike the CQ 4 query, ?review is not restricted to
# schema:UserReview here — confirm that only reviews use schema:about.
sparql_query = """
SELECT COUNT(DISTINCT ?business) AS ?numberRestaurants
WHERE {
    ?review schema:about ?business .
    ?business schema:category yelpcat:Restaurants .
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,numberRestaurants.value
0,52268


In [7]:
# CQ 3 (ground truth): number of "Restaurants" businesses that appear in at least one review.
# One row per reviewed business.
review_unique_business = reviews.drop_duplicates(subset=['business_id'])
# `reviews` only has a 'categories' column if it was merged with `business` beforehand (that
# merge is commented out in the loading cell), so reading it there fails on a fresh kernel.
# Look the categories up in `business` instead; this needs the business cell to have been run
# so that 'categories' holds lists.
is_restaurant = business['categories'].apply(
    lambda cats: isinstance(cats, list) and "Restaurants" in cats
)
restaurant_ids = business.loc[is_restaurant, 'business_id']
int(review_unique_business['business_id'].isin(restaurant_ids).sum())

52268

### CQ 4: How many businesses have been reviewed?

In [7]:
# CQ 4 (knowledge graph): count distinct businesses that a schema:UserReview is schema:about.
# Typing is matched through the rdfs:Class predicate, as in the other queries of this notebook.
sparql_query = """

SELECT COUNT(DISTINCT(?business))
WHERE {
    ?review schema:about ?business .
    ?review rdfs:Class schema:UserReview .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.value
0,150346


In [32]:
# CQ 4 (ground truth): keep one review row per business, then count the rows.
review_unique_business = reviews.drop_duplicates(subset='business_id')
review_unique_business.shape[0]

150346

### CQ 5: How many businesses have, on average, a rating of 4.5?

In [23]:
# CQ 5 (knowledge graph): count distinct businesses whose schema:aggregateRating equals 4.5.
sparql_query = """
SELECT COUNT(DISTINCT(?business))
WHERE {
    ?business schema:aggregateRating ?rating .
    FILTER (?rating = 4.5) .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.value
0,27181


In [49]:
# CQ 5 (ground truth): businesses with exactly 4.5 stars, counted per star value (one group).
rated_4_5 = business.loc[business['stars'] == 4.5]
rated_4_5.groupby('stars')['stars'].count()

stars
4.5    27181
Name: stars, dtype: int64

### CQ 6: What is the average rating across businesses?

In [11]:
# CQ 6 (knowledge graph): average of all schema:aggregateRating values.
sparql_query = """
SELECT AVG(?rating) as ?averagerating
WHERE {
    ?business schema:aggregateRating ?rating .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,averagerating.type,averagerating.datatype,averagerating.value
0,typed-literal,http://www.w3.org/2001/XMLSchema#decimal,3.59672355766033


In [25]:
# CQ 6 (ground truth): mean of the 'stars' column over all businesses.
business['stars'].mean()

3.5967235576603303

### CQ 7: How many businesses have been reviewed in Santa Barbara, CA?

TO DO: RUN AGAIN

In [53]:
# CQ 7, exploration step: list every subject typed schema:LocalBusiness.
# NOTE(review): ?state and ?city are projected but never bound in the WHERE clause, so only
# ?business carries values (the displayed result has a single business.value column).
sparql_query = """
SELECT ?state ?city ?business
WHERE {
    ?business rdfs:Class schema:LocalBusiness .
}

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,business.value
0,https://purl.archive.org/purl/yelp/yelp_entities#business_id/oY6osC4Ch3PvguA7D1hh_w
1,https://purl.archive.org/purl/yelp/yelp_entities#business_id/--_9CAxgfXZmoFdNIRrhHA
2,https://purl.archive.org/purl/yelp/yelp_entities#business_id/-2Axhv9AZ_n7qjQefECpVw
3,https://purl.archive.org/purl/yelp/yelp_entities#business_id/-3-6BB10tIWNKGEF0Es2BA
4,https://purl.archive.org/purl/yelp/yelp_entities#business_id/-4dYswJy7SPcbcERvitmIg
...,...
150341,https://purl.archive.org/purl/yelp/yelp_entities#business_id/zvVDXVV9Ib8ZjYhPCl0r4Q
150342,https://purl.archive.org/purl/yelp/yelp_entities#business_id/zwYYxQyUJcx0fOs4WrAn8Q
150343,https://purl.archive.org/purl/yelp/yelp_entities#business_id/zx7XemDyT296vmKnFtZSyQ
150344,https://purl.archive.org/purl/yelp/yelp_entities#business_id/zymV2vdJfHNH63UQ7aUFYw


In [77]:
# CQ 7, exploration step: businesses located in a city labelled 'Santa Barbara', together with
# the entity that city is directly linked to through wd:P131.
# NOTE(review): the direct wd:P131 target is bound to ?state, but the displayed value (Q108106)
# is listed as Santa Barbara County in location_mappings.csv, i.e. it is a county, not a state.
# NOTE(review): ?business is projected without being grouped or aggregated, and ORDER BY uses
# ?count_business, which this query never defines; both look like leftovers — confirm.
sparql_query = """
SELECT ?state ?city ?business
WHERE {
    ?business schema:location ?city .
    ?city rdfs:label ?city_name .
    ?city wd:P131 ?state .
    VALUES ?city_name{'Santa Barbara'^^xsd:string} .
}
GROUP BY ?state ?city
ORDER BY DESC(?count_business)
LIMIT 10

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,state.value,city.value,business.value
0,https://www.wikidata.org/entity/Q108106,https://www.wikidata.org/entity/Q159288,https://purl.archive.org/purl/yelp/ontology#jQuPRiNwqaUaG3q5W0hr6Q
1,https://www.wikidata.org/entity/Q108106,https://www.wikidata.org/entity/Q159288,https://purl.archive.org/purl/yelp/ontology#pQgVk24ZYzhO0Q9J7b8NRw
2,https://www.wikidata.org/entity/Q108106,https://www.wikidata.org/entity/Q159288,https://purl.archive.org/purl/yelp/ontology#r2IhvKZQ_wLR5mLBnPOilg
3,https://www.wikidata.org/entity/Q108106,https://www.wikidata.org/entity/Q159288,https://purl.archive.org/purl/yelp/ontology#r__4cxZAlloPl9p8nwU7yw
4,https://www.wikidata.org/entity/Q108106,https://www.wikidata.org/entity/Q159288,https://purl.archive.org/purl/yelp/ontology#tTc8NfG453HpwgrqJbfxLw
5,https://www.wikidata.org/entity/Q108106,https://www.wikidata.org/entity/Q159288,https://purl.archive.org/purl/yelp/ontology#taB-DZhRI6eLW6XmY9_8yQ
6,https://www.wikidata.org/entity/Q108106,https://www.wikidata.org/entity/Q159288,https://purl.archive.org/purl/yelp/ontology#u28l13rNzuNpyQAv3ypwLg
7,https://www.wikidata.org/entity/Q108106,https://www.wikidata.org/entity/Q159288,https://purl.archive.org/purl/yelp/ontology#vb09pYPJZnzofrmJen9SBw
8,https://www.wikidata.org/entity/Q108106,https://www.wikidata.org/entity/Q159288,https://purl.archive.org/purl/yelp/ontology#wJu6zWSgTjcH0N5g6U6f8A
9,https://www.wikidata.org/entity/Q108106,https://www.wikidata.org/entity/Q159288,https://purl.archive.org/purl/yelp/ontology#-8iATYRnN46Km0_-ldx6cg


In [3]:
# CQ 7 (knowledge graph): count businesses located in the city labelled 'Santa Barbara' whose
# wd:P131 chain city -> county -> state ends in a state labelled 'California'.
# NOTE(review): the pattern does not require a review, although the question asks for
# businesses that "have been reviewed" — confirm this is intended.
sparql_query = """
SELECT ?state ?city COUNT(?business) AS ?count_business 
WHERE {
    ?business schema:location ?city .
    ?city rdfs:label ?city_name .
    ?city wd:P131 ?county .
    ?county wd:P131 ?state .
    ?state rdfs:label ?state_name .
    VALUES ?city_name{'Santa Barbara'^^xsd:string} .
    VALUES ?state_name{'California'^^xsd:string} .
}
GROUP BY ?state ?city
ORDER BY DESC(?count_business)
LIMIT 10

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,state.value,city.value,count_business.value
0,https://www.wikidata.org/entity/Q99,https://www.wikidata.org/entity/Q159288,3941


In [10]:
# CQ 7, debugging step: same pattern as the counting query, but returning the individual
# businesses so they can be compared with the JSON data. The result is kept in `x`.
# NOTE(review): ?business is projected while grouping only by ?state ?city, and ORDER BY uses
# ?count_business, which is not defined here; both appear to be leftovers from the counting
# query — confirm the endpoint returns one row per business as intended.
sparql_query = """
SELECT ?state ?city ?business 
WHERE {
    ?business schema:location ?city .
    ?city rdfs:label ?city_name .
    ?city wd:P131 ?county .
    ?county wd:P131 ?state .
    ?state rdfs:label ?state_name .
    VALUES ?city_name{'Santa Barbara'^^xsd:string} .
    VALUES ?state_name{'California'^^xsd:string} .
}
GROUP BY ?state ?city
ORDER BY DESC(?count_business)


"""

x = run_query(query=sparql_query, as_dataframe=True)

In [16]:
# Reduce each business IRI to the bare Yelp business id by removing the ontology namespace.
# Done with a vectorised, literal (non-regex) string replace on the column rather than a
# row-wise apply whose lambda parameter shadowed the DataFrame name `x`.
ONTOLOGY_PREFIX = 'https://purl.archive.org/purl/yelp/ontology#'
x['business.value'] = x['business.value'].str.replace(ONTOLOGY_PREFIX, '', regex=False)
# Plain Python list of the ids returned by the query.
query_bid = x['business.value'].to_list()

In [19]:
# JSON-side reference set: businesses whose 'city' field is exactly "Santa Barbara".
in_santa_barbara = business['city'] == "Santa Barbara"
business_santa_barbara = business.loc[in_santa_barbara]
# Ids of those businesses, used to cross-check the SPARQL result.
json_bid = business_santa_barbara.loc[:, 'business_id']

In [21]:
# Business ids returned by the SPARQL query that are NOT among the JSON businesses whose city
# is "Santa Barbara", i.e. the rows where the two sources disagree.
x.loc[~x['business.value'].isin(json_bid), 'business.value']

43      FdbYPkFZvIF4o5_tQSup5w
92      VEsb_ByMJkuCEDURgfLBDg
132     3KI0WnM1_lnY4QPRtT5O8A
141     SBs30RyeoiEU2LjeiH1YFQ
143     UlBwfWdl3A2na0NIu1JzIA
173     eX7o_-s5TmDT-DMfTV4cmw
205     4IqrY1mXK65HgrrgkpE-cw
225     BSFKoPLQOEpPzoz6gOBlyQ
242     GjxwYoInJcggmLsdrJigfg
282     QEY_524IJ2uqNHCBrmTowQ
284     Qd0seyUMJ53rl-T9mWRSIA
291     SVZyQN2mWMXAYZpyqsOfAA
301     UjRfnT_EuZoz9bmzX38mSg
306     WEY7DI73ljuVu3Ode4VEiA
331     wjOHfRYCwddZ3Ri_AvScaQ
332     xHG-xznai_fgceFDl6xn5w
361     huanJ55zbuaZ9Yv7FiwtgA
402     5-dALZnUy43LJM5d8SAeyw
565     5rz_HnjCKV_F7SXf_7WMNA
599     H-1qpp_77KggOAr9htUrEw
623     Nr-rq1R1l6SLy8nrhEOalw
629     Po6WLHD5ZFkVoiuTFL3Jtg
745     7sOqa_LFDHEtiFzNFZBw4A
775     G1iDxFTY-G6ZzqLE5V2ELA
824     TqCWmvRiOCv4OBSft4Dacw
923     2WHu0k5VVM1Pn9AFzkSBlA
944     7mD4SW5ylpmk_OnAsgWjLw
987     MEVEUCutBAHktfoEuv0ssw
988     Mrou58GInOFO11dJXOXjxg
1036    kCCIDwzaLw7cVghe3Bcz6g
1150    Ol6I7wWJ3x-5v08qCQSRDA
1185    hFYKu5SujPYf_tmnoiKozw
1198    

In [23]:
# Read the business file again into a separate frame; here 'categories' is left unsplit.
# NOTE(review): this holds a second copy of the business data in memory alongside `business`.
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)

In [24]:
# Attach the location mapping (Wikidata ids/labels per city and state) to every business.
location_mappings_csv = get_path("location_mappings.csv")
df = pd.read_csv(location_mappings_csv)

# Left join keeps every business, including those without a mapping for (city, state).
data = pd.merge(biz, df, how="left", on=["city", "state"])

In [26]:
# Inspect a single business in the merged frame; the displayed row has city "Montecito".
# NOTE(review): presumably one of the ids flagged by the mismatch check above — confirm.
data[data['business_id'] == '_YPdYvkOGHjQyCj8K0Xhyw']

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,hours,city_qid,city_label,state_qid,state_label,county_qid,county_label,country_qid,country_label,population
88254,_YPdYvkOGHjQyCj8K0Xhyw,Whodidily Cupcakes,"1150 Coast Village Rd, Ste E",Montecito,CA,93108,34.421606,-119.64629,4.0,60,...,"{'Monday': '10:0-19:0', 'Tuesday': '10:0-19:0', 'Wednesday': '10:0-19:0', 'Thursday': '10:0-19:0', 'Friday': '10:0-20:0', 'Saturday': '10:0-20:0', 'Sunday': '10:0-18:0'}",Q1008912,Montecito,Q99,California,Q108106,Santa Barbara County,Q30,United States of America,8638.0


### CQ 8: What are the five busiest days, and for which businesses?

In [6]:
# CQ 8 (knowledge graph): split every schema:checkinTime value into day/month/year, count the
# check-ins per (business, year, month, day) and keep the five largest groups.
# NOTE(review): the counts shown are slightly lower than the pandas counts for the same
# business/day (e.g. 457 vs 465). Possibly identical timestamps collapse into a single triple
# in the graph — confirm.
sparql_query = """
SELECT ?business ?year ?month ?day COUNT(?visit) as ?numberOfVisits
WHERE {
    ?business schema:checkinTime ?visit .
    BIND (day(?visit)  as ?day)
    BIND (month(?visit) as ?month)
    BIND (year(?visit) as ?year)
}
GROUP BY ?business ?year ?month ?day
ORDER BY DESC(COUNT(?visit))
LIMIT 5

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,business.value,year.value,month.value,day.value,numberOfVisits.value
0,https://purl.archive.org/purl/yelp/yelp_entities#business_id/CySqUcNz8oPiQTu4EXTnig,2016,6,25,457
1,https://purl.archive.org/purl/yelp/yelp_entities#business_id/g50ImmCX3WY3koEDIzoKxg,2015,8,30,285
2,https://purl.archive.org/purl/yelp/yelp_entities#business_id/qfWWx0dVo1UuAhRfh03Dyw,2016,8,28,268
3,https://purl.archive.org/purl/yelp/yelp_entities#business_id/g50ImmCX3WY3koEDIzoKxg,2016,8,28,263
4,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FBBeJO50xZiNIo3oFhAFRA,2017,7,29,251


In [17]:
# CQ 8 (ground truth): split each timestamp string on "-" once and reuse the pieces.
date_parts = checkins["date"].apply(lambda stamp: stamp.split("-"))
# The third piece still carries the time of day; its first two characters are the day.
checkins["Day"] = date_parts.apply(lambda parts: parts[2][:2])
checkins["Month"] = date_parts.apply(lambda parts: parts[1])
checkins["Year"] = date_parts.apply(lambda parts: parts[0])

# Check-ins per (business, day), largest first; show the top five.
visits_per_day = checkins.value_counts(subset=["business_id", "Day", "Month", "Year"], sort=True, ascending=False)
visits_per_day.head(5)

business_id             Day  Month  Year
CySqUcNz8oPiQTu4EXTnig  25   06     2016    465
g50ImmCX3WY3koEDIzoKxg  30   08     2015    287
qfWWx0dVo1UuAhRfh03Dyw  28   08     2016    270
g50ImmCX3WY3koEDIzoKxg  28   08     2016    264
FBBeJO50xZiNIo3oFhAFRA  29   07     2017    254
dtype: int64

### CQ 9: Which are the top 10 most visited businesses?

In [8]:
# CQ 9 (knowledge graph): count schema:checkinTime values per business and keep the ten
# businesses with the most check-ins.
# NOTE(review): the ranking matches pandas but the counts are slightly lower (e.g. 52129 vs
# 52144). Possibly identical timestamps collapse into a single triple — confirm.
sparql_query = """
SELECT ?business COUNT(?visit) AS ?count_visits
WHERE {
    ?business schema:checkinTime ?visit .
}
GROUP BY ?business 
ORDER BY DESC(COUNT(?visit))
LIMIT 10

"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,business.value,count_visits.value
0,https://purl.archive.org/purl/yelp/yelp_entities#business_id/-QI8Qi8XWH3D8y8ethnajA,52129
1,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,40092
2,https://purl.archive.org/purl/yelp/yelp_entities#business_id/Eb1XmmLWyt_way5NNZ7-Pw,37553
3,https://purl.archive.org/purl/yelp/yelp_entities#business_id/c_4c5rJECZSfNgFj7frwHQ,37511
4,https://purl.archive.org/purl/yelp/yelp_entities#business_id/4i4kmYm9wgSNyF1b6gKphg,31163
5,https://purl.archive.org/purl/yelp/yelp_entities#business_id/8O35ji_yOMVJmZ6bl96yhQ,29599
6,https://purl.archive.org/purl/yelp/yelp_entities#business_id/VQcCL9PiNL_wkGf-uF3fjg,28917
7,https://purl.archive.org/purl/yelp/yelp_entities#business_id/ac1AeYqs8Z4_e2X5M3if2A,21527
8,https://purl.archive.org/purl/yelp/yelp_entities#business_id/QTbahs-GVuWYL5yfdjH34A,21478
9,https://purl.archive.org/purl/yelp/yelp_entities#business_id/ytynqOUb3hjKeJfRj5Tshw,18604


In [20]:
# CQ 9 (ground truth): rows per business in the exploded check-in frame, ten largest first.
checkins.value_counts(subset=["business_id"], sort=True, ascending=False).head(10)

business_id           
-QI8Qi8XWH3D8y8ethnajA    52144
FEXhWNCMkv22qG04E83Qjg    40109
Eb1XmmLWyt_way5NNZ7-Pw    37562
c_4c5rJECZSfNgFj7frwHQ    37518
4i4kmYm9wgSNyF1b6gKphg    31168
8O35ji_yOMVJmZ6bl96yhQ    29606
VQcCL9PiNL_wkGf-uF3fjg    28927
ac1AeYqs8Z4_e2X5M3if2A    21542
QTbahs-GVuWYL5yfdjH34A    21487
ytynqOUb3hjKeJfRj5Tshw    18615
dtype: int64

### CQ 10: How many people have written a review on Yelp?

In [109]:
# CQ 10 (knowledge graph): count the distinct objects of schema:author, i.e. distinct authors.
sparql_query = """
SELECT COUNT(DISTINCT(?user)) AS ?countUsers
WHERE {
    ?review schema:author ?user .
}
"""
run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,countUsers.value
0,1987929


In [7]:
# CQ 10 (ground truth): number of distinct user ids that appear in the review data.
unique_reviewers = reviews['user_id'].drop_duplicates()
len(unique_reviewers)

1987929

In [11]:
# CQ 10, consistency check: review authors that are not typed schema:Person in the graph.
# NOTE(review): MINUS is written before the triple pattern it is meant to subtract from. Under
# standard SPARQL evaluation MINUS only removes from the patterns that precede it, so the
# result may depend on the endpoint — consider placing the author triple first and verify.
sparql_query = """
SELECT DISTINCT(?user)
WHERE {
    MINUS {
        ?user rdfs:Class schema:Person .
        }
    ?review schema:author ?user .
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,user.value
0,https://purl.archive.org/purl/yelp/yelp_entities#user_id/dWZlWFtsEXFVq_vulT00lA
1,https://purl.archive.org/purl/yelp/yelp_entities#user_id/MaengE6zJ6k_d5e6nwnVaA
2,https://purl.archive.org/purl/yelp/yelp_entities#user_id/tquAg8GqbhN5k6Hkd23M0A
3,https://purl.archive.org/purl/yelp/yelp_entities#user_id/433BzxUeQAmRmK0g06UAfA
4,https://purl.archive.org/purl/yelp/yelp_entities#user_id/sxxnBQb15fOyg30JInIKqw
5,https://purl.archive.org/purl/yelp/yelp_entities#user_id/5iBVQ3OeK8lV4Z_4PXc1Xw
6,https://purl.archive.org/purl/yelp/yelp_entities#user_id/u8cq-5zzD7dPSa3LR8rIMw
7,https://purl.archive.org/purl/yelp/yelp_entities#user_id/5XiPz5mJK_RtJQVkXIqxYg
8,https://purl.archive.org/purl/yelp/yelp_entities#user_id/I200IyE9DCxJvvof2wnO6A
9,https://purl.archive.org/purl/yelp/yelp_entities#user_id/3N6-acEgosQSbipmBZKoSg


In [13]:
# CQ 10 (knowledge graph), repeated: same distinct-author count as the first CQ 10 query.
sparql_query = """
SELECT COUNT(DISTINCT(?user)) AS ?countUsers
WHERE {
    ?review schema:author ?user .
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,countUsers.value
0,1987929


### CQ 11: How many users have 10 friends?

In [5]:
# CQ 11 (knowledge graph): the inner query groups schema:knows links per schema:Person and
# keeps the users with exactly 10 of them; the outer query counts those users.
sparql_query = """
SELECT COUNT(*) as ?usersWith10Friends
WHERE {
    SELECT ?user COUNT(?friend) AS ?countUsers
    WHERE {
        ?user rdfs:Class schema:Person .
        ?user schema:knows ?friend .
    }
    GROUP BY ?user
    HAVING (COUNT(?friend) = 10)
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,callret-0.value
0,13579


In [11]:
# CQ 11 (ground truth): number of friends per user, derived from the comma-separated
# 'friends' string.
# Users without friends carry the literal string 'None' (the CQ 12 JSON check tests for the
# same sentinel). Splitting that string would report one friend, so it is mapped to 0, as is
# an empty or missing value.
users["amountFriends"] = users["friends"].apply(
    lambda friends: 0 if not friends or friends == 'None' else len(friends.split(","))
)
# Users with exactly ten friends.
users[users["amountFriends"] == 10].shape[0]

13579

### CQ 12: How many friends does a user have on average?
TO DO: Is not giving correct result

In [99]:
# CQ 12 (knowledge graph): average friends per user = (number of schema:knows links) divided by
# (number of subjects typed schema:Person), each computed in its own sub-query.
# NOTE(review): the friend count is not restricted to schema:Person subjects, unlike the user
# count — confirm that only persons have schema:knows links.
# NOTE(review): only ?averageFriends is projected here, yet the table displayed below also has
# countUser/countFriends columns, so that output presumably comes from an earlier revision.
sparql_query="""
SELECT (xsd:double(?countFriends) / xsd:double(?countUser) AS ?averageFriends)
WHERE {
    {SELECT (COUNT(?user) AS ?countUser)
    WHERE {
        ?user rdfs:Class schema:Person .
        }}
    {SELECT (COUNT(?friend) AS ?countFriends)
    WHERE {
        ?user schema:knows ?friend .
        }}
}
"""

user_friend_counts = run_query(sparql_query, as_dataframe=True)

In [100]:
# Display the result of the CQ 12 query.
user_friend_counts

Unnamed: 0,countUser.value,countFriends.value,averageFriends.value
0,1987897,105225474,52.9331


In [42]:
# CQ 12 (ground truth): stream the user file one JSON record per line instead of loading it
# into a DataFrame, and record how many friends each user has.
# The file is located with get_path, like every other dataset in this notebook, rather than
# through a machine-specific absolute path. json and numpy come from the import cell.
number_of_friends = []
with open(file=get_path("yelp_academic_dataset_user.json"), mode="r") as file:
    for line in file:
        # 'friends' is one ', '-separated string of user ids.
        friend_ids = json.loads(line)['friends']
        # Users without friends carry the literal string 'None'; an empty or missing value is
        # treated the same way so that it is not counted as one friend.
        if friend_ids and friend_ids != 'None':
            number_of_friends.append(len(friend_ids.split(', ')))
        else:
            number_of_friends.append(0)
# Mean over all users, including those with zero friends.
mean_num_friends = np.mean(number_of_friends)
print(mean_num_friends)

52.93306142119033

In [43]:
# Number of users with zero friends. list.count returns 0 when there is no such user, whereas
# indexing dict(Counter(...)) with 0 raised KeyError in that case.
number_of_friends.count(0)

878551

In [45]:
# Number of user records read (one list entry per user).
len(number_of_friends)

1987897

In [44]:
# Total number of friend links over all users.
sum(number_of_friends)

105225474

In [15]:
# Collect every line of the "none list" file that mentions 'friend'.
# NOTE(review): this reads a machine-specific absolute path; how the file was produced is not
# shown in this notebook.
with open(file="/home/ubuntu/none_list_user.txt", mode="r") as file:
    number_of_nonefriends = [entry for entry in file if 'friend' in entry]


In [31]:
# Number of collected lines; compare with the zero-friend user count from the JSON data.
len(number_of_nonefriends)

878551

### CQ 13: How many users have authored 10 reviews?

In [84]:
# CQ 13 (knowledge graph): the inner query groups reviews per schema:Person author and keeps
# the users with exactly 10 reviews; the outer query counts those users.
sparql_query = """
SELECT COUNT(DISTINCT(?user)) AS ?countUsers
WHERE {
    SELECT ?user COUNT(?review) as ?numberOfReviews
    WHERE {
        ?user rdfs:Class schema:Person .
        ?review schema:author ?user .
    }
    GROUP BY ?user
    HAVING (COUNT(?review) = 10)
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,countUsers.value
0,14119


In [17]:
# CQ 13 (ground truth): reviews per user, then the number of users with exactly ten.
reviews_per_user = reviews.groupby("user_id").size()
int((reviews_per_user == 10).sum())

14119

### CQ 14: How many reviews did users make in May 2018?

In [12]:
# CQ 14 (knowledge graph): derive year and month from each review's schema:dateCreated,
# restrict them to 2018 and 5 through VALUES, and count the matching reviews.
sparql_query = """
SELECT ?year ?month COUNT(?review) as ?countReviews
WHERE {
    ?review rdfs:Class schema:UserReview .
    ?review schema:dateCreated ?date .
    BIND (month(?date) as ?month) .
    BIND (year(?date) as ?year) .
    VALUES ?year {2018}
    VALUES ?month {5}
}
GROUP BY ?year ?month
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,year.value,month.value,countReviews.value
0,2018,5,79434


In [3]:
# CQ 14 (ground truth, pandas): add year and month columns derived from the review timestamp.
review_dates = reviews['date']
reviews['YEAR'] = review_dates.dt.year
reviews['MONTH'] = review_dates.dt.month
# Reviews written in May 2018.
in_may_2018 = (reviews['YEAR'] == 2018) & (reviews['MONTH'] == 5)
len(reviews[in_may_2018])

79434

In [126]:
from datetime import datetime
# CQ 14 (ground truth, streaming): parse each review's date straight from the JSON Lines file
# and count the reviews dated May 2018. Note that `reviewers` holds a number of reviews.
with open(file=get_path("yelp_academic_dataset_review.json"), mode="r") as file:
    review_stamps = (
        datetime.strptime(json.loads(line)['date'], '%Y-%m-%d %H:%M:%S')
        for line in file
    )
    reviewers = sum(1 for stamp in review_stamps if (stamp.year, stamp.month) == (2018, 5))
reviewers

79434

### CQ 15: What parking options can a business provide?

In [27]:
# CQ 15 (knowledge graph): list the distinct predicates used on subjects typed
# schema:ParkingFacility. The MINUS block drops the solutions in which the object is the
# subject's own class, so the typing triple itself is not reported.
sparql_query = """
SELECT DISTINCT ?p
WHERE {
    ?s rdfs:Class schema:ParkingFacility.
    ?s ?p ?parking .
    MINUS {
        ?s rdfs:Class ?parking.
        }
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,p.value
0,https://purl.archive.org/purl/yelp/ontology#hasgarage
1,https://purl.archive.org/purl/yelp/ontology#haslot
2,https://purl.archive.org/purl/yelp/ontology#hasstreet
3,https://purl.archive.org/purl/yelp/ontology#hasvalet
4,https://purl.archive.org/purl/yelp/ontology#hasvalidated


In [71]:
# Initialize empty list to store business parking options
BusinessParking = []
# Open JSON file for reading
with open(file=get_path("yelp_academic_dataset_business.json"), mode="r") as file:
    # Iterate through each line in the file
    for line in file:
        # Parse line as a dictionary
        data = json.loads(line)
        try:
            # Extract 'BusinessParking' value from dictionary
            _dict = data['attributes']['BusinessParking']
            # If 'BusinessParking' value is a string, modify it and parse as a dictionary
            if isinstance(_dict, str):
                _dict = _dict.replace("'", '"').replace("None", "null").replace('u"', '"').replace("True", "true").replace("False", "false") 
                _dict = json.loads(_dict)
            # Store modified dictionary in 'parkingopt' variable
            parkingopt = _dict
            # Append 'parkingopt' to 'BusinessParking' list
            BusinessParking.append(parkingopt)
        # If any errors are raised (e.g. missing keys), do nothing and continue to next iteration
        except:
            pass


In [72]:
# Initialize empty list to store parking option names
num_parkingopt = []
# Iterate through each business parking option dictionary
for parkings in BusinessParking:
    # If the dictionary is not empty (i.e. is not 'None')
    if parkings is not None:
        # Iterate through each key (i.e. parking option name) in the dictionary
        for parkingopt in parkings.keys():
            # Append the parking option name to the list
            num_parkingopt.append(parkingopt)
# Convert list to a set to remove duplicate values, and assign to 'set(num_parkingopt)'
set(num_parkingopt)


{'garage', 'lot', 'street', 'valet', 'validated'}

### CQ 16: How many businesses have karaoke music?

In [89]:
# CQ 16 (knowledge graph): count distinct businesses whose yelpont:hasMusic node has
# yelpont:haskaraoke set to 1.
sparql_query = """
SELECT COUNT(DISTINCT ?business) AS ?businessesWithKaraoke
WHERE {
    ?business yelpont:hasMusic ?blank .
    ?blank yelpont:haskaraoke 1 .
}
"""

run_query(query=sparql_query, as_dataframe=True)

Unnamed: 0,businessesWithKaraoke.value
0,75


In [106]:
# CQ 16 (ground truth): collect the 'karaoke' flag of every business that declares one.
karaoke_values = []

with open(file=get_path("yelp_academic_dataset_business.json"), mode="r") as file:
    # One JSON record per line.
    for line in file:
        record = json.loads(line)
        try:
            music_dict = record['attributes']['Music']
            # The attribute is stored as the text of a Python literal; parse it with
            # ast.literal_eval instead of rewriting it into JSON with string replaces.
            if isinstance(music_dict, str):
                music_dict = ast.literal_eval(music_dict)
            karaoke = music_dict['karaoke']
        except (KeyError, TypeError, ValueError, SyntaxError):
            # No attributes, no Music entry, Music is None, no 'karaoke' key, or unparseable
            # text: this business contributes nothing. Only these expected cases are skipped;
            # anything else now surfaces instead of being swallowed by a bare except.
            continue
        karaoke_values.append(karaoke)
# Number of businesses whose karaoke flag is True. list.count gives 0 when there is none,
# whereas indexing dict(Counter(...)) with True raised KeyError in that case.
karaoke_values.count(True)