In [8]:
import sys
sys.path.append(sys.path[0][:sys.path[0].find('DVML-P7') + len('DVML-P7')])
import os
import gzip
import datetime
import numpy as np
import pandas as pd
from collections import Counter
from rdflib import Namespace, Graph, URIRef, Literal, XSD
from rdflib.namespace import RDFS

from Code.UtilityFunctions.wikidata_functions import wikidata_query, get_subclass_of_wikientity, category_query, min_qid, categories_dict_singular
from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.string_functions import space_words_lower
from Code.UtilityFunctions.run_query import run_query

In [17]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)
categories = list(biz['categories'].str.cat(sep=', ').split(sep=', '))

category_occurences = pd.DataFrame(list(dict(Counter(categories)).items()),
                                columns=['category', 'occurences'
                                        ]).sort_values(by='occurences',
                                                        ascending=False)
# Maps the split categories to the original categories
category_occurences['split_category'] = category_occurences['category'].map(categories_dict_singular(categories))
category_occurences = category_occurences.explode('split_category')

# Maps the yelp categories that are already mapped to a schemaType to the original category.
class_mapping = pd.read_csv(get_path('class_mappings.csv'))
class_mapping['SchemaType'] = class_mapping['SchemaType'].apply(lambda x: eval(x)[0])
category_occurences['split_category'] = category_occurences['split_category'].apply(lambda x: x.title().replace(' ', ''))
category_occurences = category_occurences.merge(class_mapping,
                                            left_on='category',
                                            right_on='YelpCategory',
                                            how='left').drop(columns=['YelpCategory'])
category_occurences['schema_or_yelp_category'] = category_occurences['SchemaType'].fillna(category_occurences['split_category'])



In [18]:
category_occurences

Unnamed: 0,category,occurences,split_category,SchemaType,schema_or_yelp_category
0,Restaurants,52268,Restaurant,Restaurant,Restaurant
1,Food,27781,Food,,Food
2,Shopping,24395,Shopping,Retail,Retail
3,Home Services,14356,HomeService,Service,Service
4,Beauty & Spas,14292,Beauty,DaySpa,DaySpa
...,...,...,...,...,...
1422,Beach Bars,1,BeachBar,Beach,Beach
1423,DUI Schools,1,DuiSchool,,DuiSchool
1424,Patent Law,1,PatentLaw,,PatentLaw
1425,Housing Cooperatives,1,HousingCooperative,,HousingCooperative


In [21]:
cat = space_words_lower('coffee')
wikidata_cat_query = wikidata_query(category_query(
category=cat))  # Querys wikidata for the QID of the category

wikidata_cat_query

Unnamed: 0,item.type,item.value,itemLabel.xml:lang,itemLabel.type,itemLabel.value,itemDescription.xml:lang,itemDescription.type,itemDescription.value
0,uri,http://www.wikidata.org/entity/Q8486,en,literal,coffee,en,literal,brewed beverage made from seeds of Coffea genus
1,uri,http://www.wikidata.org/entity/Q59047,en,literal,ristretto,en,literal,coffee
2,uri,http://www.wikidata.org/entity/Q3849770,en,literal,Marocchino,en,literal,coffee
3,uri,http://www.wikidata.org/entity/Q2932937,en,literal,coffee,en,literal,color


In [22]:
qid = min_qid(wikidata_cat_query)
i = ['Q1', 'Q2', 'Q3', 'Q4']
l = [j for j in i if j in ['Q1', 'Q2']]
qid

('Q8486', 'coffee')

In [32]:
query = f"""
  SELECT DISTINCT ?instanceOf ?instanceOfLabel
  WHERE {{
    VALUES ?item {{wd:{qid[0]}}}
    ?item wdt:P31 ?instanceOf .
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
  }}
"""
instance_of = wikidata_query(query)[['instanceOf.value', 'instanceOfLabel.value']]

In [30]:
instance_of.apply(lambda x: x[0].split('/')[-1], axis=1)

In [33]:
instance_of

Unnamed: 0,instanceOf.value,instanceOfLabel.value
0,http://www.wikidata.org/entity/Q63922515,hyperlocal manufacturing


In [25]:
# Query Wikidata for the QID of the split categories
category_qid2 = {}
for cat in category_occurences.itertuples():
    try:
        cat = space_words_lower(cat.schema_or_yelp_category)
        wikidata_cat_query = wikidata_query(category_query(category=cat))
        category_qid2[cat] = min_qid(wikidata_cat_query)
    except:
        pass
category_qid2

In [3]:
sparql_query = """
SELECT COUNT(*) as ?totalTriples
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
}
"""
run_query(sparql_query, as_dataframe=True)

Unnamed: 0,totalTriples.value
0,242247039


In [4]:
sparql_query = """
SELECT (COUNT(DISTINCT ?s) as ?numSubjects)
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
}
"""
run_query(sparql_query, as_dataframe=True)

Unnamed: 0,numSubjects.value
0,10495838


In [14]:
sparql_query = """
SELECT (COUNT(DISTINCT ?p) as ?numPredicates)
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
}
"""
run_query(sparql_query, as_dataframe=True)

Unnamed: 0,numPredicates.value
0,144


In [6]:
sparql_query = """
SELECT COUNT(DISTINCT ?o) as ?numObjects
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
}
"""
run_query(sparql_query, as_dataframe=True)

Unnamed: 0,numObjects.value
0,61450383


In [7]:
sparql_query = """
SELECT ?p COUNT(?p) as ?predicateCount
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
}
GROUP BY ?p
ORDER BY DESC(?predicateCount)
LIMIT 2
"""
run_query(sparql_query, as_dataframe=True)


Unnamed: 0,p.value,predicateCount.value
0,https://schema.org/knows,105225474
1,https://schema.org/checkinTime,13353332


In [8]:
sparql_query = """
SELECT ?class (COUNT(DISTINCT ?s) as ?numSubjects)
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s rdfs:Class ?class .
}
GROUP BY ?class
ORDER BY DESC(?numSubjects)
LIMIT 2
"""

run_query(sparql_query, as_dataframe=True)

Unnamed: 0,class.value,numSubjects.value
0,https://schema.org/UserReview,6990280
1,https://schema.org/Person,1987897


In [9]:
sparql_query = """
SELECT AVG(?outdegree) as ?avgOutdegree
WHERE{
  SELECT ?s COUNT(DISTINCT ?o) as ?outdegree
  FROM <http://www.yelpkg.com/yelp_kg>
  WHERE {
    ?s ?p ?o .
  }
  GROUP BY ?s
}
"""

run_query(sparql_query, as_dataframe=True)

Unnamed: 0,avgOutdegree.value
0,19.715217498593248


In [18]:
sparql_query = """
SELECT ?s ?p ?o
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
?s ?p ?o .
VALUES ?s {<https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg>}
}
"""

run_query(sparql_query, as_dataframe=True)

Unnamed: 0,s.value,p.value,o.value
0,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,http://www.w3.org/2000/01/rdf-schema#Class,https://schema.org/LocalBusiness
1,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://schema.org/address,800 Decatur St
2,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://schema.org/category,https://purl.archive.org/purl/yelp/business_categories#Souvenir_Shops
3,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://schema.org/category,https://purl.archive.org/purl/yelp/business_categories#Cafes
4,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://schema.org/category,https://purl.archive.org/purl/yelp/business_categories#Coffee_&_Tea
...,...,...,...
40131,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://purl.archive.org/purl/yelp/ontology#hasHours,Ne63b1901af6c4e22b0aca14a589c66b1
40132,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://purl.archive.org/purl/yelp/ontology#hasBusinessParking,N70f12339b640454b926de603ce49059a
40133,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://purl.archive.org/purl/yelp/ontology#hasGoodForMeal,N9624dce6c6e242ca92ba434792a9d1c0
40134,https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg,https://purl.archive.org/purl/yelp/ontology#locatedInCity,New Orleans


In [11]:
sparql_query = """
SELECT COUNT(DISTINCT ?p) as ?numPredicates COUNT(DISTINCT ?o) as ?numObjects
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
  ?s ?p ?o .
  VALUES ?s {<https://purl.archive.org/purl/yelp/yelp_entities#business_id/FEXhWNCMkv22qG04E83Qjg>}
}
"""

run_query(sparql_query, as_dataframe=True)

Unnamed: 0,numPredicates.value,numObjects.value
0,37,40124


In [17]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)

In [32]:
sparql_query = """
SELECT ?s ?p ?o
FROM <http://www.yelpkg.com/yelp_kg>
WHERE {
    VALUES ?p {yelpont:hasMonday}
   ?s ?p ?o.
}
"""

run_query(sparql_query, as_dataframe=True)

Unnamed: 0,s.value,p.value,o.value
0,N00a09517b21c44b489196de36e857a4c,https://purl.archive.org/purl/yelp/ontology#hasMonday,0:0-0:0
1,N00d12a77116645fd8e043277c0cf918d,https://purl.archive.org/purl/yelp/ontology#hasMonday,0:0-0:0
2,N00da3487ce9e46a9977c8b2bdb644043,https://purl.archive.org/purl/yelp/ontology#hasMonday,0:0-0:0
3,N025d032142464193a07a00f384d26e51,https://purl.archive.org/purl/yelp/ontology#hasMonday,0:0-0:0
4,N035eb2e66cab41f7b258886111054406,https://purl.archive.org/purl/yelp/ontology#hasMonday,0:0-0:0
...,...,...,...
114469,N6947230829f847359e41baf95b709b7d,https://purl.archive.org/purl/yelp/ontology#hasMonday,6:0-1:59
114470,Nb0cef65b2b1e484596b3d50a297dc34a,https://purl.archive.org/purl/yelp/ontology#hasMonday,8:0-17:50
114471,N89d59afc91cd4a78b76705a2d636b54e,https://purl.archive.org/purl/yelp/ontology#hasMonday,7:0-21:50
114472,Ncc14d76d191d49b59ef714c013f93d9c,https://purl.archive.org/purl/yelp/ontology#hasMonday,7:30-19:45


In [44]:
sparql_query="""SELECT (AVG(?friendCount) AS ?averageFriends)
WHERE {
  {
    SELECT ?user (COUNT(?friend) AS ?friendCount)
    WHERE {
      ?user rdfs:Class schema:Person .
      OPTIONAL{
      ?user schema:knows ?friend .}
    }
    GROUP BY ?user
  }
}"""

run_query(sparql_query, as_dataframe=True)

Unnamed: 0,averageFriends.value
0,52.93306142119033


In [43]:
sparql_query="""
    SELECT ?user (COUNT(?friend) AS ?friendCount)
    WHERE {
      ?user rdfs:Class schema:Person .
      OPTIONAL{
      ?user schema:knows ?friend .
      }
    }
    GROUP BY ?user
"""

x = run_query(sparql_query, as_dataframe=True)

In [39]:
x

Unnamed: 0,user.value,friendCount.value
0,https://purl.archive.org/purl/yelp/yelp_entiti...,239
1,https://purl.archive.org/purl/yelp/yelp_entiti...,147
2,https://purl.archive.org/purl/yelp/yelp_entiti...,29
3,https://purl.archive.org/purl/yelp/yelp_entiti...,213
4,https://purl.archive.org/purl/yelp/yelp_entiti...,35
...,...,...
1048571,https://purl.archive.org/purl/yelp/yelp_entiti...,0
1048572,https://purl.archive.org/purl/yelp/yelp_entiti...,0
1048573,https://purl.archive.org/purl/yelp/yelp_entiti...,63
1048574,https://purl.archive.org/purl/yelp/yelp_entiti...,0


In [41]:
x['friendCount.value'].apply(eval).sum()/(len(x))

54.85585021972656

In [30]:
len(x)

1048576

In [20]:
import json
import numpy as np
with open(file="/home/ubuntu/OneDrive/DVML-P7/Data/yelp_academic_dataset_user.json", mode="r") as file:
    number_of_friends = []
    for line in file:
        data = json.loads(line)
        number_of_friends.append(len(data['friends'].split(', ')))
np.mean(number_of_friends)

53.375011381374385

In [29]:
len(number_of_friends)

1987897

In [4]:
import json
import numpy as np
with open(file="/home/ubuntu/none_list_user.txt", mode="r") as file:
    number_of_nonefriends = []
    for line in file:
        if 'friend' in line:
            number_of_nonefriends.append(line)


In [36]:
len(x) + len(number_of_nonefriends) - len(number_of_friends)

-60770

In [30]:
sparql_query="""
    SELECT ?user ?friend
    WHERE {
      ?user rdfs:Class schema:Person .
      ?user schema:knows ?friend .
    }
    limit 100
"""

run_query(sparql_query, as_dataframe=True, include_types=True)

Unnamed: 0,user.type,user.value,friend.type,friend.datatype,friend.value
0,uri,https://purl.archive.org/purl/yelp/yelp_entiti...,typed-literal,http://www.w3.org/2001/XMLSchema#string,y6FOE7flsCXPncosjWVnQQ
1,uri,https://purl.archive.org/purl/yelp/yelp_entiti...,typed-literal,http://www.w3.org/2001/XMLSchema#string,Yj4SpqG0OJxbg8L1Qsrlsw
2,uri,https://purl.archive.org/purl/yelp/yelp_entiti...,typed-literal,http://www.w3.org/2001/XMLSchema#string,XhrlkiNB867EmtYuDTcq7Q
3,uri,https://purl.archive.org/purl/yelp/yelp_entiti...,typed-literal,http://www.w3.org/2001/XMLSchema#string,AM0QOXmg89jcQnQB9jZJoA
4,uri,https://purl.archive.org/purl/yelp/yelp_entiti...,typed-literal,http://www.w3.org/2001/XMLSchema#string,Onpce5OsIaeb6W1uNwjN0g
...,...,...,...,...,...
95,uri,https://purl.archive.org/purl/yelp/yelp_entiti...,typed-literal,http://www.w3.org/2001/XMLSchema#string,vxLl64e-LBy7JCUqkyvHUg
96,uri,https://purl.archive.org/purl/yelp/yelp_entiti...,typed-literal,http://www.w3.org/2001/XMLSchema#string,5iPqjzu6HGBPXBVaSf-KHQ
97,uri,https://purl.archive.org/purl/yelp/yelp_entiti...,typed-literal,http://www.w3.org/2001/XMLSchema#string,0193YItGi_9WfqVjNZlTLA
98,uri,https://purl.archive.org/purl/yelp/yelp_entiti...,typed-literal,http://www.w3.org/2001/XMLSchema#string,0a2wudzHBKueU7-JfLnddw
