In [8]:
import gzip
import json

from rdflib import Namespace, Graph, URIRef, Literal, BNode
from rdflib.namespace import RDFS

from UtilityFunctions.flatten_dict import flatten_dictionary
from UtilityFunctions.get_data_path import get_path
from UtilityFunctions.schema_functions import get_schema_predicate, get_schema_type

In [9]:
# Namespace for schema.org vocabulary terms; predicates are built as `schema + "<term>"`.
schema = Namespace("https://schema.org/")
# Namespace rooted at https://example.org/.
example = Namespace("https://example.org/")

In [10]:
# Base IRI for business subjects; an identifier is appended to it to form the subject IRI.
business_uri = Namespace("https://www.yelp.com/biz/")
# Base IRI for user subjects; the user id is appended as the value of the `userid` query parameter.
user_uri = Namespace("https://www.yelp.com/user_details?userid=")

In [None]:
def create_nt_file(file_name: str):
    """Convert one Yelp JSON-lines dataset file into a gzipped N-Triples file.

    Every input line is parsed as a JSON object. The value of its first key is used as
    the subject identifier and each remaining (flattened) key/value pair becomes one or
    more triples about that subject. The triples of each line are serialized on their own
    and appended to ``/home/ubuntu/vol1/yelp_<entity>.nt.gz``.

    A line that fails to convert is reported on stdout and skipped, so a single bad
    record does not abort the whole file.

    Note: the output is opened in append mode, so running this twice for the same file
    writes its triples twice.

    :param file_name: Dataset file name such as ``"yelp_academic_dataset_business.json"``;
        it is resolved to a path with ``get_path``.
    """
    # Drop the 22-character "yelp_academic_dataset_" prefix and the ".json" suffix.
    entity_name = file_name[22:-5]
    file_path = get_path(file_name)

    # The subject namespace depends only on the file, so choose it once up front.
    if file_name in ["yelp_academic_dataset_business.json", "yelp_academic_dataset_checkin.json",
                     "yelp_academic_dataset_review.json"]:
        uri = business_uri
    else:  # user
        uri = user_uri

    # Both handles are managed by `with` so they are closed even if something raises
    # outside the per-line try block. The input encoding is explicit so that parsing does
    # not depend on the locale's default encoding.
    with gzip.open(filename=f"/home/ubuntu/vol1/yelp_{entity_name}.nt.gz", mode="at",
                   encoding="utf-8") as triple_file, \
            open(file=file_path, mode="r", encoding="utf-8") as file:
        for line in file:
            # Reset the names reported by the error handler: otherwise a failure early in
            # a record would print values left over from the previous record, or raise a
            # NameError on the very first one.
            subject = _predicate = _object = None
            try:
                G = Graph()
                line = json.loads(line)

                json_key = next(iter(line))  # Key of subject
                subject = line.pop(json_key)

                if file_name == "yelp_academic_dataset_review.json":
                    subject = line['business_id'] + "?hrid=" + subject  # Other uri for review
                    # NOTE(review): this states <user> schema:author <review>; schema.org's
                    # `author` usually points from the work to the person - confirm direction.
                    G.add(triple=(URIRef(user_uri + line["user_id"]),  # Subject
                                  URIRef(schema + "author"),  # Predicate
                                  URIRef(business_uri + subject)))  # Object
                    del line["user_id"]

                line = flatten_dictionary(line)  # Flattens the nested dictionary
                if file_name != 'yelp_academic_dataset_checkin.json':
                    # NOTE(review): RDFS.Class is used as the predicate here; rdf:type is the
                    # usual way to state an instance's class. Left unchanged because already
                    # generated output uses this predicate - confirm before switching.
                    G.add(triple=(URIRef(uri + subject),
                                  RDFS.Class,
                                  URIRef(get_schema_type(entity_name))))

                for _predicate, _object in line.items():
                    # Missing values (None or the text "None" in any casing) produce no triple.
                    if _object is None or str(_object).lower() == "none":
                        continue

                    if _predicate in ["categories", "date", "friends", "elite"]:  # String containing listed objects
                        _object = str(_object)
                        # "elite" is split on "," while the other list fields are split on ", ".
                        obj_lst = _object.split(", ") if _predicate != "elite" else _object.split(",")

                        predicate, object_type = get_schema_predicate(_predicate, _object, file_name)
                        for obj in obj_lst:
                            if _predicate == "date":
                                # "YYYY-MM-DD hh:mm:ss" -> "YYYY-MM-DDThh:mm:ss"
                                obj = obj.replace(" ", "T")
                            G.add(triple=(URIRef(uri + subject),  # Subject
                                          URIRef(predicate),  # Predicate
                                          Literal(obj, datatype=object_type)))  # Object

                    else:
                        if _predicate == "yelping_since":
                            _object = _object.replace(" ", "T")
                        predicate, object_type = get_schema_predicate(_predicate, _object, file_name)
                        G.add(triple=(URIRef(uri + subject),  # Subject
                                      URIRef(predicate),  # Predicate
                                      Literal(_object, datatype=object_type)))  # Object
                triple_file.write(G.serialize(format='nt'))
            except Exception as e:
                # Best effort: report the failing record and carry on with the next line.
                print(e)
                print(subject, _predicate, _object)

In [11]:
# Convert the four datasets handled by create_nt_file, one after another.
dataset_files = (
    "yelp_academic_dataset_business.json",
    "yelp_academic_dataset_checkin.json",
    "yelp_academic_dataset_review.json",
    "yelp_academic_dataset_user.json",
)
for file_name in dataset_files:
    create_nt_file(file_name=file_name)


KeyboardInterrupt



In [None]:
def create_tip_nt_file():
    """Convert the Yelp tip dataset into a gzipped N-Triples file.

    Each line of ``yelp_academic_dataset_tip.json`` is parsed as a JSON object. The tip
    is represented by a fresh blank node: the user named by ``user_id`` is linked to that
    node, and every remaining key/value pair becomes a triple with the blank node as
    subject. The triples of each line are serialized on their own and appended to
    ``/home/ubuntu/vol1/yelp_tip.nt.gz``.

    A line that fails to convert is reported on stdout and skipped.

    Note: the output is opened in append mode, so running this twice writes the triples
    twice.
    """
    file_name = "yelp_academic_dataset_tip.json"
    # Drop the 22-character "yelp_academic_dataset_" prefix and the ".json" suffix.
    entity_name = file_name[22:-5]
    file_path = get_path(file_name)

    # Both handles are managed by `with` so they are closed even if something raises
    # outside the per-line try block. The input encoding is explicit so that parsing does
    # not depend on the locale's default encoding.
    with gzip.open(filename=f"/home/ubuntu/vol1/yelp_{entity_name}.nt.gz", mode="at",
                   encoding="utf-8") as triple_file, \
            open(file=file_path, mode="r", encoding="utf-8") as file:
        for line in file:
            # Reset the names reported by the error handler: otherwise a failure early in
            # a record would print values left over from the previous record, or raise a
            # NameError on the very first one.
            subject = _predicate = _object = None
            try:
                G = Graph()
                line = json.loads(line)

                b_node = BNode()

                subject = line.pop("user_id")

                # The blank node is used as a node term on both sides. Wrapping it in
                # Literal() would turn it into a plain string and wrapping it in URIRef()
                # would emit a relative IRI, so the user -> tip link and the tip's own
                # triples would never refer to the same resource.
                # user, author, b_node
                G.add(triple=(URIRef(user_uri + subject),  # Subject
                              URIRef(schema + "author"),  # Predicate
                              b_node))  # Object

                # NOTE(review): RDFS.Class is used as the predicate here; rdf:type is the
                # usual way to state an instance's class - confirm before switching.
                G.add(triple=(b_node,
                              RDFS.Class,
                              URIRef(get_schema_type(entity_name))))

                for _predicate, _object in line.items():
                    predicate, object_type = get_schema_predicate(_predicate, _object, file_name)

                    if _predicate == "date":
                        # "YYYY-MM-DD hh:mm:ss" -> "YYYY-MM-DDThh:mm:ss"
                        obj = _object.replace(" ", "T")
                    elif _predicate == "business_id":
                        # NOTE(review): the full business IRI is built here but still stored
                        # as a typed literal; if it should join with the business subjects
                        # it probably needs to be a URIRef - confirm against object_type.
                        obj = business_uri + _object
                    else:
                        obj = _object

                    G.add(triple=(b_node,  # Subject
                                  URIRef(predicate),  # Predicate
                                  Literal(obj, datatype=object_type)))  # Object
                triple_file.write(G.serialize(format="nt"))
            except Exception as e:
                # Best effort: report the failing record and carry on with the next line.
                print(e)
                print(subject, _predicate, _object)

In [7]:
import gzip
from itertools import islice

# Preview at most the first 100 lines of the generated business triples.
# The encoding matches the one used when the file was written, and islice stops at
# end-of-file instead of printing empty strings when the file has fewer than 100 lines.
with gzip.open('/home/ubuntu/vol1/yelp_business.nt.gz', 'rt', encoding='utf-8') as f:
    for line in islice(f, 100):
        print(line)

<https://www.yelp.com/biz/Pns2l4eNsfO8kk83dixA6A> <http://www.w3.org/2000/01/rdf-schema#Class> <https://schema.org/LocalBusiness> .

<https://www.yelp.com/biz/Pns2l4eNsfO8kk83dixA6A> <https://schema.org/location> "CA"^^<http://www.w3.org/2001/XMLSchema#string> .

<https://www.yelp.com/biz/Pns2l4eNsfO8kk83dixA6A> <https://schema.org/reviewCount> "7"^^<http://www.w3.org/2001/XMLSchema#integer> .

<https://www.yelp.com/biz/Pns2l4eNsfO8kk83dixA6A> <https://schema.org/latitude> "34.4266787"^^<http://www.w3.org/2001/XMLSchema#float> .

<https://www.yelp.com/biz/Pns2l4eNsfO8kk83dixA6A> <https://schema.org/location> "Santa Barbara"^^<http://www.w3.org/2001/XMLSchema#string> .

<https://www.yelp.com/biz/Pns2l4eNsfO8kk83dixA6A> <https://schema.org/publicAccess> "0"^^<http://www.w3.org/2001/XMLSchema#string> .

<https://www.yelp.com/biz/Pns2l4eNsfO8kk83dixA6A> <https://schema.org/category> "Doctors"^^<http://www.w3.org/2001/XMLSchema#string> .

<https://www.yelp.com/biz/Pns2l4eNsfO8kk83dixA6A> <h