In [1]:
import sys
sys.path.append(sys.path[0][:sys.path[0].find('DVML-P7') + len('DVML-P7')])
import os
import gzip
import datetime
import numpy as np
import pandas as pd
from collections import Counter
from rdflib import Namespace, Graph, URIRef, Literal, XSD
from rdflib.namespace import RDFS

from Code.UtilityFunctions.wikidata_functions import wikidata_query, get_subclass_of_wikientity, category_query, min_qid, categories_dict_singular
from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.string_functions import space_words_lower
from Code.UtilityFunctions.run_query import run_query

In [24]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)
categories = list(biz['categories'].str.cat(sep=', ').split(sep=', '))

category_occurences = pd.DataFrame(list(dict(Counter(categories)).items()),
                                columns=['category', 'occurences'
                                        ]).sort_values(by='occurences',
                                                        ascending=False)
# Maps the split categories to the original categories
category_occurences['split_category'] = category_occurences['category'].map(categories_dict_singular(categories))
category_occurences = category_occurences.explode('split_category')

# Maps the yelp categories that are already mapped to a schemaType to the original category.
class_mapping = pd.read_csv(get_path('class_mappings_manual.csv'))
class_mapping['SchemaType'] = class_mapping['SchemaType'].apply(lambda x: eval(x)[0])
category_occurences['split_category'] = category_occurences['split_category'].apply(lambda x: x.title().replace(' ', ''))
category_occurences = category_occurences.merge(class_mapping,
                                            left_on='category',
                                            right_on='YelpCategory',
                                            how='left').drop(columns=['YelpCategory'])
category_occurences['schema_or_yelp_category'] = category_occurences['SchemaType'].fillna(category_occurences['split_category'])



In [25]:
category_occurences

Unnamed: 0,category,occurences,split_category,SchemaType,schema_or_yelp_category
0,Restaurants,52268,Restaurant,Restaurant,Restaurant
1,Food,27781,Food,,Food
2,Shopping,24395,Shopping,Retail,Retail
3,Home Services,14356,HomeService,,HomeService
4,Beauty & Spas,14292,Beauty,DaySpa,DaySpa
...,...,...,...,...,...
1422,Beach Bars,1,BeachBar,,BeachBar
1423,DUI Schools,1,DuiSchool,,DuiSchool
1424,Patent Law,1,PatentLaw,,PatentLaw
1425,Housing Cooperatives,1,HousingCooperative,,HousingCooperative


In [26]:
cat = space_words_lower('organization')
wikidata_cat_query = wikidata_query(category_query(
category=cat))  # Querys wikidata for the QID of the category

wikidata_cat_query

Unnamed: 0,item.type,item.value,itemLabel.xml:lang,itemLabel.type,itemLabel.value,itemDescription.xml:lang,itemDescription.type,itemDescription.value
0,uri,http://www.wikidata.org/entity/Q43229,en,literal,organization,en,literal,social entity established to meet needs or pur...
1,uri,http://www.wikidata.org/entity/Q190864,en,literal,central securities depository,en,literal,organization
2,uri,http://www.wikidata.org/entity/Q266706,en,literal,Jewish Combat Organization,en,literal,organization
3,uri,http://www.wikidata.org/entity/Q2985586,en,literal,African Sports Confederation of Disabled,en,literal,organization
4,uri,http://www.wikidata.org/entity/Q2997776,en,literal,Swedish Volunteer Corps,en,literal,organization
5,uri,http://www.wikidata.org/entity/Q2895553,en,literal,St. Petersburg International Economic Forum,en,literal,organization
6,uri,http://www.wikidata.org/entity/Q3067772,en,literal,Nevers manufactory,en,literal,organization
7,uri,http://www.wikidata.org/entity/Q2914877,en,literal,Sociedad Económica de los Amigos del País,en,literal,organization
8,uri,http://www.wikidata.org/entity/Q1203564,en,literal,Mineralogical Society of Germany,en,literal,organization
9,uri,http://www.wikidata.org/entity/Q1359821,en,literal,Commando Companies (Germany),en,literal,organization


In [27]:
qid = min_qid(wikidata_cat_query)
i = ['Q1', 'Q2', 'Q3', 'Q4']
l = [j for j in i if j in ['Q1', 'Q2']]
qid

('Q43229', 'organization')

In [28]:
qid[0]

'Q43229'

In [29]:
qid

('Q43229', 'organization')

In [30]:
def instance_of_query(qid: str):
    query = f"""
    SELECT DISTINCT ?instanceOf ?instanceOfLabel
    WHERE {{
      VALUES ?item {{wd:{qid[0]}}}
      ?item wdt:P31 ?instanceOf .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """
    instance_of = wikidata_query(query)
    return instance_of

instance_of_query = instance_of_query(qid)

In [35]:
test = {}
if not instance_of_query.empty:
    test['a'] = wikidata_cat_query.loc[wikidata_cat_query['item.value'] == instance_of_query['instanceOf.value'][0]][['item.value', 'itemLabel.value']].apply(lambda x: (x[0].split('/')[-1], x[1]), axis=1)[0]
test

{'a': 0    (Q43229, organization)
 dtype: object}

Unnamed: 0,item.value,itemLabel.value


In [2]:
import polars as pl

In [3]:
df = pl.scan_ndjson(get_path("yelp_academic_dataset_review.json"))

In [4]:
df

In [5]:
user_groupby = df.select('user_id').groupby('user_id').agg([pl.count()]).filter(pl.col('count') == 10)

In [6]:
pl.Config.set_fmt_str_lengths(n=1000)

polars.cfg.Config

In [9]:
user_groupby.collect()

user_id,count
str,u32
"""rvIfsVZTtak3BzxR87UuSQ""",10
"""asgZKo4Mkw8IU1KtZAnyug""",10
"""ufRiQbu5a-fvEW18SIRVqg""",10
"""_RCJEIa3z2tGJBl2RtYbhA""",10
"""6nMdYLVemz66PHGvMGmW5g""",10
"""GVJ2uKNoclob3QvHiQTMrQ""",10
"""DffAhz9tsnT4wUYdY9I3sg""",10
"""NdD5H7oSeG9Bx64e9AA2lg""",10
"""qTR0K7rrRMlM_-o8IqT-FA""",10
"""9RxgYH6vBoqGMO7zwLLOww""",10
