### Create graph with current version of morph_kgc

In [103]:
import morph_kgc

In [104]:
# morph_kgc configuration as INI text: a single section (GTFS-Madrid-Bench)
# whose `mappings` option points at the RML mapping file.
config = """
             [GTFS-Madrid-Bench]
             mappings: mapping.csv.ttl
         """

In [105]:
# Materialize the knowledge graph described by `config`; the captured log
# below reports the mapping rules retrieved and the triples generated.
g = morph_kgc.materialize(config)

INFO | 2023-05-03 14:07:46,377 | Parallelization is not supported for win32 when running as a library. If you need to speed up your data integration pipeline, please run through the command line.
INFO | 2023-05-03 14:07:47,628 | 156 mapping rules retrieved.
INFO | 2023-05-03 14:07:47,693 | Mapping partition with 81 groups generated.
INFO | 2023-05-03 14:07:47,694 | Maximum number of rules within mapping group: 14.
INFO | 2023-05-03 14:07:47,694 | Mappings processed in 1.315 seconds.
INFO | 2023-05-03 14:07:48,876 | Number of triples generated in total: 2001.


### Mapping file tests

In [109]:
import rdflib

In [152]:
# Parse the RML mapping file itself into an rdflib graph so it can be
# inspected and rewritten with SPARQL. NOTE: this rebinds `g`.
g = rdflib.Graph().parse('mapping.csv.ttl')

In [153]:
len(g)

942

In [144]:
# Rewrite every logical source so that rml:source no longer holds the file
# path directly but a blank-node sd:DatasetSpecification that carries the
# original value in sd:name, plus a nested data-transformation description.
#
# * Only the rml:source triple is deleted. A DELETE template whose variables
#   are not bound by WHERE is ignored by SPARQL Update, so listing extra
#   unbound patterns there would have no effect.
# * FILTER(!isBlank(...)) keeps the cell safe to re-run: a source whose
#   rml:source already is one of the inserted blank nodes is left alone
#   instead of being wrapped a second time.
q = """
        PREFIX rml: <http://semweb.mmlab.be/ns/rml#>
        PREFIX sd: <https://w3id.org/okn/o/sd/>

        DELETE {
            ?source rml:source ?source_file .
        }
        INSERT {
            ?source a rml:LogicalSource ;
                rml:source [
                    a sd:DatasetSpecification ;
                    sd:name ?source_file ;
                    sd:hasDataTransformation [
                        sd:hasSoftwareRequirements "pandas>=1.1.0" ;
                        sd:hasSourceCode [
                            sd:programmingLanguage "Python3.9"
                        ]
                    ]
                ] .
        }
        WHERE {
            ?source rml:source ?source_file .
            FILTER(!isBlank(?source_file))
        }
      """
# The update modifies `g` in place. `q_res` is assigned only so the name stays
# defined as before (presumably None for an update -- confirm against rdflib).
q_res = g.update(q)

In [141]:
len(g)

1098

In [154]:
# List the object of every rml:source triple currently in the mapping graph
# (one printed line per matching triple, so repeated files appear repeatedly).
q = """
         PREFIX rml: <http://semweb.mmlab.be/ns/rml#>

         SELECT ?source {
            ?h rml:source ?source
         }
      """

q_res = g.query(q)

for row in q_res:
    print(row['source'])

data/STOP_TIMES.csv
data/TRIPS.csv
data/ROUTES.csv
data/AGENCY.csv
data/STOPS.csv
data/CALENDAR.csv
data/CALENDAR.csv
data/CALENDAR_DATES.csv
data/CALENDAR_DATES.csv
data/FEED_INFO.csv
data/SHAPES.csv
data/SHAPES.csv
data/FREQUENCIES.csv


In [17]:
# Print every predicate/object pair attached to the logical source :source_000.
# NOTE(review): the query declares no empty prefix; presumably `:` resolves
# through the namespace bindings of the parsed mapping graph -- confirm.
q = """
         PREFIX rml: <http://semweb.mmlab.be/ns/rml#>

         SELECT ?r ?t {
            :source_000 ?r ?t
         }
      """

q_res = g.query(q)

for row in q_res:
    print(row['r'], row['t'])

http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://semweb.mmlab.be/ns/rml#LogicalSource
http://semweb.mmlab.be/ns/rml#source data/STOP_TIMES.csv
http://semweb.mmlab.be/ns/rml#referenceFormulation http://semweb.mmlab.be/ns/ql#CSV


# Incremental KG construction tests

In [1]:
import sys
sys.path.append('../src/')

In [5]:
import incremental_kg as inc

## Aux to disk

#### Create new graph

In [4]:
# Initial build with the disk-backed variant: no previous graph is passed
# (old_graph=None). In the recorded run every CSV is reported as new data,
# written under .aux/, and a snapshot is saved to snapshot.pkl.
g = inc.load_kg_aux_to_disk(
    aux_data_path='.aux',
    mapping_file='mapping.csv.ttl',
    snapshot_file='snapshot.pkl',
    old_graph=None)

INFO | 2023-05-03 14:31:00,740 | Parallelization is not supported for win32 when running as a library. If you need to speed up your data integration pipeline, please run through the command line.


Found new data in data/STOPS.csv, saved to file .aux/data/STOPS.csv.
Found new data in data/FREQUENCIES.csv, saved to file .aux/data/FREQUENCIES.csv.
Found new data in data/SHAPES.csv, saved to file .aux/data/SHAPES.csv.
Found new data in data/ROUTES.csv, saved to file .aux/data/ROUTES.csv.
Found new data in data/TRIPS.csv, saved to file .aux/data/TRIPS.csv.
Found new data in data/AGENCY.csv, saved to file .aux/data/AGENCY.csv.
Found new data in data/STOP_TIMES.csv, saved to file .aux/data/STOP_TIMES.csv.
Found new data in data/CALENDAR.csv, saved to file .aux/data/CALENDAR.csv.
Found new data in data/CALENDAR_DATES.csv, saved to file .aux/data/CALENDAR_DATES.csv.
Found new data in data/FEED_INFO.csv, saved to file .aux/data/FEED_INFO.csv.
Saved snapshot to snapshot.pkl


INFO | 2023-05-03 14:31:01,898 | 156 mapping rules retrieved.
INFO | 2023-05-03 14:31:01,957 | Mapping partition with 81 groups generated.
INFO | 2023-05-03 14:31:01,957 | Maximum number of rules within mapping group: 14.
INFO | 2023-05-03 14:31:01,958 | Mappings processed in 1.218 seconds.
INFO | 2023-05-03 14:31:03,008 | Number of triples generated in total: 2001.


In [5]:
len(g)

2001

#### Add new triples

After the incremental update, the total number of triples should be greater than or equal to the previous total.

In [6]:
# Incremental build: the graph from the previous cell is passed as old_graph
# and the result is bound to `g` again. In the recorded run only
# data/AGENCY.csv had new rows and 7 triples were generated.
g = inc.load_kg_aux_to_disk(
    aux_data_path='.aux',
    mapping_file='mapping.csv.ttl',
    snapshot_file='snapshot.pkl',
    old_graph=g)

INFO | 2023-05-03 14:31:38,915 | Parallelization is not supported for win32 when running as a library. If you need to speed up your data integration pipeline, please run through the command line.


No new data in data/STOPS.csv, created empty file .aux/data/STOPS.csv.
No new data in data/FREQUENCIES.csv, created empty file .aux/data/FREQUENCIES.csv.
No new data in data/SHAPES.csv, created empty file .aux/data/SHAPES.csv.
No new data in data/ROUTES.csv, created empty file .aux/data/ROUTES.csv.
No new data in data/TRIPS.csv, created empty file .aux/data/TRIPS.csv.
Found new data in data/AGENCY.csv, saved to file .aux/data/AGENCY.csv.
No new data in data/STOP_TIMES.csv, created empty file .aux/data/STOP_TIMES.csv.
No new data in data/CALENDAR.csv, created empty file .aux/data/CALENDAR.csv.
No new data in data/CALENDAR_DATES.csv, created empty file .aux/data/CALENDAR_DATES.csv.
No new data in data/FEED_INFO.csv, created empty file .aux/data/FEED_INFO.csv.
Saved snapshot to snapshot.pkl


INFO | 2023-05-03 14:31:40,069 | 156 mapping rules retrieved.
INFO | 2023-05-03 14:31:40,128 | Mapping partition with 81 groups generated.
INFO | 2023-05-03 14:31:40,129 | Maximum number of rules within mapping group: 14.
INFO | 2023-05-03 14:31:40,130 | Mappings processed in 1.214 seconds.
INFO | 2023-05-03 14:31:41,024 | Number of triples generated in total: 7.


In [7]:
len(g)

2008

## Aux to mem

#### Create new graph

In [7]:
# Initial build with the in-memory variant (no auxiliary directory argument).
# NOTE(review): the recorded run reuses the snapshot.pkl written by the disk
# variant, so every file is reported as "No new data", and it then ends in
# `TypeError: materialize() got an unexpected keyword argument 'python_source'`.
# Presumably the installed morph_kgc does not accept that keyword -- confirm
# the required version, and start from a fresh snapshot file when re-running.
g = inc.load_kg_aux_to_mem(
    mapping_file='mapping.csv.ttl',
    snapshot_file='snapshot.pkl',
    old_graph=None)

No new data in data/STOPS.csv, created empty dataframe.
No new data in data/FEED_INFO.csv, created empty dataframe.
No new data in data/SHAPES.csv, created empty dataframe.
No new data in data/FREQUENCIES.csv, created empty dataframe.
No new data in data/ROUTES.csv, created empty dataframe.
No new data in data/CALENDAR_DATES.csv, created empty dataframe.
No new data in data/CALENDAR.csv, created empty dataframe.
No new data in data/STOP_TIMES.csv, created empty dataframe.
No new data in data/AGENCY.csv, created empty dataframe.
No new data in data/TRIPS.csv, created empty dataframe.
Saved snapshot to snapshot.pkl


TypeError: materialize() got an unexpected keyword argument 'python_source'

In [None]:
len(g)

2001

#### Add new triples

After the incremental update, the total number of triples should be greater than or equal to the previous total.

In [None]:
# Incremental build with the in-memory variant, passing the previous graph.
# NOTE(review): this cell has no execution count and the output shown below
# mentions .aux/ files, so it looks copied from the disk variant -- re-run
# to confirm the real behavior.
g = inc.load_kg_aux_to_mem(
    mapping_file='mapping.csv.ttl',
    snapshot_file='snapshot.pkl',
    old_graph=g)

INFO | 2023-05-03 14:31:38,915 | Parallelization is not supported for win32 when running as a library. If you need to speed up your data integration pipeline, please run through the command line.


No new data in data/STOPS.csv, created empty file .aux/data/STOPS.csv.
No new data in data/FREQUENCIES.csv, created empty file .aux/data/FREQUENCIES.csv.
No new data in data/SHAPES.csv, created empty file .aux/data/SHAPES.csv.
No new data in data/ROUTES.csv, created empty file .aux/data/ROUTES.csv.
No new data in data/TRIPS.csv, created empty file .aux/data/TRIPS.csv.
Found new data in data/AGENCY.csv, saved to file .aux/data/AGENCY.csv.
No new data in data/STOP_TIMES.csv, created empty file .aux/data/STOP_TIMES.csv.
No new data in data/CALENDAR.csv, created empty file .aux/data/CALENDAR.csv.
No new data in data/CALENDAR_DATES.csv, created empty file .aux/data/CALENDAR_DATES.csv.
No new data in data/FEED_INFO.csv, created empty file .aux/data/FEED_INFO.csv.
Saved snapshot to snapshot.pkl


INFO | 2023-05-03 14:31:40,069 | 156 mapping rules retrieved.
INFO | 2023-05-03 14:31:40,128 | Mapping partition with 81 groups generated.
INFO | 2023-05-03 14:31:40,129 | Maximum number of rules within mapping group: 14.
INFO | 2023-05-03 14:31:40,130 | Mappings processed in 1.214 seconds.
INFO | 2023-05-03 14:31:41,024 | Number of triples generated in total: 7.


In [None]:
len(g)

2008

#### Test query

In [8]:
# Sanity check on the generated graph: print each gtfs:Agency together with
# its gtfs:fareUrl value.
q3 = """
         PREFIX gtfs: <http://vocab.gtfs.org/terms#>

         SELECT ?agency ?url WHERE {
            ?agency a gtfs:Agency.
            ?agency gtfs:fareUrl ?url
         }
      """

q3_res = g.query(q3)

for row in q3_res:
    print(row['agency'], row['url'])

http://transport.linkeddata.es/madrid/agency/00000000000000000001 https://www.crtm.es/billetes-y-tarifas
http://transport.linkeddata.es/madrid/agency/00000000000000000002 https://www.crtm.es/billetes-y-tarifasaa


In [9]:
# Select the gtfs:Stop resources whose gtfs:locationType is LocationType/2,
# with description, coordinates and wheelchair accessibility as optional
# bindings. Only the stop IRI and its latitude/longitude are printed; the
# coordinates sit in an OPTIONAL group, so they may be unbound for some rows.
# NOTE: this rebinds `q3` and `q3_res` from the previous cell.
q3 = """
         PREFIX gtfs: <http://vocab.gtfs.org/terms#>
         PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
         PREFIX dct: <http://purl.org/dc/terms/>

         SELECT * WHERE {
             ?stop a gtfs:Stop . 
             ?stop gtfs:locationType ?location .
             OPTIONAL { ?stop dct:description ?stopDescription . }
             OPTIONAL { 
                 ?stop geo:lat ?stopLat . 
                 ?stop geo:long ?stopLong .
             }
             OPTIONAL {?stop gtfs:wheelchairAccessible ?wheelchairAccessible . }
             FILTER (?location=<http://transport.linkeddata.es/resource/LocationType/2>)
         }
      """

q3_res = g.query(q3)

for row in q3_res:
    print(row['stop'], row['stopLat'], row['stopLong'])

http://transport.linkeddata.es/madrid/metro/stops/000000000000000000ho 929.0 889.0
http://transport.linkeddata.es/madrid/metro/stops/000000000000000000dr 697.0 657.0
http://transport.linkeddata.es/madrid/metro/stops/000000000000000000qt 750.0 710.0
http://transport.linkeddata.es/madrid/metro/stops/000000000000000000e4 476.0 436.0
http://transport.linkeddata.es/madrid/metro/stops/000000000000000000dz 716.0 676.0
http://transport.linkeddata.es/madrid/metro/stops/000000000000000000tm 579.0 539.0
http://transport.linkeddata.es/madrid/metro/stops/00000000000000000036 151.0 111.0
http://transport.linkeddata.es/madrid/metro/stops/0000000000000000006o 231.0 191.0
http://transport.linkeddata.es/madrid/metro/stops/000000000000000000xj 739.0 699.0
http://transport.linkeddata.es/madrid/metro/stops/000000000000000000gv 441.0 401.0
http://transport.linkeddata.es/madrid/metro/stops/000000000000000000lh 87.0 47.0
http://transport.linkeddata.es/madrid/metro/stops/0000000000000000007q 830.0 790.0


## Snapshot inspection

In [12]:
import pickle

In [13]:
# Load the pickled snapshot for inspection; the next cell indexes it by the
# data-source path. SECURITY: pickle.load can execute arbitrary code, so only
# open snapshot files that were produced locally by this project.
with open('snapshot.pkl', 'rb') as f:
    snapshot = pickle.load(file=f)

In [18]:
snapshot['data/AGENCY.csv']

Unnamed: 0,agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url
0,1,1,http://www.crtm.es,1,1,1,https://www.crtm.es/billetes-y-tarifas
1,2,1,http://www.crtm.es,1,1,1,https://www.crtm.es/billetes-y-tarifasaa


### Old code (to be deleted)

In [None]:
def diff_dir(path: str, snapshot: str):
    """Write the rows of each CSV in ``path`` that the snapshot has not seen yet.

    Every regular file directly inside ``path`` is read as a CSV (all columns
    as strings) and compared with the dataframe stored for it in the pickled
    snapshot. Rows that are not in the snapshot are written to a file of the
    same name in the sibling directory ``<path>_new`` (a file without data
    rows when nothing is new). The snapshot is then extended with those rows
    and written back to ``snapshot``.

    Rows that disappeared from a source file are not reported and stay in the
    snapshot; duplicated rows inside a source file are reported once.

    Args:
        path: directory that holds the input CSV files. Trailing path
            separators are ignored.
        snapshot: path of the pickle file holding a dict that maps
            ``'<path>/<file name>'`` to the dataframe of rows already seen.
            It is created when it does not exist.

    Returns:
        None. Results are written to ``<path>_new`` and to ``snapshot``.
    """
    # Local imports: the notebook never imports os/pandas at top level, so the
    # function must not rely on kernel state to be callable.
    import os
    import pickle

    import pandas as pd

    # Drop trailing separators. Otherwise '<path>_new' would be created
    # *inside* the data directory and then be picked up as an input file.
    path = path.rstrip('/' + os.sep) or path

    if os.path.exists(snapshot):
        # SECURITY: unpickling executes arbitrary code; only load snapshots
        # written by this function from a trusted location.
        with open(snapshot, 'rb') as f:
            sp = pickle.load(file=f)
    else:
        # first version: nothing has been seen yet
        sp = dict()

    # directory that receives the per-file deltas
    new_dir = path + '_new'
    os.makedirs(new_dir, exist_ok=True)

    # sorted() only makes the processing/log order deterministic
    for name in sorted(os.listdir(path)):
        # Also used as snapshot key; format kept so existing snapshots match.
        filename = path + '/' + name
        if not os.path.isfile(filename):
            continue  # sub-directories cannot be read as CSV

        df_ds = pd.read_csv(filename, dtype=str)  # source dataframe
        # A file added after the first snapshot simply has no history yet.
        df_sp = sp.get(filename, pd.DataFrame())  # snapshot dataframe

        # New rows = rows of the source that are absent from the snapshot.
        candidates = df_ds.drop_duplicates()
        if df_sp.empty:
            new_data = candidates
        else:
            seen = df_sp.drop_duplicates()
            # The snapshot is concatenated twice so that each of its rows is
            # guaranteed to be a duplicate and is removed by keep=False; only
            # rows that occur solely in the source survive.
            new_data = pd.concat([seen, seen, candidates]).drop_duplicates(keep=False)

        # save new data to the delta directory
        new_file_path = new_dir + '/' + name
        new_data.to_csv(new_file_path, index=False)
        if len(new_data) == 0:
            print("No new data in %s, created file %s" % (name, new_file_path))
        else:
            print("Saved new data to %s" % (new_file_path))

        # current snapshot = old + new (new_data has no rows in common with df_sp)
        sp[filename] = new_data if df_sp.empty else pd.concat([df_sp, new_data])

    # save snapshot
    with open(snapshot, 'wb') as f:
        pickle.dump(obj=sp, file=f)
    print("Saved snapshot to", snapshot)

#diff_dir('./data', 'snapshot.pkl')