In [1]:
import morph_kgc

In [2]:
# INI-style configuration string passed to morph_kgc.materialize below:
# a single section whose `mappings` option names the mapping file.
config = """
             [GTFS-Madrid-Bench]
             mappings: mapping.csv.ttl
         """

In [3]:
# Materialize the knowledge graph described by `config`
g = morph_kgc.materialize(config)

INFO | 2023-04-21 19:11:01,868 | Parallelization is not supported for win32 when running as a library. If you need to speed up your data integration pipeline, please run through the command line.
INFO | 2023-04-21 19:11:03,708 | 156 mapping rules retrieved.
INFO | 2023-04-21 19:11:03,766 | Mapping partition with 81 groups generated.
INFO | 2023-04-21 19:11:03,768 | Maximum number of rules within mapping group: 14.
INFO | 2023-04-21 19:11:03,768 | Mappings processed in 1.891 seconds.
INFO | 2023-04-21 19:11:04,789 | Number of triples generated in total: 2008.


## diff

In [7]:
import os
import pickle
import pandas as pd

def diff_dir(path: str, snapshot: str):
    """Compare the CSV files in ``path`` with a pickled snapshot of them.

    If ``snapshot`` does not exist yet, every file in ``path`` is read as CSV
    (all columns as strings) and the resulting DataFrames are pickled to
    ``snapshot``; nothing else is written.

    If ``snapshot`` exists, each file is compared with its snapshot DataFrame and
    the rows that occur in only one of the two are written to a file of the same
    name in ``<path>_new``. A file that is missing from the snapshot is treated as
    entirely new. The snapshot itself is not modified in this case.

    NOTE: a later cell of this notebook defines ``diff_dir`` again; once that cell
    has run, this version is shadowed.

    path: input data source path
    snapshot: path of snapshot
    """
    if os.path.exists(snapshot):
        # snapshot exists
        # NOTE: pickle.load can run arbitrary code; only load snapshots you created.
        with open(snapshot, 'rb') as f:
            sp = pickle.load(file=f)

        # the output directory is the same for every file, so create it once
        new_dir_path = path + '_new'
        os.makedirs(new_dir_path, exist_ok=True)

        # compare each file
        for name in os.listdir(path):
            filename = path + '/' + name  # also the snapshot key
            df_ds = pd.read_csv(filename, dtype=str)
            df_sp = sp.get(filename)

            if df_sp is None:
                # file was added after the snapshot was taken: all rows are new
                new_data = df_ds
            else:
                # find differences (assumes that new data is only in df_ds)
                new_data = pd.concat([df_sp, df_ds]).drop_duplicates(keep=False)

            # save new data to _new dir
            new_file_path = new_dir_path + '/' + name
            new_data.to_csv(new_file_path, index=False)
            if len(new_data) == 0:
                print("No new data in %s, created file %s" % (name, new_file_path))
            else:
                print("Saved new data to %s" % (new_file_path))
    else:
        # create snapshot: one DataFrame per source file, keyed by file path
        sp = dict()
        for name in os.listdir(path):
            filename = path + '/' + name
            sp[filename] = pd.read_csv(filename, dtype=str)
        # save snapshot
        with open(snapshot, 'wb') as f:
            pickle.dump(obj=sp, file=f)

In [8]:
def diff_dir(path: str, snapshot: str):
    """Extract the rows of every source file that are not yet in the snapshot.

    Each file in ``path`` is read as CSV with all columns as strings and compared
    with the DataFrame stored under the same file path in the pickled snapshot.
    Rows found in the source but not in the snapshot are written to a file of the
    same name in ``<path>_new`` (header only when nothing is new) and appended to
    the snapshot, which is written back to ``snapshot`` at the end.

    path: input data source path (directory); a trailing '/' is ignored
    snapshot: path of snapshot (pickle file, created on the first run)
    """
    # Without this, './data/' would yield './data/_new', i.e. a directory inside
    # the data directory that the loop below would then try to read as a CSV.
    path = path.rstrip('/\\') or path
    new_data_dir = path + '_new'

    # load snapshot (first version: start from an empty one)
    # NOTE: pickle.load can run arbitrary code; only load snapshots you created.
    if os.path.exists(snapshot):
        with open(snapshot, 'rb') as f:
            sp = pickle.load(file=f)
    else:
        sp = dict()

    # create temp dir for new data
    os.makedirs(new_data_dir, exist_ok=True)

    for name in sorted(os.listdir(path)):
        filename = path + '/' + name  # also the snapshot key, so keep the '/' form
        if not os.path.isfile(filename):
            continue  # sub-directories cannot be read as CSV

        # Collapse repeated source rows first: with keep=False below, a row that
        # occurs twice in the source would otherwise be discarded entirely.
        df_ds = pd.read_csv(filename, dtype=str).drop_duplicates()  # source dataframe
        df_sp = sp.get(filename)  # snapshot dataframe, None for a file not seen before

        if df_sp is None:
            new_data = df_ds
            sp[filename] = new_data
        else:
            # Listing the snapshot twice makes every snapshot row a duplicate, so
            # keep=False leaves exactly the rows that exist only in the source
            # (rows deleted from the source are not reported as new).
            new_data = pd.concat([df_sp, df_sp, df_ds]).drop_duplicates(keep=False)
            # save current snapshot = old + new
            sp[filename] = pd.concat([df_sp, new_data], ignore_index=True)

        # save new data to new_data_dir
        new_file_path = new_data_dir + '/' + name
        new_data.to_csv(new_file_path, index=False)
        if len(new_data) == 0:
            print("No new data in %s, created file %s" % (name, new_file_path))
        else:
            print("Saved new data to %s" % (new_file_path))

    # save snapshot
    with open(snapshot, 'wb') as f:
        pickle.dump(obj=sp, file=f)
        print("Saved snapshot to", snapshot)

In [None]:
#diff_dir('./data', 'snapshot.pkl')

In [9]:
import rdflib
def load_kg(path: str, mapping: str, snapshot: str, old_graph: rdflib.Graph = None):
    """Materialize only the data that is new since the last snapshot.

    Runs ``diff_dir`` on ``path``, writes a copy of the mapping file in which every
    occurrence of ``<basename of path>/`` is replaced by ``<basename of path>_new/``,
    and materializes that copy with morph_kgc.

    NOTE(review): the rewrite is a plain substring replacement on every mapping
    line, so any other text containing ``<basename>/`` is rewritten as well.

    path: input data source (directory); a trailing '/' is ignored
    mapping: path to mapping file
    snapshot: path to snapshot file
    old_graph: None or old version

    Returns the new graph, or ``old_graph + new_graph`` when ``old_graph`` is given.
    Raises ValueError if ``path`` has no directory name to rewrite.
    """
    # basename('./data/') is '', which would make the rewrite below replace
    # every '/' in the mapping; strip trailing separators first.
    path = path.rstrip('/\\') or path
    path_base_name = os.path.basename(path)
    if not path_base_name:
        raise ValueError("path must name a data directory, got %r" % path)

    diff_dir(path, snapshot)

    new_path = path_base_name + '_new/'
    path_base_name += '/'

    with open(mapping, 'r') as f:
        mapping_lines = [line.replace(path_base_name, new_path) for line in f]

    # Prefix the file name, not the whole path, so a mapping inside a
    # sub-directory gets its copy written next to it.
    mapping_dir, mapping_name = os.path.split(mapping)
    new_mapping_file = os.path.join(mapping_dir, '.new_' + mapping_name)
    with open(new_mapping_file, 'w') as f:
        f.writelines(mapping_lines)

    config = "[GTFS-Madrid-Bench]\nmappings: %s" % new_mapping_file
    new_graph = morph_kgc.materialize(config)

    # TODO: delete temp data dir?

    # return old_graph + new_graph
    return new_graph if old_graph is None else old_graph + new_graph

#### Create new graph

In [10]:
# old_graph=None: load_kg returns only the freshly materialized graph
g = load_kg(path='./data', mapping='mapping.csv.ttl', snapshot='snapshot.pkl', old_graph=None)

INFO | 2023-04-21 16:04:58,762 | Parallelization is not supported for win32 when running as a library. If you need to speed up your data integration pipeline, please run through the command line.


Saved new data to ./data_new/AGENCY.csv
Saved new data to ./data_new/CALENDAR.csv
Saved new data to ./data_new/CALENDAR_DATES.csv
Saved new data to ./data_new/FEED_INFO.csv
Saved new data to ./data_new/FREQUENCIES.csv
Saved new data to ./data_new/ROUTES.csv
Saved new data to ./data_new/SHAPES.csv
Saved new data to ./data_new/STOPS.csv
Saved new data to ./data_new/STOP_TIMES.csv
Saved new data to ./data_new/TRIPS.csv
Saved snapshot to snapshot.pkl


INFO | 2023-04-21 16:04:59,995 | 156 mapping rules retrieved.
INFO | 2023-04-21 16:05:00,058 | Mapping partition with 81 groups generated.
INFO | 2023-04-21 16:05:00,059 | Maximum number of rules within mapping group: 14.
INFO | 2023-04-21 16:05:00,059 | Mappings processed in 1.296 seconds.
INFO | 2023-04-21 16:05:01,174 | Number of triples generated in total: 2001.


In [11]:
# Size of the graph returned by load_kg above
len(g)

2001

#### Add new triples

The new total number of triples should be >= the previous total.

In [12]:
# Pass the previous graph so load_kg returns old_graph + newly materialized graph
g = load_kg(path='./data', mapping='mapping.csv.ttl', snapshot='snapshot.pkl', old_graph=g)

INFO | 2023-04-21 16:05:36,173 | Parallelization is not supported for win32 when running as a library. If you need to speed up your data integration pipeline, please run through the command line.


Saved new data to ./data_new/AGENCY.csv
No new data in CALENDAR.csv, created file ./data_new/CALENDAR.csv
No new data in CALENDAR_DATES.csv, created file ./data_new/CALENDAR_DATES.csv
No new data in FEED_INFO.csv, created file ./data_new/FEED_INFO.csv
No new data in FREQUENCIES.csv, created file ./data_new/FREQUENCIES.csv
No new data in ROUTES.csv, created file ./data_new/ROUTES.csv
No new data in SHAPES.csv, created file ./data_new/SHAPES.csv
No new data in STOPS.csv, created file ./data_new/STOPS.csv
No new data in STOP_TIMES.csv, created file ./data_new/STOP_TIMES.csv
No new data in TRIPS.csv, created file ./data_new/TRIPS.csv
Saved snapshot to snapshot.pkl


INFO | 2023-04-21 16:05:37,418 | 156 mapping rules retrieved.
INFO | 2023-04-21 16:05:37,480 | Mapping partition with 81 groups generated.
INFO | 2023-04-21 16:05:37,481 | Maximum number of rules within mapping group: 14.
INFO | 2023-04-21 16:05:37,482 | Mappings processed in 1.307 seconds.
INFO | 2023-04-21 16:05:38,492 | Number of triples generated in total: 7.


In [None]:
# Size of the combined graph (previous graph + newly materialized graph)
len(g)

#### Test query

In [None]:
# SPARQL query: every gtfs:Stop whose locationType is LocationType/2, with its
# description, coordinates and wheelchair accessibility when present (OPTIONAL).
q3 = """
         PREFIX gtfs: <http://vocab.gtfs.org/terms#>
         PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
         PREFIX dct: <http://purl.org/dc/terms/>

         SELECT * WHERE {
             ?stop a gtfs:Stop . 
             ?stop gtfs:locationType ?location .
             OPTIONAL { ?stop dct:description ?stopDescription . }
             OPTIONAL { 
                 ?stop geo:lat ?stopLat . 
                 ?stop geo:long ?stopLong .
             }
             OPTIONAL {?stop gtfs:wheelchairAccessible ?wheelchairAccessible . }
             FILTER (?location=<http://transport.linkeddata.es/resource/LocationType/2>)
         }
      """

# Run the query against the graph built above
q3_res = g.query(q3)

# Print each stop with its latitude and longitude (both bound by an OPTIONAL clause)
for row in q3_res:
    print(row['stop'], row['stopLat'], row['stopLong'])

In [15]:
# SPARQL query: every gtfs:Agency together with its fare URL.
# NOTE: this reuses the names q3 / q3_res from the previous query cell.
q3 = """
         PREFIX gtfs: <http://vocab.gtfs.org/terms#>

         SELECT ?agency ?url WHERE {
            ?agency a gtfs:Agency.
            ?agency gtfs:fareUrl ?url
         }
      """

# Run the query against the graph built above
q3_res = g.query(q3)

# Print each agency with its fare URL
for row in q3_res:
    print(row['agency'], row['url'])

http://transport.linkeddata.es/madrid/agency/00000000000000000001 https://www.crtm.es/billetes-y-tarifas
http://transport.linkeddata.es/madrid/agency/00000000000000000002 https://www.crtm.es/billeaaaaaaaaaaaaaaa


## Snapshot inspection

In [13]:
# Load the pickled snapshot written by diff_dir: a dict that maps each source
# file path to its DataFrame.
# NOTE: pickle.load can run arbitrary code; only open snapshot files you created.
with open('snapshot.pkl', 'rb') as f:
    snapshot = pickle.load(file=f)

In [14]:
# Snapshot entry for the agency file; keys have the form path + '/' + file name
snapshot['./data/AGENCY.csv']

Unnamed: 0,agency_id,agency_name,agency_url,agency_timezone,agency_lang,agency_phone,agency_fare_url
0,1,1,http://www.crtm.es,1,1,1,https://www.crtm.es/billetes-y-tarifas
1,2,3,http://www.crtm.es,1,1,1,https://www.crtm.es/billeaaaaaaaaaaaaaaa


TODO:

- read the mapping

In [None]:
# Toy frames for trying out the diff logic: df1 is df2 plus one extra row
# (9, 10, 11); df3 is an empty frame.
df1 = pd.DataFrame({"a": [1, 2, 3, 9], "b": [4, 5, 6, 10], "c": [6, 7, 8, 11]})
df2 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [6, 7, 8]})
df3 = pd.DataFrame()

In [None]:
# Show the three toy frames
display(df1)
display(df2)
display(df3)

In [None]:
# Concatenate with an empty frame, as diff_dir does when there is no snapshot yet
pd.concat([df1, df3])

In [None]:
# find differences: keep=False drops every row that occurs more than once, so
# only rows present in exactly one of the two frames remain (here the single
# extra row of df1)
len(pd.concat([df1, df2]).drop_duplicates(keep=False))