In [18]:
import polars as pl
import duckdb
from pathlib import Path

# Handle paths

In [19]:
staging_tool_path = Path().absolute().parent.parent / "data" / "datalake" / "staging" / "vr_data"

vr_full_load_jsons = list(staging_tool_path.glob("**/*.json"))
print(len(list(vr_full_load_jsons)))

30


# Visualization on one loaded json file

In [20]:
polars_df = pl.read_json(vr_full_load_jsons[10])
polars_df.head(2)


trainNumber,departureDate,operatorUICCode,operatorShortCode,trainType,trainCategory,commuterLineID,runningCurrently,cancelled,version,timetableType,timetableAcceptanceDate,timeTableRows
i64,str,i64,str,str,str,str,bool,bool,i64,str,str,list[struct[16]]
1,"""2023-11-11""",10,"""vr""","""IC""","""Long-distance""","""""",False,False,286900629277,"""REGULAR""","""2023-07-05T11:…","[{""HKI"",1,""FI"",""DEPARTURE"",true,true,""6"",false,""2023-11-11T04:55:00.000Z"",null,null,""2023-11-11T04:55:46.000Z"",1,[],1,{""KUPLA"",true,""2023-11-11T04:51:42.000Z""}}, {""PSL"",10,""FI"",""ARRIVAL"",true,true,""4"",false,""2023-11-11T05:00:00.000Z"",null,null,""2023-11-11T05:00:11.000Z"",0,[],1,{null,null,null}}, … {""JNS"",460,""FI"",""ARRIVAL"",true,true,""1"",false,""2023-11-11T09:41:00.000Z"",""2023-11-11T09:38:17.000Z"",""LIIKE_AUTOMATIC"",""2023-11-11T09:36:09.000Z"",-5,[],1,{null,null,null}}]"
2,"""2023-11-11""",10,"""vr""","""S""","""Long-distance""","""""",False,False,286899850172,"""REGULAR""","""2023-07-05T11:…","[{""JNS"",460,""FI"",""DEPARTURE"",true,true,""1"",false,""2023-11-11T02:58:00.000Z"",null,null,""2023-11-11T02:58:00.000Z"",0,[],2,{""KUPLA"",true,""2023-11-11T02:44:56.000Z""}}, {""PLT"",1070,""FI"",""ARRIVAL"",false,null,"""",false,""2023-11-11T02:58:42.000Z"",""2023-11-11T02:58:42.000Z"",""LIIKE_AUTOMATIC"",null,0,[],2,{null,null,null}}, … {""HKI"",1,""FI"",""ARRIVAL"",true,true,""10"",false,""2023-11-11T07:30:00.000Z"",null,null,""2023-11-11T07:30:44.000Z"",1,[],2,{null,null,null}}]"


# Creating Schema

In [21]:
SCHEMA = {
    "trainNumber": pl.Int64,
    "departureDate": pl.Utf8,
    "operatorUICCode": pl.Int64,
    "operatorShortCode": pl.Utf8,
    "trainType": pl.Utf8,
	"trainCategory": pl.Utf8,
	"commuterLineID": pl.Utf8,
    "runningCurrently": pl.Boolean,
	"cancelled": pl.Boolean,
    "deleted": pl.Boolean,
    "version": pl.Int64,
    "timetableType": pl.Utf8,
	"timetableAcceptanceDate": pl.Utf8,
    "timeTableRows": pl.List(
        pl.Struct({
            "stationShortCode": pl.Utf8,
            "stationUICCode": pl.Int64,
            "countryCode": pl.Utf8,
            "type": pl.Utf8,
            "trainStopping": pl.Boolean,
            "commercialStop": pl.Boolean,
            "commercialTrack": pl.Utf8,
            "cancelled": pl.Boolean,
            "scheduledTime": pl.Utf8,
            "liveEstimateTime": pl.Utf8,
            "estimateSource": pl.Utf8,
            "unknownDelay": pl.Boolean,
            "actualTime": pl.Utf8,
            "differenceInMinutes": pl.Int64,
            "causes": pl.List(
                pl.Struct(
                    {
                        "passengerTerm": pl.Utf8,
                        "categoryCode": pl.Utf8,
                        "categoryName": pl.Utf8,
                        "validFrom": pl.Utf8,
                        "validTo": pl.Utf8,
                        "id": pl.Int32,
                        "detailedCategoryCode": pl.Utf8,
                        "detailedCategoryName": pl.Utf8,
                        "thirdCategoryCode": pl.Utf8,
                        "thirdCategoryName": pl.Utf8,
                        "description": pl.Utf8,
                        "categoryCodeId": pl.Int32,
                        "detailedCategoryCodeId": pl.Int32,
                        "thirdCategoryCodeId": pl.Int32,
                    }
                )
            ),
            "trainReady": pl.List(
                pl.Struct(
                    {
                        "source": pl.Utf8,
                        "accepted": pl.Boolean,
                        "timestamp": pl.Utf8,   
                    }
                ),
            ),
        }),
    )
}


# Concat list of arrays to one dataframe

In [24]:
# Initialize an empty DataFrame
polars_df = pl.DataFrame()

# Read each JSON file and concatenate it to the existing DataFrame
for json_file in vr_full_load_jsons:
    df = pl.read_json(json_file, schema=SCHEMA)
    polars_df = pl.concat([polars_df, df])

# Now, polars_df contains all the data from the JSON files in a single DataFrame

In [25]:
#polars_df = pl.read_json(vr_full_load_jsons[0], schema=SCHEMA)
polars_df.tail(1)

trainNumber,departureDate,operatorUICCode,operatorShortCode,trainType,trainCategory,commuterLineID,runningCurrently,cancelled,deleted,version,timetableType,timetableAcceptanceDate,timeTableRows
i64,str,i64,str,str,str,str,bool,bool,bool,i64,str,str,list[struct[16]]
76045,"""2023-11-30""",9999,"""winco""","""TYO""","""On-track machi…","""""",False,False,,287050291234,"""ADHOC""","""2023-11-30T06:…","[{""RI"",40,""FI"",""DEPARTURE"",true,true,""1"",false,""2023-11-30T06:36:00.000Z"",null,null,null,""2023-11-30T06:15:16.000Z"",-21,[],null}, {""ARP"",1235,""FI"",""ARRIVAL"",false,null,"""",false,""2023-11-30T06:40:30.000Z"",null,null,null,""2023-11-30T06:24:15.000Z"",-16,[],null}, … {""IKO"",62,""FI"",""ARRIVAL"",true,true,""2"",false,""2023-11-30T08:48:41.000Z"",null,null,null,""2023-11-30T07:58:52.000Z"",-50,[],null}]"


In [27]:
#lazy_df = pl.scan_ndjson(vr_full_load_jsons[0], schema=SCHEMA)
#print(lazy_df.limit(1))
# lazy_df.limit(3).collect()
lazy_df = polars_df
lazy_df.tail(1)

trainNumber,departureDate,operatorUICCode,operatorShortCode,trainType,trainCategory,commuterLineID,runningCurrently,cancelled,deleted,version,timetableType,timetableAcceptanceDate,timeTableRows
i64,str,i64,str,str,str,str,bool,bool,bool,i64,str,str,list[struct[16]]
76045,"""2023-11-30""",9999,"""winco""","""TYO""","""On-track machi…","""""",False,False,,287050291234,"""ADHOC""","""2023-11-30T06:…","[{""RI"",40,""FI"",""DEPARTURE"",true,true,""1"",false,""2023-11-30T06:36:00.000Z"",null,null,null,""2023-11-30T06:15:16.000Z"",-21,[],null}, {""ARP"",1235,""FI"",""ARRIVAL"",false,null,"""",false,""2023-11-30T06:40:30.000Z"",null,null,null,""2023-11-30T06:24:15.000Z"",-16,[],null}, … {""IKO"",62,""FI"",""ARRIVAL"",true,true,""2"",false,""2023-11-30T08:48:41.000Z"",null,null,null,""2023-11-30T07:58:52.000Z"",-50,[],null}]"


# Write to Warehouse (DuckDB)

In [28]:
duckdb_path = Path().absolute().parent.parent / "data" / "warehouse" / "vr.duckdb"
duckdb_path.is_file()
conn = duckdb.connect(database=str(duckdb_path), read_only=False)

In [29]:
#conn.sql("SHOW")
conn.sql("SELECT * FROM lazy_df").pl()

trainNumber,departureDate,operatorUICCode,operatorShortCode,trainType,trainCategory,commuterLineID,runningCurrently,cancelled,deleted,version,timetableType,timetableAcceptanceDate,timeTableRows
i64,str,i64,str,str,str,str,bool,bool,bool,i64,str,str,list[struct[16]]
1,"""2023-11-01""",10,"""vr""","""IC""","""Long-distance""","""""",false,false,,286822491217,"""REGULAR""","""2023-07-05T11:…","[{""HKI"",1,""FI"",""DEPARTURE"",true,true,""10"",false,""2023-11-01T04:55:00.000Z"",null,null,null,""2023-11-01T04:55:21.000Z"",0,[],null}, {""PSL"",10,""FI"",""ARRIVAL"",true,true,""4"",false,""2023-11-01T05:00:00.000Z"",null,null,null,""2023-11-01T05:00:14.000Z"",0,[],null}, … {""JNS"",460,""FI"",""ARRIVAL"",true,true,""1"",false,""2023-11-01T09:41:00.000Z"",""2023-11-01T09:39:41.000Z"",""LIIKE_AUTOMATIC"",null,""2023-11-01T09:39:41.000Z"",-1,[],null}]"
2,"""2023-11-01""",10,"""vr""","""S""","""Long-distance""","""""",false,false,,286821604082,"""REGULAR""","""2023-07-05T11:…","[{""JNS"",460,""FI"",""DEPARTURE"",true,true,""1"",false,""2023-11-01T02:58:00.000Z"",null,null,null,""2023-11-01T02:58:00.000Z"",0,[],null}, {""PLT"",1070,""FI"",""ARRIVAL"",false,null,"""",false,""2023-11-01T02:58:42.000Z"",""2023-11-01T02:58:42.000Z"",""LIIKE_AUTOMATIC"",null,null,0,[],null}, … {""HKI"",1,""FI"",""ARRIVAL"",true,true,""9"",false,""2023-11-01T07:30:00.000Z"",null,null,null,""2023-11-01T07:31:07.000Z"",1,[],null}]"
3,"""2023-11-01""",10,"""vr""","""IC""","""Long-distance""","""""",false,false,,286823761991,"""REGULAR""","""2023-07-05T11:…","[{""HKI"",1,""FI"",""DEPARTURE"",true,true,""6"",false,""2023-11-01T08:15:00.000Z"",null,null,null,""2023-11-01T08:15:49.000Z"",1,[],null}, {""PSL"",10,""FI"",""ARRIVAL"",true,true,""4"",false,""2023-11-01T08:20:00.000Z"",null,null,null,""2023-11-01T08:20:21.000Z"",0,[],null}, … {""JNS"",460,""FI"",""ARRIVAL"",true,true,""1"",false,""2023-11-01T12:51:00.000Z"",""2023-11-01T12:53:08.000Z"",""LIIKE_AUTOMATIC"",null,""2023-11-01T12:53:08.000Z"",2,[],null}]"
4,"""2023-11-01""",10,"""vr""","""IC""","""Long-distance""","""""",false,false,,286822663552,"""REGULAR""","""2023-07-05T11:…","[{""JNS"",460,""FI"",""DEPARTURE"",true,true,""1"",false,""2023-11-01T03:58:00.000Z"",null,null,null,""2023-11-01T03:58:00.000Z"",0,[],null}, {""PLT"",1070,""FI"",""ARRIVAL"",false,null,"""",false,""2023-11-01T03:58:42.000Z"",null,null,null,null,null,[],null}, … {""HKI"",1,""FI"",""ARRIVAL"",true,true,""10"",false,""2023-11-01T08:40:00.000Z"",null,null,null,""2023-11-01T08:43:45.000Z"",4,[{null,""L"",null,null,null,null,""L2"",null,""L204"",null,null,352389194,34930952,67245313}],null}]"
5,"""2023-11-01""",10,"""vr""","""IC""","""Long-distance""","""""",false,false,,286824982323,"""REGULAR""","""2023-07-05T11:…","[{""HKI"",1,""FI"",""DEPARTURE"",true,true,""8"",false,""2023-11-01T11:15:00.000Z"",null,null,null,""2023-11-01T11:15:24.000Z"",0,[],null}, {""PSL"",10,""FI"",""ARRIVAL"",true,true,""4"",false,""2023-11-01T11:20:00.000Z"",null,null,null,""2023-11-01T11:19:54.000Z"",0,[],null}, … {""JNS"",460,""FI"",""ARRIVAL"",true,true,""1"",false,""2023-11-01T15:51:00.000Z"",""2023-11-01T15:51:00.000Z"",""LIIKE_AUTOMATIC"",null,""2023-11-01T15:51:41.000Z"",1,[],null}]"
6,"""2023-11-01""",10,"""vr""","""IC""","""Long-distance""","""""",false,false,,286823288189,"""REGULAR""","""2023-07-05T11:…","[{""JNS"",460,""FI"",""DEPARTURE"",true,true,""1"",false,""2023-11-01T07:00:00.000Z"",""2023-11-01T07:00:00.000Z"",""LIIKE_AUTOMATIC"",null,""2023-11-01T07:00:00.000Z"",0,[],null}, {""PLT"",1070,""FI"",""ARRIVAL"",false,null,"""",false,""2023-11-01T07:00:42.000Z"",""2023-11-01T07:00:42.000Z"",""LIIKE_AUTOMATIC"",null,null,0,[],null}, … {""HKI"",1,""FI"",""ARRIVAL"",true,true,""10"",false,""2023-11-01T11:40:00.000Z"",null,null,null,""2023-11-01T11:42:54.000Z"",3,[],null}]"
7,"""2023-11-01""",10,"""vr""","""S""","""Long-distance""","""""",false,false,,286825745031,"""REGULAR""","""2023-07-05T11:…","[{""HKI"",1,""FI"",""DEPARTURE"",true,true,""8"",false,""2023-11-01T13:15:00.000Z"",null,null,null,""2023-11-01T13:15:53.000Z"",1,[],null}, {""PSL"",10,""FI"",""ARRIVAL"",true,true,""4"",false,""2023-11-01T13:20:00.000Z"",null,null,null,""2023-11-01T13:20:58.000Z"",1,[],null}, … {""JNS"",460,""FI"",""ARRIVAL"",true,true,""1"",false,""2023-11-01T17:38:00.000Z"",""2023-11-01T17:38:00.000Z"",""LIIKE_AUTOMATIC"",null,""2023-11-01T17:38:00.000Z"",0,[],null}]"
8,"""2023-11-01""",10,"""vr""","""IC""","""Long-distance""","""""",false,false,,286824613102,"""REGULAR""","""2023-07-05T11:…","[{""JNS"",460,""FI"",""DEPARTURE"",true,true,""1"",false,""2023-11-01T10:11:00.000Z"",""2023-11-01T10:11:00.000Z"",""LIIKE_AUTOMATIC"",null,""2023-11-01T10:11:00.000Z"",0,[],null}, {""PLT"",1070,""FI"",""ARRIVAL"",false,null,"""",false,""2023-11-01T10:11:42.000Z"",""2023-11-01T10:11:42.000Z"",""LIIKE_AUTOMATIC"",null,null,0,[],null}, … {""HKI"",1,""FI"",""ARRIVAL"",true,true,""7"",false,""2023-11-01T14:40:00.000Z"",null,null,null,""2023-11-01T14:43:45.000Z"",4,[{null,""L"",null,null,null,null,""L3"",null,""L301"",null,null,352389194,51708168,16979201}],null}]"
9,"""2023-11-01""",10,"""vr""","""IC""","""Long-distance""","""""",false,false,,286826214328,"""REGULAR""","""2023-07-05T11:…","[{""HKI"",1,""FI"",""DEPARTURE"",true,true,""7"",false,""2023-11-01T14:15:00.000Z"",null,null,null,""2023-11-01T14:18:15.000Z"",3,[],null}, {""PSL"",10,""FI"",""ARRIVAL"",true,true,""4"",false,""2023-11-01T14:20:00.000Z"",null,null,null,""2023-11-01T14:22:31.000Z"",3,[],null}, … {""JNS"",460,""FI"",""ARRIVAL"",true,true,""1"",false,""2023-11-01T18:58:00.000Z"",""2023-11-01T18:59:56.000Z"",""LIIKE_AUTOMATIC"",null,""2023-11-01T18:59:56.000Z"",2,[],null}]"
10,"""2023-11-01""",10,"""vr""","""IC""","""Long-distance""","""""",false,false,,286825742543,"""REGULAR""","""2023-07-05T11:…","[{""JNS"",460,""FI"",""DEPARTURE"",true,true,""1"",false,""2023-11-01T13:11:00.000Z"",""2023-11-01T13:11:00.000Z"",""LIIKE_AUTOMATIC"",null,""2023-11-01T13:11:00.000Z"",0,[],null}, {""PLT"",1070,""FI"",""ARRIVAL"",false,null,"""",false,""2023-11-01T13:11:42.000Z"",""2023-11-01T13:11:42.000Z"",""LIIKE_AUTOMATIC"",null,null,0,[],null}, … {""HKI"",1,""FI"",""ARRIVAL"",true,true,""10"",false,""2023-11-01T17:45:00.000Z"",null,null,null,""2023-11-01T17:46:14.000Z"",1,[],null}]"


In [30]:
conn.sql("""
CREATE SCHEMA IF NOT EXISTS medallion_bronze;     
CREATE OR REPLACE TABLE vr.medallion_bronze.method_b_traintest AS (  
    WITH traintest_data AS (
        SELECT 
            md5(
                trainNumber || departureDate || operatorUICCode 
            ) as route_sk, --Surrogate Key
            * 
        FROM lazy_df
    )
            
    SELECT * FROM traintest_data
);
""")

conn.sql("""SELECT * FROM medallion_bronze.method_b_traintest LIMIT 1""")

┌──────────────────────┬─────────────┬───────────────┬───┬───────────────┬──────────────────────┬──────────────────────┐
│       route_sk       │ trainNumber │ departureDate │ … │ timetableType │ timetableAcceptanc…  │    timeTableRows     │
│       varchar        │    int64    │    varchar    │   │    varchar    │       varchar        │ struct(stationshor…  │
├──────────────────────┼─────────────┼───────────────┼───┼───────────────┼──────────────────────┼──────────────────────┤
│ 01ad4f7ec5ee7711ab…  │           1 │ 2023-11-01    │ … │ REGULAR       │ 2023-07-05T11:22:0…  │ [{'stationShortCod…  │
├──────────────────────┴─────────────┴───────────────┴───┴───────────────┴──────────────────────┴──────────────────────┤
│ 1 rows                                                                                          15 columns (6 shown) │
└──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘

In [31]:
conn.close()