In [1]:
import setuptools
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pandas as pd
from datetime import datetime, timedelta
from tqdm.notebook import trange, tqdm

# Connection settings are read from the environment after load_dotenv() has run.
load_dotenv()
SPARK_HOST, SPARK_APP_NAME = (
    os.getenv(var_name) for var_name in ('SPARK_HOST', 'SPARK_APP_NAME')
)

# Configure the builder one step at a time: remote endpoint first, then the
# application name, and finally obtain (or reuse) the session.
session_builder = SparkSession.builder.remote(SPARK_HOST)
session_builder = session_builder.appName(SPARK_APP_NAME)
spark = session_builder.getOrCreate()

In [17]:
# Base location of the project data; every path below is built as ROOT + 'data/...',
# so the value is expected to already end with a path separator.
ROOT = os.getenv('WORKING_DIR')
if ROOT is None:
    # Without this check the concatenations below fail with an opaque
    # "unsupported operand type(s) for +: 'NoneType' and 'str'".
    raise RuntimeError(
        "Environment variable WORKING_DIR is not set; "
        "define it (e.g. in .env) before loading the data."
    )

# The three parquet inputs that the following cells join together.
trafidata = spark.read.parquet(ROOT + 'data/trafidata_with_indices')
taxprice = spark.read.parquet(ROOT + 'data/tax_data_with_price_predictions')
matches = spark.read.parquet(ROOT + 'data/tax_trafi_index_pairs')

In [18]:
# Attach each tax row to its matched Trafi index, then left-join the result
# onto trafidata.
# NOTE: this cell rebinds `matches`, `taxprice` and `trafidata`, so it is not
# idempotent -- re-run the load cell before executing it a second time.

# Align the pair table's tax key with taxprice's key so both join on 'index',
# then restore the descriptive name on the joined frame.
matches = matches.withColumnRenamed('tax_index', 'index')
taxprice = taxprice.join(matches, on='index').withColumnRenamed('index', 'tax_index')

# Prefix every tax column whose name also exists in trafidata with 'tax.' so
# the final join does not produce ambiguous duplicate names. The trafidata
# column list is fetched once (it is loop-invariant) and all renames are
# applied in a single projection instead of one withColumnRenamed per column.
# NOTE(review): the resulting names contain a '.', which presumably has to be
# quoted with backticks when referenced in later Spark expressions -- confirm
# this is intended by downstream consumers.
trafi_columns = set(trafidata.columns)
clashing = {name: f'tax.{name}' for name in taxprice.columns if name in trafi_columns}
if clashing:
    taxprice = taxprice.withColumnsRenamed(clashing)

# The Trafi-side key becomes the join key against trafidata.
taxprice = taxprice.withColumnRenamed('trafi_index', 'index')

# Left join: every trafidata row is kept, with tax columns null when unmatched.
trafidata = trafidata.join(taxprice, on='index', how='left')
trafidata.show(5)


+-----------+----------------------+--------------------------+----------------+-----+-----------+-----------+-----------+-----+-------+---------+--------+-------+----+---------------------+------------------------+---------+--------+---------+-----------+---------------+-----------------------+----------------+------------+---------------+-------------------+--------------+----------------+------------+---------------+------------------+------------------+--------------------+---------------------+------------+--------+---------+--------+---------+--------+----------------+----------+---------+----+---------+---------------+-------------------------------+----------------+---------------+--------------------+-------------+-------+----+-------------+-----------+----------+----------+----------------+----+---------------------+------------------------------+--------+----------+
|      index|vehicle_classification|date_of_first_registration|vehicle_subclass|usage|variant_uid|version_uid|

In [19]:
# Print the column names, types and nullability of the current `trafidata` frame.
trafidata.printSchema()

root
 |-- index: long (nullable = true)
 |-- vehicle_classification: string (nullable = true)
 |-- date_of_first_registration: date (nullable = true)
 |-- vehicle_subclass: integer (nullable = true)
 |-- usage: integer (nullable = true)
 |-- variant_uid: string (nullable = true)
 |-- version_uid: string (nullable = true)
 |-- date_of_use: date (nullable = true)
 |-- color: string (nullable = true)
 |-- n_doors: integer (nullable = true)
 |-- body_type: string (nullable = true)
 |-- cab_type: integer (nullable = true)
 |-- n_seats: integer (nullable = true)
 |-- mass: integer (nullable = true)
 |-- manufac_perm_max_mass: integer (nullable = true)
 |-- max_road_perm_laden_mass: integer (nullable = true)
 |-- length_mm: integer (nullable = true)
 |-- width_mm: integer (nullable = true)
 |-- height_mm: integer (nullable = true)
 |-- drive_power: string (nullable = true)
 |-- engine_capacity: integer (nullable = true)
 |-- max_net_engine_power_kw: double (nullable = true)
 |-- sylintereiden

In [20]:
# Persist the merged frame as Parquet under ROOT.
# The former encoding/header/delimiter options were CSV settings that have no
# meaning for the Parquet writer, so they were removed to avoid implying a
# text format. mode("overwrite") lets the notebook be re-run end to end; the
# default save mode raises when the target path already exists.
trafidata.write.mode("overwrite").parquet(ROOT + 'data/tax_trafi_merged_data')