In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pyspark.sql.functions as F
import setuptools
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from sklearn import linear_model, preprocessing
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline

# Load settings from a local .env file (if present) so the os.getenv() calls
# below can see SPARK_HOST, SPARK_APP_NAME and WORKING_DIR.
load_dotenv()

# Open (or reuse) a Spark session against the remote endpoint in SPARK_HOST,
# labelled with SPARK_APP_NAME.
spark = SparkSession.builder.remote(os.getenv('SPARK_HOST')).appName(os.getenv('SPARK_APP_NAME')).getOrCreate()

# ROOT is used as a plain string prefix (ROOT + 'data/...'), so it has to end
# with a separator. Without this, the '.' fallback produced '.data/...' and a
# WORKING_DIR given without a trailing slash produced '<dir>data/...'.
ROOT = os.getenv('WORKING_DIR') or '.'
if not ROOT.endswith('/'):
    ROOT += '/'

In [2]:
# Read the three parquet inputs, in order: the merged tax/Trafi table, the
# per-index vehicle tax table and the KNN-imputed prices. Every path is ROOT
# used as a string prefix in front of a 'data/...' location.
df, vtax, prices = (
    spark.read.parquet(ROOT + relative_path)
    for relative_path in (
        'data/tax_trafi_merged_data',
        'data/index_vehicle_tax_data.parquet',
        'data/knn_imputed_prices',
    )
)

In [3]:
# Attach the vehicle-tax columns and then the imputed-price columns to the
# merged table, matching rows on the shared 'index' key (default join type).
# The joined frame is persisted because the following cells reuse it.
df = (
    df
    .join(vtax, on='index')
    .join(prices, on='index')
    .persist()
)

# Preview the first five joined rows.
df.show(5)

+-----------+----------------------+--------------------------+----------------+-----+-----------+------------------+-----------+-----+-------+---------+--------+-------+----+---------------------+------------------------+---------+--------+---------+-----------+---------------+-----------------------+----------------+------------+---------------+-------------------+--------------+--------------------+------------+---------------+------------------+------------------+--------------------+---------------------+------------+--------+---------+--------+---------+--------+----------------+----------+---------+----+---------+---------------+-------------------------------+----------------+---------------+--------------------+-------------+-------+----+-------------+-----------+----------+----------+----------------+----+---------------------+------------------------------+--------+----------+------+-----------------+------------------+
|      index|vehicle_classification|date_of_first_regis

In [4]:
# Keep only vehicles classified 'M1' (presumably the passenger-car class,
# confirm against the data dictionary), derive the year of first
# registration, expose the imputed price as 'price', and keep only the
# columns listed below.
is_m1 = F.col('vehicle_classification') == 'M1'
first_registration_year = F.year(F.col('date_of_first_registration'))
kept_columns = [
    'index', 'registration_year', 'n_doors', 'n_seats', 'body_type',
    'drive_power', 'municipality', 'vtax', 'price',
]

df = (
    df
    .filter(is_m1)
    .withColumn('registration_year', first_registration_year)
    .withColumnRenamed('imputed_price', 'price')
    .select(*kept_columns)
    .persist()
)

# Show the resulting schema, then the row count as the cell's output.
df.printSchema()
df.count()

root
 |-- index: long (nullable = true)
 |-- registration_year: integer (nullable = true)
 |-- n_doors: integer (nullable = true)
 |-- n_seats: integer (nullable = true)
 |-- body_type: string (nullable = true)
 |-- drive_power: string (nullable = true)
 |-- municipality: string (nullable = true)
 |-- vtax: double (nullable = true)
 |-- price: double (nullable = true)



np.int64(1447903)

In [5]:
# Keep only two drive-power codes: '01' = gasoline, '04' = electricity.
# Rows with any other code (or a null code) are dropped.
df = df.filter(F.col('drive_power').isin('01', '04'))

# Row count after the filter (displayed as the cell's output).
df.count()

np.int64(1039372)

In [6]:
# Replace the drive-power code with a 0/1 indicator column named 'electric':
# 0 when the code is '01' (gasoline), 1 for every other row.
is_gasoline = F.col('drive_power') == '01'
df = (
    df
    .withColumn('electric', F.when(is_gasoline, 0).otherwise(1))
    .drop('drive_power')
)

# Confirm that 'drive_power' is gone and 'electric' was added.
df.printSchema()

root
 |-- index: long (nullable = true)
 |-- registration_year: integer (nullable = true)
 |-- n_doors: integer (nullable = true)
 |-- n_seats: integer (nullable = true)
 |-- body_type: string (nullable = true)
 |-- municipality: string (nullable = true)
 |-- vtax: double (nullable = true)
 |-- price: double (nullable = true)
 |-- electric: integer (nullable = false)



In [8]:
# Export the prepared Spark frame to a local CSV for the regression step.
#
# toPandas() converts the result straight to a pandas DataFrame with the
# same columns in the same order. The earlier form first built a Python list
# of Row objects via collect() and then re-parsed it with pd.DataFrame().
pddf = df.toPandas()

# Make sure the target directory exists so the write does not fail on a
# fresh checkout.
os.makedirs('./data', exist_ok=True)

# The pandas row index is still written as the first (unnamed) column, so
# the file layout is unchanged for existing readers.
pddf.to_csv('./data/regression.csv')