In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pyspark.sql.functions as F
import setuptools
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from sklearn import linear_model, preprocessing
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import PolynomialFeatures
from pyspark.sql.types import IntegerType

%matplotlib inline

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Connect to the remote Spark endpoint named by SPARK_HOST, using
# SPARK_APP_NAME as the application name.
spark = (
    SparkSession.builder
    .remote(os.getenv('SPARK_HOST'))
    .appName(os.getenv('SPARK_APP_NAME'))
    .getOrCreate()
)

# Every path in this notebook is built by plain concatenation
# (ROOT + 'data/...'), so ROOT must end with a path separator. Without this
# normalisation the '.' fallback yields '.data/...' and a WORKING_DIR such as
# '/srv/project' yields '/srv/projectdata/...'.
ROOT = os.getenv('WORKING_DIR') or '.'
if not ROOT.endswith(('/', os.sep)):
    ROOT += '/'

In [2]:
# Load the three parquet inputs: the merged tax/Trafi vehicle data, the
# per-vehicle tax table, and the KNN-imputed prices.
df, vtax, prices = (
    spark.read.parquet(ROOT + relative_path)
    for relative_path in (
        'data/tax_trafi_merged_data',
        'data/index_vehicle_tax_data.parquet',
        'data/knn_imputed_prices',
    )
)

In [None]:
# Build a municipality -> region lookup from the semicolon-separated
# classification file. Both codes are cut out of longer code strings:
# 3 characters starting at position 2 for the municipality, 2 characters
# starting at position 2 for the region.
regions_reader = spark.read.options(encoding="ISO-8859-1", delimiter=";", header=True)
regions = regions_reader.csv(ROOT + 'data/regions.csv').select(
    F.substring('sourceCode', 2, 3).alias('municipality'),
    F.substring('targetCode', 2, 2).alias('region'),
)
regions.show(5)

#regions.write.parquet(ROOT + 'data/cleaned_regions')


In [None]:
# Attach vehicle tax and imputed price by the shared 'index' key (inner
# joins), then add the region of each vehicle's municipality. The region join
# is a left join so vehicles without a matching municipality are kept.
df = (
    df
    .join(vtax, on='index')
    .join(prices, on='index')
    .join(regions, on='municipality', how='left')
)

df.show(5)

In [None]:
# Keep only rows classified as 'M1', derive the registration year from the
# first-registration date, expose the imputed price as 'price', and narrow the
# frame to the columns used from here on.
kept_columns = [
    'index', 'registration_year', 'n_doors', 'n_seats', 'body_type',
    'drive_power', 'municipality', 'region', 'vtax', 'price',
]
df = (
    df
    .filter(F.col('vehicle_classification') == 'M1')
    .withColumn('registration_year', F.year(F.col('date_of_first_registration')))
    .withColumnRenamed('imputed_price', 'price')
    .select(*kept_columns)
)

df.printSchema()
df.count()

In [None]:
# 01 gasoline, 04 electricity
# 01 gasoline, 04 electricity
# Restrict the data to those two drive-power codes.
df = df.filter(F.col('drive_power').isin('01', '04'))
df.count()

In [None]:
# Replace the drive-power code with a 0/1 indicator: 0 for code '01',
# 1 for everything else.
is_gasoline = F.col('drive_power') == '01'
df = (
    df
    .withColumn('electric', F.when(is_gasoline, 0).otherwise(1))
    .drop('drive_power')
)

df.printSchema()

In [None]:
# Preview the first five rows of the filtered frame with the new 'electric' column.
df.show(5)

In [None]:
# Per-municipality table with one column per year; presumably population
# figures, since a later cell reads the year columns as 'population'
# (TODO confirm against the source file).
dens = (
    spark.read
    .options(encoding="ISO-8859-1", delimiter=";", header=True)
    .csv(ROOT + 'data/001_11ra_2023_20241028-183337.csv')
    .withColumnRenamed('Area', 'name')
    .drop('information')
)

# Municipality classification: name -> code, keeping the 3 characters of the
# code that start at position 2.
kunta = (
    spark.read
    .options(encoding="UTF-8", delimiter=";", header=True)
    .csv(ROOT + 'data/kunta.csv')
    .withColumnRenamed('classificationName', 'name')
    .select('code', 'name')
    .withColumn('code', F.substring('code', 2, 3))
)
kunta.show(5)
dens.show(5)

# Attach the municipality code to the yearly table by name. It is a left join,
# so rows without a matching name are kept with a null code.
d2 = (
    dens
    .join(kunta, on='name', how='left')
    .withColumnRenamed('code', 'municipality')
)
d2.show(5)

In [None]:
# NOTE: DateType/DoubleType are also used by the electricity cell further down.
from pyspark.sql.types import DateType, DoubleType

# The "yyyy'M'MM" pattern below is parsed with the legacy (Spark 2.x) date parser.
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")
fuel = spark.read.options(encoding="UTF-8", delimiter=";", header=True).csv(ROOT + 'data/Fuel_prices.csv')

fuel.show(5)

# Keep the 1st column (period label) and the 9th column (the price series that
# is used), naming them '_1' and '_2'. Renaming positionally with toDF avoids
# depending on the CSV header text, and keeps the work in Spark instead of
# collecting every row to the driver and round-tripping through a NumPy array.
positional_names = [f'_{position}' for position in range(1, len(fuel.columns) + 1)]
fuel = fuel.toDF(*positional_names).select('_1', F.col('_9').alias('_2'))
fuel.show(5)

# Split the period label into year and month, and convert the price to a double.
# F.to_date already returns a date, so no extra cast is needed.
fuel = (
    fuel
    .withColumn('date', F.to_date('_1', "yyyy'M'MM"))
    .withColumn('year', F.year('date'))
    .withColumn('month', F.month('date'))
    .select('year', 'month', '_2')
    .withColumnRenamed('_2', 'fuelCost')
    .withColumn('fuelCost', F.col('fuelCost').cast(DoubleType()))
)

fuel.show(5)
fuel.printSchema()

In [11]:
# Average the monthly fuel prices per calendar year and attach the yearly
# figure to each vehicle by its registration year.
fuelyear = (
    fuel
    .groupBy('year')
    .agg(F.avg('fuelCost').alias('fuelCost'))
    .withColumnRenamed('year', 'registration_year')
)
df = df.join(fuelyear, on='registration_year', how='left')

In [None]:
# Monthly electricity prices, reduced to a yearly average in c/kWh.
electricity = (
    spark.read
    .options(encoding="ISO-8859-1", delimiter=";", header=True)
    .csv(ROOT + 'data/001_13rb_2024m06_20241028-193343.csv')
)
electricity.show(5)

# Take the year from the 'Month' label, convert the price to a double, and
# average per year. The result has exactly two columns: year and cents_kWh.
electricity = (
    electricity
    .withColumn('year', F.year(F.to_date('Month', "yyyy'M'MM")))
    .withColumn('cents_kWh', F.col('Price (c/kWh)').cast(DoubleType()))
    .groupBy('year')
    .agg(F.avg('cents_kWh').alias('cents_kWh'))
)

electricity.show(5)
electricity.printSchema()

In [13]:
# Attach the yearly average electricity price by registration year; the left
# join keeps vehicles from years with no price data.
elecyear = electricity.withColumnRenamed('year', 'registration_year')
df = df.join(
    elecyear,
    on='registration_year',
    how='left',
)

In [14]:
# Reshape the wide income export: after transposing, each original column
# header becomes a row label, which reset_index() moves into an 'index' column.
income = (
    pd.read_csv(
        ROOT + 'data/001_118w_2022_20241028-200329.csv',
        sep=';',
        encoding='latin-1',
    )
    .T
    .reset_index()
)

# Split each header label: the first four characters are the year, and the
# text from the sixth character onward replaces the label itself.
header_labels = income['index']
year_part = header_labels.str.slice(0, 4)
name_part = header_labels.str.slice(5)
income['year'] = year_part
income['index'] = name_part

# Row 0 comes from the first column header of the export, not from a data column.
income = income.drop(index=0)

# Persist as a semicolon-separated file that the next cell reads back with Spark.
income.to_csv(ROOT + 'data/median_income.csv', index=False, sep=';', encoding='latin-1')

# hh_med_income = spark.createDataFrame(income)
# hh_med_income.show(5)

#income['year'] = income['Information'].str.slice(0, 5)
#income.head()


In [None]:
# Read the reshaped median-income file back as a Spark DataFrame.
hh_med_income = (
    spark.read
    .options(encoding="ISO-8859-1", delimiter=";", header=True)
    .csv(ROOT + 'data/median_income.csv')
)
hh_med_income.show(5)


In [None]:
# Give the generic CSV columns meaningful names and make 'year' an integer.
hh_med_income = (
    hh_med_income
    .withColumnRenamed('0', 'median_hh_income')
    .withColumnRenamed('index', 'name')
    .withColumn('year', F.col('year').cast('int'))
)
hh_med_income.show(5)

In [None]:
# Start from the municipality classification and attach income rows by name.
# It is a left join, so every municipality in kunta is kept.
hh_med_income = kunta.join(
    hh_med_income,
    on='name',
    how='left',
)
hh_med_income.show(5)

In [None]:
# Reduce the income table to the join keys (municipality, registration_year)
# plus the integer-typed median household income.
hh_med_income = (
    hh_med_income
    .withColumnRenamed('code', 'municipality')
    .withColumn('hh_med_income', F.col('median_hh_income').cast('int'))
    .withColumnRenamed('year', 'registration_year')
    .select('municipality', 'registration_year', 'hh_med_income')
)
hh_med_income.show(5)
hh_med_income.printSchema()

In [None]:
# Attach median household income by (registration year, municipality); the
# left join keeps vehicles that have no income figure.
income_keys = ['registration_year', 'municipality']
df = df.join(hh_med_income, on=income_keys, how='left')
df.show(5)

In [21]:
# For each registration year 2010-2023, attach that year's column from d2 as
# 'population' (joined by municipality) and write the year's rows to their own
# parquet part. Parts are numbered from 0 in year order.
for part_no, year in enumerate(range(2010, 2024)):
    year_column = str(year)
    ydata = (
        d2
        .select(year_column, 'municipality')
        .withColumnRenamed(year_column, 'population')
    )
    vehicles_of_year = df.filter(F.col('registration_year') == year)
    result = vehicles_of_year.join(ydata, on='municipality', how='left')

    part_path = ROOT + f'data/preprocessed_regression_data_part_{part_no}'
    result.write.mode('overwrite').parquet(part_path)

In [22]:
# Read back the per-year parts written above (one per year in 2010-2023) and
# stack them into a single DataFrame. Building the list first removes the
# `None` sentinel and its `== None` check: singleton comparisons should use
# `is`, and `==` is unsafe on objects that may overload it.
n_parts = len(range(2010, 2024))
part_frames = [
    spark.read.parquet(ROOT + f'data/preprocessed_regression_data_part_{part_no}')
    for part_no in range(n_parts)
]

# union() matches columns by position; all parts come from the same writer
# loop, so their column order is identical.
un = part_frames[0]
for part_frame in part_frames[1:]:
    un = un.union(part_frame)

In [23]:
# Persist the combined per-year data as one parquet dataset, replacing any previous output.
un.write.mode('overwrite').parquet(ROOT + 'data/preprocessed_regression_data')

In [24]:
# Pull the combined dataset to the driver as a pandas DataFrame and export it
# as CSV for the regression step.
pddf = pd.DataFrame(un.collect(), columns=un.columns)

# Use the same ROOT + 'data/...' form as every other path in this notebook.
# The previous './data/...' suffix resolved to '../data/...' when ROOT was '.',
# writing outside the working directory.
pddf.to_csv(ROOT + 'data/regression.csv')

# Last expression of the cell, so the preview is actually displayed.
pddf.head()