In [1]:
import setuptools
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pandas as pd
from datetime import datetime, timedelta
from tqdm.notebook import trange, tqdm

# Pull settings from a .env file into the process environment before reading them.
load_dotenv()

# Remote Spark endpoint and application name; either one is None when its
# environment variable is not defined.
SPARK_HOST = os.environ.get('SPARK_HOST')
SPARK_APP_NAME = os.environ.get('SPARK_APP_NAME')

# Create (or reuse) the remote session that every later cell reads through.
spark = (
    SparkSession.builder
    .remote(SPARK_HOST)
    .appName(SPARK_APP_NAME)
    .getOrCreate()
)

In [None]:
# Base directory (or URI prefix) for every dataset this notebook reads or writes.
# It is concatenated directly with relative paths, so it must already end with
# a path separator.
ROOT = os.getenv('WORKING_DIR')
if ROOT is None:
    # Fail here with an actionable message instead of a TypeError from
    # `None + 'data/...'` on the next statement.
    raise RuntimeError(
        "WORKING_DIR is not set; define it in the environment or in the .env file"
    )

# Merged source table; show a few rows as a quick sanity check.
df = spark.read.parquet(ROOT + 'data/tax_trafi_merged_data')
df.show(5)

In [None]:
# Keep only rows whose vehicle_classification is 'M1', then drop the column,
# which is constant after the filter.
# The guard makes the cell safe to re-run: once the column has been dropped,
# filtering on it a second time would raise.
if 'vehicle_classification' in df.columns:
    df = df.filter(F.col('vehicle_classification') == 'M1').drop('vehicle_classification')
df.show(5)

In [None]:
# Total number of rows in df; shown as the cell output and reused as the
# denominator when computing per-column null fractions.
rowcount = df.count()
rowcount

In [None]:
import numpy as np
# Fraction (0-1) of null values for each column of df, stored in df.columns
# order. NaN marks a column whose fraction could not be computed.
resvec = np.full(len(df.columns), np.nan)

for i, col in enumerate(df.columns):
    try:
        result = df.filter(F.col(col).isNull()).count() / rowcount
    except Exception as exc:
        # Catch Exception rather than everything so interrupting the kernel
        # still stops the loop, and say why the column was skipped.
        print(f'percentage {col} null')
        print(f'  could not compute null fraction for {col}: {exc!r}')
        continue
    resvec[i] = result
    print(f'percentage {col} null {result}')

In [None]:
# Summary table of null fractions: one column per column of df, one row per
# dataset variant. Row 0 holds the fractions currently stored in resvec.
nulldf = pd.DataFrame(columns = df.columns)
nulldf.loc[0] = resvec
nulldf.head()

In [None]:
# Restrict to rows with date_of_first_registration on or after 2011-01-01 and
# recompute the per-column null fractions on that subset.
df2 = df.filter(F.col('date_of_first_registration') >= '2011-01-01')
# NOTE: this rebinds rowcount to the size of df2, so re-running an earlier cell
# that divides by rowcount would use this denominator instead of df's.
rowcount = df2.count()

print(rowcount)

# Fraction (0-1) of nulls per column of df2; NaN marks a column that failed.
resvec = np.full(len(df2.columns), np.nan)

for i, col in enumerate(df2.columns):
    try:
        result = df2.filter(F.col(col).isNull()).count() / rowcount
    except Exception as exc:
        # Catch Exception rather than everything so interrupting the kernel
        # still stops the loop, and say why the column was skipped.
        print(f'percentage {col} null')
        print(f'  could not compute null fraction for {col}: {exc!r}')
        continue
    resvec[i] = result
    print(f'percentage {col} null {result}')


# Row 1 of the summary table: null fractions for the filtered subset.
nulldf.loc[1] = resvec

In [None]:
# Show every column of the wide summary frame. None means "no limit"; 0 asks
# pandas to auto-detect a terminal width, which a notebook does not have.
pd.set_option('display.max_columns', None)

# Columns whose null fraction in row 1 of nulldf is below this threshold are
# listed as candidate features.
MAX_NULL_FRACTION = 0.01

for col, val in nulldf.iloc[1].items():
    if val < MAX_NULL_FRACTION:
        print(col, val)

# Kept as the final expression so the notebook actually renders the table.
nulldf.head()

In [None]:
# Columns kept for price imputation: the row identifier and vehicle attributes,
# plus the pred_price target selected alongside them.
df3_feature_cols = [
    'index', 'n_doors', 'n_seats', 'length_mm', 'width_mm', 'height_mm',
    'drive_power', 'make_plaintext', 'model', 'transmission',
    'manufac_trade_name', 'driving_power_euro_vi',
]
df3 = df2.select(*df3_feature_cols, 'pred_price')

# Drop rows with a null in any feature while keeping rows whose pred_price is
# null (those are the rows to impute). `subset` does this in a single pass,
# without a self-join on 'index' that would shuffle the data and multiply rows
# if an index value were ever duplicated.
df3 = df3.na.drop(how='any', subset=df3_feature_cols)
df3.printSchema()

In [None]:
# Distinct values of make_plaintext in df3, brought to the driver as plain
# Python values; the last line displays the list.
unique_makes = [
    row['make_plaintext']
    for row in df3.select('make_plaintext').distinct().collect()
]
unique_makes

In [None]:
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from tqdm.notebook import trange, tqdm
from sklearn.linear_model import LinearRegression
from pyspark.sql.types import StructType, LongType, StringType, StructField, DoubleType

# Columns pulled to the driver for each make. 'index' identifies the row and is
# deliberately left out of the imputation features (num_col).
cols = ['index', 'n_doors', 'n_seats', 'width_mm', 'height_mm', 'length_mm', 'pred_price']
num_col = ['n_doors', 'n_seats', 'width_mm', 'height_mm', 'length_mm', 'pred_price']

schema = StructType([
    StructField('index', LongType(), True),
    StructField('imputed_price', DoubleType(), True)
])

# (index, imputed_price) pairs gathered across all makes. They are turned into
# one Spark DataFrame after the loop instead of chaining one union per make,
# which keeps the query plan flat.
knn_records = []

for make in tqdm(unique_makes):
    # toPandas() keeps each column's own dtype; np.array(rows) would force a
    # single dtype on the whole table and can turn the integer index into float.
    pddf = df3.filter(F.col('make_plaintext') == make).select(cols).toPandas()

    # Nothing to impute from when no row of this make has a known price.
    if not pddf['pred_price'].notna().any():
        continue

    impute = pddf[num_col].astype(float)  # do not impute based on index

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('imputer', KNNImputer())
    ])

    # Impute in standardised space, then map the values back to original units.
    transformed = pipe.fit_transform(impute)
    transformed = pipe.named_steps['scaler'].inverse_transform(transformed)

    # pred_price is the last column of num_col. Convert to plain int/float so
    # the values match LongType/DoubleType exactly.
    knn_records.extend(
        (int(idx), float(price))
        for idx, price in zip(pddf['index'], transformed[:, -1])
    )

resultsdf = spark.createDataFrame(knn_records, schema=schema)

In [None]:
resultsdf.show(5)
# Overwrite any previous export so the notebook can be re-run end to end; the
# default save mode raises when the target path already exists.
resultsdf.write.parquet(ROOT + 'data/knn_imputed_prices', mode='overwrite')

In [None]:
# simplecols = pddf[num_col]

# pipe2 = Pipeline([
#     ('scaler', StandardScaler()),
#     ('imputer', KNNImputer()),
# ])

# simpleimputeresults = pipe2.fit_transform(simplecols)


In [None]:
# Sanity check: print the schema and the row count of df3.
df3.printSchema()
print(df3.count())

In [None]:
from sklearn.linear_model import LinearRegression
from pyspark.sql.types import StructType, LongType, StringType, StructField, DoubleType


# Candidate regression features. Only those actually present in df3 are used,
# so a feature df3 was not built with (e.g. 'mass') is reported here instead of
# making select() fail inside the loop.
candidate_features = ['n_doors', 'n_seats', 'mass', 'length_mm', 'width_mm', 'height_mm']
xcol = [c for c in candidate_features if c in df3.columns]
skipped_features = [c for c in candidate_features if c not in xcol]
if skipped_features:
    print(f'features not available in df3, skipped: {skipped_features}')
ycol = ['pred_price']
cols = ['index'] + xcol + ycol

resultcols = ['index', 'imputed_price']

schema = StructType([
    StructField('index', LongType(), True),
    StructField('imputed_price', DoubleType(), True)
])

# (index, predicted price) pairs for rows whose pred_price is missing. They are
# collected across makes and converted to one Spark DataFrame after the loop
# rather than chaining one union per make.
lr_records = []

for make in unique_makes:
    # toPandas() keeps each column's own dtype; np.array(rows) would force a
    # single dtype on the whole table and can turn the integer index into float.
    pddf = df3.filter(F.col('make_plaintext') == make).select(cols).toPandas()

    has_price = pddf['pred_price'].notna()
    train = pddf[has_price]
    predict = pddf[~has_price]

    # A make needs at least one priced row to fit on and one unpriced row to
    # predict; otherwise there is nothing to do.
    if train.empty or predict.empty:
        continue

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('pred', LinearRegression())
    ])

    pipe.fit(train[xcol].astype(float), train['pred_price'].astype(float))
    result = pipe.predict(predict[xcol].astype(float))

    # Plain int/float so the values match LongType/DoubleType exactly.
    lr_records.extend(
        (int(idx), float(price))
        for idx, price in zip(predict['index'], result)
    )

# NOTE: this rebinds resultsdf, replacing whatever an earlier cell stored there.
resultsdf = spark.createDataFrame(lr_records, schema=schema)

resultsdf.count()

In [None]:
# Number of rows currently held in resultsdf.
resultsdf.count()

In [None]:
# Preview the first five rows of resultsdf.
resultsdf.show(5)

In [33]:
# Overwrite any previous export so the notebook can be re-run end to end; the
# default save mode raises when the target path already exists.
# NOTE(review): the target name is 'kimputed_prices' — confirm it is the
# intended location for the contents of resultsdf at this point.
resultsdf.write.parquet(ROOT + 'data/kimputed_prices', mode='overwrite')