In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd
from datetime import *
import math

In [2]:
# Load the merged tax/Trafi vehicle data and report its (rows, columns) shape.
merged_data_path = './data/tax_trafi_merged_data'
df = pd.read_parquet(merged_data_path)
print(df.shape)

(5373623, 63)


In [3]:
# Columns that are not needed for the vehicle-tax calculation.
# CO2-based calculations use the NEDC value only (available for ~60% of rows),
# so the NEDC2/WLTP emission columns are dropped as well.
unused_columns = [
    'vehicle_subclass', 'usage', 'variant_uid', 'version_uid', 'date_of_use',
    'color', 'n_doors', 'body_type', 'cab_type',
    'n_seats', 'max_road_perm_laden_mass', 'length_mm', 'width_mm', 'height_mm',
    'engine_capacity', 'max_net_engine_power_kw', 'sylintereidenLkm',
    'supercharged', 'electric_hybrid', 'electric_hybrid_cat',
    'make_plaintext', 'transmission', 'n_forward_gears', 'manufac_trade_name',
    'brake_transmission', 'type_approval_number',
    'driving_power_euro_vi', 'municipality', 'NEDC2_Co2', 'WLTP_Co2',
    'WLTP2_Co2', 'odometer', 'serial_number_10',
    'seq_number', 'condition_h_bad_n_normal_y_good', 'individual_properties',
    'drivetrain', 'model_specifier', 'date_of_decision',
    'body_style', 'tax.n_doors', 'tax.date_of_first_registration',
    'odometer_unit_1000km', 'tax.transmission', 'tax.date_of_use',
    'Cm3',
]
df = df.drop(columns=unused_columns)
print(df.shape)

(5373623, 17)


In [4]:
# Split the first-registration date into day / month / year columns and
# remove registrations before 2011.
#
# The date column is parsed once and reused for all three parts.
reg_dates = pd.to_datetime(df['date_of_first_registration'])
df = df.assign(
    reg_day=reg_dates.dt.day,
    reg_month=reg_dates.dt.month,
    reg_year=reg_dates.dt.year,
)

# Keep registrations from 2011-01-01 onwards (the boundary day is included,
# since it is not "before 2011").
df = df[df['date_of_first_registration'] >= date(2011, 1, 1)]


In [5]:
# Replace missing date parts with 0 and store them as integers.
for date_part in ('reg_day', 'reg_month', 'reg_year'):
    df[date_part] = df[date_part].fillna(0).astype(int)

In [6]:
# Order rows from the newest to the oldest registration and renumber them.
# `drop=True` discards the old row labels directly instead of materialising
# them as a helper column that then has to be dropped by name (the name of
# that helper column depends on whether an 'index' column already exists).
df = (
    df.sort_values(by=['reg_year', 'reg_month', 'reg_day'], ascending=False)
      .reset_index(drop=True)
)

# Select passenger cars (vehicle classification M1) for inspection.
# The row labels are intentionally not renumbered after this filter.
df = df[df['vehicle_classification'] == 'M1']

In [7]:
# Semicolon-separated lookup table with columns 'g/km', 'e/day' and 'e/365day'
# (NEDC CO2 emission -> basic tax per day / per 365 days).
co2_table_path = 'data/NEDC_tax.csv'
co2_table = pd.read_csv(co2_table_path, sep=';')
co2_table

Unnamed: 0,g/km,e/day,e/365day
0,0,0.146,53.29
1,1,0.147,53.65
2,2,0.148,54.02
3,3,0.149,54.38
4,4,0.150,54.75
...,...,...,...
395,395,1.783,650.79
396,396,1.785,651.52
397,397,1.787,652.25
398,398,1.789,652.98


In [8]:
# Boolean row masks by drive power and registration date:
#   mask1 = electric cars ('04') first registered before 1.10.2021
#   mask2 = electric cars ('04') first registered on or after 1.10.2021
#   mask3 = petrol cars ('01')
#   mask4 = diesel cars ('02')
#   mask5 = hybrids ('39', '44')
#   mask6 = everything else
# mask1 and mask2 together cover every electric car with a registration date:
# the cutoff day itself belongs to mask2, and no day falls between the two.
ev_tax_change = date(2021, 10, 1)
registered_on = df['date_of_first_registration']
is_electric = df['drive_power'] == '04'

mask1 = (registered_on < ev_tax_change) & is_electric
mask2 = (registered_on >= ev_tax_change) & is_electric
mask3 = df['drive_power'] == '01'
mask4 = df['drive_power'] == '02'
mask5 = df['drive_power'].isin(['39', '44'])
# "Others" is the complement of the codes handled by mask1-mask5.
mask6 = ~df['drive_power'].isin(['01', '02', '04', '39', '44'])

In [9]:
# Attach the CO2 tax columns to each vehicle by matching its NEDC CO2 value
# against the table's 'g/km' column. The join keeps df's row labels.
co2_lookup = co2_table.set_index('g/km')
df = df.join(co2_lookup, on='NEDC_Co2')

In [10]:
# Vehicles without a matching CO2 table row get an annual CO2-based tax of 0.
annual_tax_col = 'e/365day'
df[annual_tax_col] = df[annual_tax_col].fillna(0)

In [11]:

# Annual vehicle tax = basic tax + drive-power tax.

# Basic tax: a fixed yearly amount for electric cars (mask1 / mask2) and the
# CO2-table amount for petrol, diesel and hybrid cars (mask3-mask5, which are
# mutually exclusive). Vehicles in none of these masks get a basic tax of 0.
df['vtax_basic'] = (
    53.29 * mask1
    + 118.26 * mask2
    + df['e/365day'] * (mask3 | mask4 | mask5)
)

# Drive-power tax: a daily rate per commenced 100 kg of mass, times 365 days.
# The number of commenced 100 kg units is computed once and shared by all
# drive-power codes; a missing mass counts as 0 units (no drive-power tax).
# NOTE(review): the rate is applied to the 'mass' column - confirm that this
# is the intended tax base rather than 'manufac_perm_max_mass'.
mass_units_100kg = np.ceil(df['mass'].fillna(0) / 100)

# Daily rate per 100 kg, keyed by drive-power code. Codes not listed here pay
# no drive-power tax.
daily_rate_per_100kg = {
    '04': 0.015,
    '01': 0.000,
    '02': 0.055,
    '39': 0.005,
    '44': 0.049,
}
df['vtax_usepowertax'] = sum(
    rate * 365 * mass_units_100kg * (df['drive_power'] == code)
    for code, rate in daily_rate_per_100kg.items()
)

df['vtax'] = df['vtax_basic'] + df['vtax_usepowertax']

In [12]:
# Remove the intermediate columns that were only needed to compute 'vtax',
# then display the resulting frame.
helper_columns = [
    'NEDC_Co2',
    'e/day',
    'e/365day',
    'vtax_basic',
    'vtax_usepowertax',
    'reg_year',
    'reg_month',
    'reg_day',
]
df = df.drop(columns=helper_columns)
df

Unnamed: 0,index,vehicle_classification,date_of_first_registration,mass,manufac_perm_max_mass,drive_power,model,tax_index,make,tax.model,taxable_value,car_tax,driving_power,Kw,tax_rate,pred_price,vtax
0,94489657054,M1,2024-06-30,1836.0,2200.0,04,Model 3 Sedan (AA) 4ov,,,,,,,,,,222.285
1,94489657077,M1,2024-06-30,1836.0,2200.0,04,Model 3 Sedan (AA) 4ov,,,,,,,,,,222.285
2,94489652805,M1,2024-06-30,1992.0,2456.0,04,Model Y Monikäyttöajoneuvo (AF) 5ov,,,,,,,,,,227.760
3,94489621644,M1,2024-06-30,2054.0,2518.0,04,Model Y Monikäyttöajoneuvo (AF) 5ov,,,,,,,,,,233.235
4,94489657067,M1,2024-06-30,1836.0,2200.0,04,Model 3 Sedan (AA) 4ov,,,,,,,,,,222.285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2677410,85899450334,M1,2011-01-03,1289.0,1740.0,01,i 30 Monikäyttöajoneuvo (AF) 4ov 1591cm3 A,,,,,,,,,,226.300
2677412,85899456785,M1,2011-01-03,1485.0,1960.0,01,AUDI A4 Sedan (AA) 4ov 1798cm3,,,,,,,,,,230.680
2677413,85899456832,M1,2011-01-03,1478.0,2022.0,01,ALTEA XL Monikäyttöajoneuvo (AF) 4ov 1390cm3,,,,,,,,,,206.220
2677414,85899456846,M1,2011-01-03,1240.0,1597.0,01,DS3 Viistoperä (AB) 2ov 1598cm3,,,,,,,,,,212.060


In [13]:
# Keep only the vehicle identifier and its computed annual vehicle tax.
output_columns = ['index', 'vtax']
df = df.loc[:, output_columns]
df.head()

Unnamed: 0,index,vtax
0,94489657054,222.285
1,94489657077,222.285
2,94489652805,227.76
3,94489621644,233.235
4,94489657067,222.285


In [14]:
# Persist the (index, vtax) table for downstream use.
output_path = './data/index_vehicle_tax_data.parquet'
df.to_parquet(output_path)