In [None]:
from vehicles import *

In [None]:
# Load the master file with all vehicles through the project's `read_file`
# helper. `df` is the base table that the mileage data is merged onto below.
df = read_file('rechev.csv')

In [None]:
# Keep an independent snapshot of the loaded table so `df` can be restored
# later without reading the CSV again.
df_bak = df.copy()

In [None]:
# Restore `df` from the snapshot. Useful when re-running the notebook from here
# after `df` was rebound or modified (a later plotting cell reassigns `df`).
df = df_bak.copy()

In [None]:
def stringify_cols_delme(df, cols=None):
    """Cast selected columns of ``df`` to ``str``, modifying ``df`` in place.

    Args:
        df: DataFrame whose columns are converted.
        cols: Column names to convert. When ``None`` the identifier columns
            ``mispar_rechev``, ``degem_cd``, ``tozeret_cd`` and
            ``shnat_yitzur`` are used. Names that are not present in ``df``
            are silently skipped.

    Returns:
        None. The conversion is applied directly to ``df``.
    """
    default_cols = ['mispar_rechev', 'degem_cd', 'tozeret_cd', 'shnat_yitzur']
    requested = default_cols if cols is None else cols
    # Only touch the requested columns that actually exist in the frame.
    present = df.columns.intersection(requested)
    for name in present:
        df[name] = df[name].astype(str)


In [None]:
# Mileage data: the first file of https://data.gov.il/dataset/shinui_mivne
mivne_csv = datadir + 'shinui_mivne.csv'
mivne = read_file_helper(mivne_csv, enc='UTF-8')

# `mispar_rechev` is the key used to merge this table onto the master table;
# presumably `stringify_cols` casts it to str in place — verify in `vehicles`.
stringify_cols(mivne, ['mispar_rechev'])

In [None]:
# Ownership records: the second file of https://data.gov.il/dataset/shinui_mivne
vo = pd.read_csv(datadir + 'vehicle_ownership.csv', sep='|')
stringify_cols(vo, ['mispar_rechev'])

# `baalut_dt` is parsed with a year+month pattern into proper timestamps.
vo['baalut_dt'] = pd.to_datetime(vo['baalut_dt'], format="%Y%m")

In [None]:
# Frequency of each `baalut` value in the ownership table.
vo['baalut'].value_counts()

In [None]:
# Attach the `mivne` columns to every vehicle; vehicles without a match keep NaN.
dfm = df.merge(mivne, on='mispar_rechev', how='left')

# Popular models: make + commercial name combinations with more than 5000 rows.
dfm['makemodel'] = dfm['make'] + ' ' + dfm['kinuy_mishari']
degem_c = dfm['makemodel'].value_counts()
degem_big = degem_c.index[degem_c > 5000].to_list()

In [None]:
import csv

# source : https://data.gov.il/dataset/mehir_yevuan
# Load the model price list and reduce it to (model, mehir) pairs.

fname = datadir + 'vehicle_cost.csv'

# The file contains unquoted quote characters, so quote handling is switched
# off while parsing (csv.QUOTE_NONE). Undecodable bytes are replaced instead of
# aborting the load.
with open(fname, encoding='utf-8', errors='replace') as fd:
    prices = pd.read_csv(fd, sep='|', low_memory=False, quoting=csv.QUOTE_NONE)


def _strip_quotes(value):
    """Strip leading/trailing double quotes from text; return anything else unchanged.

    Non-string cells (NaN for empty fields, or numbers when pandas parsed a
    column as numeric) have no ``.strip`` and previously raised AttributeError.
    """
    return value.strip('"') if isinstance(value, str) else value


# With quoting disabled the quote signs stay attached to the data, so remove
# them from the header (a no-op when it is unquoted) and from every text cell.
prices.columns = [_strip_quotes(name) for name in prices.columns]
for col in prices.columns:
    prices[col] = prices[col].map(_strip_quotes)

stringify_cols(prices, ['degem_cd', 'tozeret_cd', 'shnat_yitzur'])
# Prices arrive as text; values that cannot be parsed become NaN.
prices['mehir'] = pd.to_numeric(prices['mehir'], errors='coerce')

# NOTE(review): `add_model` is a project helper; it presumably adds the `model`
# column in place, since `model` is selected right after — confirm in `vehicles`.
add_model(prices)
prices_ = prices.copy()  # full table kept for inspection
prices = prices[['model', 'mehir']]

In [None]:
# Look up each vehicle's price by its `model` key. The lookup goes through a
# plain dict, so when a model appears more than once in `prices` the last
# entry is the one used.
model_to_price = prices.set_index('model')['mehir'].to_dict()
dfm['mehir'] = dfm['model'].map(model_to_price)

In [None]:
from datetime import datetime

# Derived usage metrics, written onto `dfm`:
#   days_on_road    - days from `moed_aliya_lakvish` to `test`, the last date on
#                     which we know the vehicle was functional
#   ownership_days  - days from `moed_aliya_lakvish` until now
#   km_test         - raw copy of the last odometer reading
#   avg_km_day      - capped odometer / days_on_road, itself capped
#   avg_km_test_day - raw odometer / days_on_road (uncapped)
SECONDS_PER_DAY = 24 * 60 * 60
MAX_TOTAL_KM = 500000   # readings above 0.5M km look like bad data
MAX_KM_PER_DAY = 1000   # above this the test date was probably keyed in wrong

# NOTE: `now` makes `ownership_days` depend on when the notebook is run.
now = datetime.now()
first_on_road = dfm['moed_aliya_lakvish']

# Both durations are floored at 1 day so they are safe to divide by.
dfm['days_on_road'] = (
    (dfm['test'] - first_on_road).dt.total_seconds() / SECONDS_PER_DAY
).clip(lower=1)
dfm['ownership_days'] = (
    (now - first_on_road).dt.total_seconds() / SECONDS_PER_DAY
).clip(lower=1)

km_col = 'kilometer_test_aharon'
# Snapshot the raw readings only once. `km_col` is overwritten with the capped
# values below, so taking the snapshot again on a re-run of this cell would
# silently replace the raw data with capped data.
if 'km_test' not in dfm.columns:
    dfm['km_test'] = dfm[km_col].copy()
# Cap (not drop) implausible odometer readings, always starting from the raw values.
dfm[km_col] = dfm['km_test'].clip(upper=MAX_TOTAL_KM)

dfm['avg_km_day'] = (dfm[km_col] / dfm['days_on_road']).clip(upper=MAX_KM_PER_DAY)
dfm['avg_km_test_day'] = dfm['km_test'] / dfm['days_on_road']

In [None]:
# Frequency of each `baalut` value in the master table.
df['baalut'].value_counts()

In [None]:
# Show the first merged row as a {column: value} dict, which is a quick way to
# see every available column together with a sample value.
dfm.iloc[0].to_dict()

In [None]:
# Mean price per make, restricted to rows whose `year` is 2022.
mean_price_2022 = (
    dfm.loc[dfm['year'].eq(2022)]
    .groupby('make')['mehir']
    .mean()
    .rename('price')
)

# Sum of `avg_km_day` per make, restricted to rows whose `year` is 2019.
total_avg_km_2019 = (
    dfm.loc[dfm['year'].eq(2019)]
    .groupby('make')['avg_km_day']
    .sum()
    .rename('km')
)

In [None]:
# Number of rows per make in the merged table.
counts = dfm['make'].value_counts()

# Makes that are excluded from the filtered fines table further down
# (פורד and פיאט were considered for this list as well).
small_makes = ['וולבו', 'הונדה']

# Print how many vehicles each of them has.
for make_name in small_makes:
    print((make_name, counts[make_name]))

In [None]:
# 15 most expensive makes by mean 2022 price. Makes without any known price
# have a NaN mean, and sort_values() puts NaN last, so without dropna() they
# would take the "top" slots and show up as empty bars.
mean_price_2022.dropna().sort_values().tail(15).plot(kind='barh')

In [None]:
# Horizontal bars for the 15 makes with the largest summed `avg_km_day` (2019 rows).
total_avg_km_2019.sort_values().tail(15).plot.barh()

In [None]:
# Load the fines table and keep only the make and its fines figure.
fines = pd.read_csv('../data/fines_and_mileage.csv').loc[:, ['make', 'fines']]

In [None]:
# Add the per-make mean 2022 price (`price`) and summed 2019 mileage (`km`).
# Columns left over from a previous run of this cell are dropped first;
# otherwise a re-run would produce price_x/price_y and km_x/km_y and break the
# cells below that read `price` and `km`.
fines = (
    fines
    .drop(columns=['price', 'km'], errors='ignore')
    .merge(mean_price_2022, how='left', on='make')
    .merge(total_avg_km_2019, how='left', on='make')
)

In [None]:
# Makes ordered from the highest to the lowest `km`.
fines.sort_values('km', ascending=False)

In [None]:
# Select the big makes (more than 10000 rows in the merged table) and rank
# them by mean price, most expensive first.
makes = dfm.groupby('make').size()
big_makes = makes[makes > 10000].index.tolist()
by_price = (
    fines.query("make in @big_makes")
    .sort_values(by='price', ascending=False)[['make', 'price']]
    # Price in whole thousands. A make can have no price after the left merge,
    # and a plain astype(int) raises on NaN, so floor-divide and use the
    # nullable integer dtype (missing prices show as <NA>). Using assign()
    # also avoids writing into a slice of `fines`.
    .assign(price_k=lambda t: (t['price'] // 1000).astype('Int64'))
)
by_price

In [None]:
# Fines relative to distance driven, per make. `km` is a per-make sum, which
# is 0 when a make has no usable mileage rows; dividing by it would give inf
# and distort the plots and the rank correlation below. Non-positive totals
# are therefore treated as missing.
km_driven = fines['km'].where(fines['km'] > 0)
fines['fine_per_km'] = fines['fines'] / km_driven
fines

In [None]:
# Snapshot of the full table, then a view without the makes in `small_makes`.
fines_bak = fines.copy()
is_small_make = fines['make'].isin(small_makes)
fines_filtered = fines.loc[~is_small_make]

In [None]:
import pandas as pd
import plotly.express as px

# Interactive scatter of fines per km against price, one point per make;
# hovering over a point shows the make.
fig = px.scatter(
    fines_filtered,
    x='price',
    y='fine_per_km',
    hover_data=['make'],
    title='Scatter Plot of fines_per_km vs. price',
)
fig.show()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

def rev(s):
    """Return ``s`` with its characters in reverse order.

    Used to pre-reverse Hebrew text before passing it to matplotlib
    (presumably because it is otherwise drawn in the wrong direction).
    """
    return s[::-1]

def _contains_hebrew(s):
    """Return True when ``s`` has at least one character from the Unicode Hebrew block."""
    return any('\u0590' <= ch <= '\u05ff' for ch in s)

def _plot_label(s):
    """Return ``s`` ready for display: reversed if it contains Hebrew, unchanged otherwise.

    This replaces a special case that only recognised 'BMW', which left any
    other Latin-script make drawn backwards.
    """
    return rev(s) if _contains_hebrew(s) else s

# NOTE(review): this rebinds the notebook-wide name `df` (the master vehicle
# table) to the filtered fines table. Kept as-is because later cells may rely
# on it; re-run the restore cell before using `df` as the master table again.
df = fines_filtered

# Static scatter of fines per km against price, one marker style per make.
plt.figure(figsize=(8, 6))
sns.scatterplot(x='price', y='fine_per_km', data=df, style='make', s=100, legend=False)
plt.xlabel(rev('מחיר'), fontsize=18)
plt.ylabel(rev('קנסות לק״מ'), fontsize=18)

# Label each point with its make, slightly to the right of the marker.
for x, y, make in zip(df['price'], df['fine_per_km'], df['make']):
    plt.text(x + 1e4, y, _plot_label(make), ha='left', va='center', fontweight='bold')

plt.show()


In [None]:
# Makes ordered from the highest to the lowest fines per km.
fines.sort_values('fine_per_km', ascending=False)

In [None]:
from scipy.stats import spearmanr

# Rank correlation between fines per km and price over all makes in `fines`
# (not the filtered table); pairs containing NaN are left out.
fine_rate = fines['fine_per_km']
avg_price = fines['price']
correlation, p_value = spearmanr(fine_rate, avg_price, nan_policy='omit')
print(correlation, p_value, sep='\n')