In [None]:
from vehicles import *

In [None]:
# the master file with all vehicles
# `read_file` is not defined in this notebook; presumably it comes from the
# star import of `vehicles` — TODO confirm.
df = read_file('rechev.csv')

In [None]:
# select the big makes: makes with more than 10000 rows in df
makes = df.groupby('make').size()
big_makes = makes.loc[makes.gt(10000)].index.tolist()

In [None]:
# Keep an independent copy of df so it can be restored without re-reading the file.
df_bak = df.copy()

In [None]:
# Reset df to a fresh copy of the saved frame df_bak.
df = df_bak.copy()

In [None]:
# add mileage data
# source: https://data.gov.il/dataset/shinui_mivne

mivne = read_file_helper(datadir + 'shinui_mivne.csv', enc='UTF-8')
# Convert the vehicle-number column to strings; stringify_cols is called for
# its effect on `mivne` (its return value is not used).
# NOTE(review): presumably this makes the key dtype match df['mispar_rechev']
# for the later merge — confirm.
stringify_cols(mivne, ['mispar_rechev'])

In [None]:
# Load the pipe-separated vehicle_ownership.csv file into `vo`.
vo = pd.read_csv(datadir + 'vehicle_ownership.csv', sep='|')

# Convert the vehicle-number column to strings (applied to `vo` itself; the
# return value is not used).
stringify_cols(vo, ['mispar_rechev'])

# Parse baalut_dt, given as year+month ("%Y%m"), into datetimes.
# NOTE: `c` is a notebook-level name and is reassigned by later cells.
c='baalut_dt'
vo[c] = pd.to_datetime(vo[c], format="%Y%m")

In [None]:
# Number of rows in vo for each distinct baalut value.
vo['baalut'].value_counts()

In [None]:
# Left-join the shinui_mivne table onto every vehicle row, keyed by vehicle number.
dfm = df.merge(mivne, how='left', on='mispar_rechev')

# find the popular models: "make kinuy_mishari" labels with more than 5000 rows
dfm['makemodel'] = dfm['make'] + ' ' + dfm['kinuy_mishari']
degem_c = dfm['makemodel'].value_counts()
degem_big = degem_c.loc[degem_c.gt(5000)].index.to_list()

In [None]:
# source : https://data.gov.il/dataset/mehir_yevuan
# add model prices
fname = datadir + 'vehicle_cost.csv'
# Read through a context manager so the file handle is closed as soon as
# pandas has consumed it (it was previously left open). Bytes that are not
# valid UTF-8 are replaced instead of raising.
with open(fname, encoding='utf-8', errors='replace') as fd:
    prices = pd.read_csv(fd, sep='|', low_memory=False)

# Convert the identifier columns to strings (applied to `prices` itself).
stringify_cols(prices, ['degem_cd', 'tozeret_cd', 'shnat_yitzur'])
# Price entries that cannot be parsed as numbers become NaN.
for c in ['mehir']:
    prices[c] = pd.to_numeric(prices[c], errors='coerce')

# NOTE(review): add_model is expected to add the 'model' column to `prices`
# in place, since 'model' is selected right below — confirm in `vehicles`.
add_model(prices)
# prices_ keeps the full table; prices is narrowed to the two columns used later.
prices_ = prices.copy()
prices = prices[['model', 'mehir']]

In [None]:
# Attach the price column (mehir) to each vehicle row via the 'model' key.
# NOTE(review): DataFrame.merge defaults to an inner join, so rows of dfm whose
# model has no match in `prices` are dropped, and a model that appears several
# times in `prices` multiplies the matching vehicle rows — confirm this is intended.
# NOTE(review): this cell is not idempotent; running it twice merges `prices`
# in again and produces suffixed mehir_x / mehir_y columns.
dfm = dfm.merge(right=prices, on='model')

In [None]:
# Number of rows in df for each distinct baalut value.
df['baalut'].value_counts()

In [None]:
# Derive calendar parts of the test_expiry and test dates as new columns.
dfm = dfm.assign(
    test_expiry_day=lambda frame: frame['test_expiry'].dt.day,
    test_day=lambda frame: frame['test'].dt.day,
    test_weekday=lambda frame: frame['test'].dt.weekday,
    test_month=lambda frame: frame['test'].dt.month,
)

In [None]:
# Histogram of the day of month of test_expiry, drawn with 31 bins.
sns.histplot(x='test_expiry_day', data=dfm, bins=31, fill=True)
# Axis labels are passed through the project helper rev() before display.
plt.gca().set_xlabel(rev('יום תפוגת הרשיון'))
plt.gca().set_ylabel(rev('ספירה'))

In [None]:
# Histogram of the day of month of the test date, drawn with 31 bins.
sns.histplot(x='test_day', data=dfm, bins=31, fill=True)
# Axis labels are passed through the project helper rev() before display.
plt.gca().set_xlabel(rev('יום הטסט בחודש'))
plt.gca().set_ylabel(rev('ספירה'))

In [None]:
# Histogram of the weekday of the test date. The column is produced with pandas
# `.dt.weekday`, which numbers Monday as 0 through Sunday as 6.
sns.histplot(data=dfm, x='test_weekday', fill=True, bins=7)

In [None]:
# Histogram of the month of the test date, drawn with 12 bins.
sns.histplot(x='test_month', data=dfm, bins=12, fill=True)
plt.gca().set_xlabel(rev('חודש'))

In [None]:
import plotly.express as px

def plot_heatmap(df, c1, c2):
    """Display a plotly heatmap of how often each (c1, c2) value pair occurs in df.

    The distinct values of `c1` form the heatmap rows (y axis), the distinct
    values of `c2` form the columns (x axis), and the colour encodes the number
    of rows of `df` having that pair.

    Args:
        df: DataFrame containing the columns named by `c1` and `c2`.
        c1: name of the column laid out along the y axis.
        c2: name of the column laid out along the x axis.

    Returns:
        None; the figure is shown with `fig.show()`.
    """
    # Count pairs in the frame that was passed in. The earlier body read the
    # notebook-global `dfm` here, so a filtered frame supplied by the caller
    # was silently ignored and every call drew the same plot.
    pair_counts = df.groupby([c1, c2]).size()

    fig = px.imshow(pair_counts.unstack(), aspect='auto')
    fig.update_layout(
        # The axis titles are fixed strings and do not follow c1 / c2.
        xaxis_title='יום הטסט',
        yaxis_title='יום תפוגת הרשיון',
        coloraxis_colorbar=dict(
            title='Frequency',
            len=0.6,
            yanchor='middle',
            y=0.5
        ),
        # Colour scale runs from zero to the largest pair count.
        coloraxis_cmin=0,
        coloraxis_cmax=pair_counts.max(),
        coloraxis_colorscale='thermal',
        yaxis=dict(autorange=True)
    )
    fig.show()


In [None]:
# Heatmap of test_expiry_day against test_day, called with the full dfm frame.
plot_heatmap(dfm, 'test_expiry_day', 'test_day')

In [None]:
# Same heatmap call, passing only the rows whose baalut equals the value below.
baalut = 'חברה'
plot_heatmap(dfm[dfm['baalut'] == baalut], 'test_expiry_day', 'test_day')

In [None]:
from scipy.stats import chisquare

def all_chi_squared(df, val_col, cond_col):
    """Rank the values of `cond_col` by how closely their day-of-month profile
    follows the profile of the whole frame.

    For every group of `cond_col`, the normalised day-of-month histogram of
    `val_col` is compared with the histogram over all rows using the
    chi-squared statistic returned by `scipy.stats.chisquare`.

    Args:
        df: DataFrame holding both columns.
        val_col: name of a datetime column; only its day of month is used.
        cond_col: name of the column to group by.

    Returns:
        List of the distinct `cond_col` values, ordered from the smallest
        statistic (closest to the overall profile) to the largest.
    """
    # 32 edges give 31 unit-wide bins, one per day 1..31. With range(1, 32)
    # there were only 30 bins and, because numpy closes the last bin on the
    # right, days 30 and 31 were counted together in the final bin.
    day_bins = range(1, 33)

    # Day-of-month profile over all rows (density=True makes it sum to 1).
    overall_hist, _ = np.histogram(df[val_col].dt.day, bins=day_bins, density=True)

    # Statistic of each group against the overall profile.
    make_chi_squared = {}
    for make, group in df.groupby(cond_col):
        make_hist, _ = np.histogram(group[val_col].dt.day, bins=day_bins, density=True)
        make_chi_squared[make] = chisquare(f_obs=make_hist, f_exp=overall_hist)[0]

    # Group keys sorted by ascending statistic.
    return sorted(make_chi_squared, key=make_chi_squared.get)


In [None]:
# Rank makes by the chi-squared distance of their test day-of-month profile,
# then keep only the makes listed in big_makes (ranking order is preserved).
makes_chi_squared = [
    name
    for name in all_chi_squared(dfm, 'test', 'make')
    if name in big_makes
]

In [None]:
# Show the ranked list of big makes.
makes_chi_squared

In [None]:
# Histogram (31 bins) of the test_expiry day of month for a single make.
# Other makes tried here: 'BYD', 'Geely', 'BMW'.
make = 'טסלה'
dfm.loc[dfm['make'] == make, 'test_expiry'].dt.day.hist(bins=31)

In [None]:
# Rows of the selected make whose test_expiry falls on the 31st and whose test
# falls on the 1st of a month.
x = dfm[
    (dfm['make'] == make)
    & (dfm['test_expiry'].dt.day == 31)
    & (dfm['test'].dt.day == 1)
]

In [None]:
# Whole days from the test date to the test_expiry date.
dfm['date_difference'] = dfm['test_expiry'].sub(dfm['test']).dt.days

In [None]:
# procrastinators by make
proc_make = dfm.query('make in @big_makes and year < 2020').groupby('make')['date_difference'].mean()

In [None]:
# procrastinators by model: mean, median and non-null count of date_difference
# per model, for rows with year >= 2015 and year < 2020
proc_model = (
    dfm.query('year < 2020 and year >= 2015')
    .groupby('model')['date_difference']
    .agg(diff_mean='mean', diff_median='median', size='count')
)

# Add the price column (mehir) for each model.
proc_model = proc_model.merge(prices, on='model')

In [None]:
# Show the per-model summary table.
proc_model

In [None]:
import plotly.express as px

# Scatter of price (mehir) against the median date_difference, restricted to
# models whose count is above 200 and below 1000; points are coloured by count.
fig = px.scatter(
    proc_model[(proc_model['size'] > 200) & (proc_model['size'] < 1000)],
    x='diff_median',
    y='mehir',
    color='size',
)
fig.show()
