In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math

import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import statsmodels.api as sm

In [2]:
# Directory prefix for the input CSV files; the loading code below prepends it to file names.
datadir = '../data/'

In [3]:
def _read_file(fname, enc='iso8859-8'):
    """Read a pipe-separated text file into a DataFrame.

    Parameters
    ----------
    fname : str
        Path of the file to read.
    enc : str, default 'iso8859-8'
        Text encoding used to decode the file. Bytes that cannot be decoded
        are replaced rather than raising (``errors='replace'``).

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    # Use a context manager so the handle is closed even if parsing fails;
    # the previous version left the file open for the life of the kernel.
    with open(fname, encoding=enc, errors='replace') as fd:
        return pd.read_csv(fd, sep='|')

def add_model(df):
    """Add a 'model' key column to ``df`` in place.

    The key is the underscore-joined concatenation of the columns
    'tozeret_cd', 'degem_cd', 'shnat_yitzur' and 'sug_degem', in that order.
    All four columns must hold strings.

    The concatenation is vectorised (``Series.str.cat``) instead of a
    row-wise ``apply``, which is much faster on large frames and also works
    on an empty frame. A row with a missing value in any of the four columns
    gets a missing 'model' value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four key columns; modified in place.

    Returns
    -------
    None
    """
    first, *rest = ('tozeret_cd', 'degem_cd', 'shnat_yitzur', 'sug_degem')
    df['model'] = df[first].str.cat([df[col] for col in rest], sep='_')

def get_model_name(ns):
    """Look up manufacturer and trade names for a sequence of model keys.

    ``ns`` is left-joined against the module-level ``models`` table on the
    'model' column, so the result has one row per input key (in input order,
    provided keys are unique in ``models``); keys without a match yield
    missing values.

    Returns a DataFrame with the columns 'tozeret_nm' and 'kinuy_mishari'.
    """
    lookup = pd.DataFrame({'model': ns})
    joined = lookup.merge(models, how='left', on='model')
    return joined[['tozeret_nm', 'kinuy_mishari']]

In [130]:
def convert_make(df, oldcol='tozeret_nm', newcol='make', encoding=None):
    """Normalise manufacturer names using the prefix dictionary 'makes_dict.csv'.

    The dictionary file (looked up under ``datadir``) has one entry per line,
    either ``prefix,replacement`` or just ``prefix`` (the replacement is then
    the prefix itself). Every value of ``df[oldcol]`` that starts with a
    prefix is replaced by the corresponding replacement; entries are applied
    in file order, so a later entry may rewrite the result of an earlier one.
    The result is stored in ``df[newcol]``; ``df[oldcol]`` is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update in place.
    oldcol : str, default 'tozeret_nm'
        Column holding the raw manufacturer names (strings).
    newcol : str, default 'make'
        Column that receives the normalised names.
    encoding : str or None, default None
        Encoding of the dictionary file; ``None`` keeps the platform default,
        which is what the function used before this parameter existed.

    Returns
    -------
    None
    """
    filename = datadir + 'makes_dict.csv'

    make_dict = []
    with open(filename, 'r', encoding=encoding) as f:
        for line in f:
            prefix, sep, replacement = line.strip().partition(',')
            prefix = prefix.strip()
            if not prefix:
                # A blank line (or an entry with an empty prefix) must be
                # skipped: startswith('') is true for every string, so it
                # would overwrite the whole column.
                continue
            make_dict.append((prefix, replacement.strip() if sep else prefix))

    newdat = df[oldcol].copy()
    for m_in, m_out in make_dict:
        # na=False keeps missing names out of the mask; without it a NaN in
        # the column makes the boolean indexer invalid.
        newdat[newdat.str.startswith(m_in, na=False)] = m_out
    df[newcol] = newdat

In [5]:
def trade_category(d):
    """Classify a record by its ownership history and return a Hebrew label.

    ``d`` must provide 'ownership_count' and 'months_to_first_trade'.

    * missing (NaN) ownership count      -> 'ללא' (none)
    * one owner and zero months to trade -> 'מקורי' (original)
    * first trade in under 12 months     -> 'נמכר תוך שנה' (sold within a year)
    * anything else                      -> 'נמכר תוך יותר משנה' (sold after more than a year)
    """
    owners = d['ownership_count']
    months_to_trade = d['months_to_first_trade']

    if math.isnan(owners):
        return 'ללא'
    if owners == 1 and months_to_trade == 0:
        return 'מקורי'
    return 'נמכר תוך שנה' if months_to_trade < 12 else 'נמכר תוך יותר משנה'

In [68]:
# source : https://data.gov.il/dataset/private-and-commercial-vehicles

#df = pd.read_csv('data/rechev-small.csv', encoding='iso-8859-1', sep='|')

def read_file(fname):
    """Load a vehicle file from ``datadir`` and add the derived columns.

    Steps, in order:
    * read the file with ``_read_file``;
    * cast the identifier columns to ``str``;
    * parse the last-test and expiry dates into 'test' and 'test_expiry';
    * add 'year' as the integer value of 'shnat_yitzur';
    * parse 'moed_aliya_lakvish' ("%Y-%m") and add 'kvish_ym' formatted as YYYYMM;
    * add 'sidra', the last two characters of the plate number;
    * add the 'model' key (``add_model``) and the normalised 'make' (``convert_make``).

    Returns the enriched DataFrame.
    """
    df = _read_file(datadir + fname)

    id_cols = ['mispar_rechev', 'degem_cd', 'tozeret_cd', 'shnat_yitzur']
    for col in id_cols:
        df[col] = df[col].astype(str)

    df['test'] = pd.to_datetime(df['mivchan_acharon_dt'])
    df['test_expiry'] = pd.to_datetime(df['tokef_dt'])
    df['year'] = df['shnat_yitzur'].astype(int)

    road_col = 'moed_aliya_lakvish'
    df[road_col] = pd.to_datetime(df[road_col], format="%Y-%m")
    df['kvish_ym'] = df[road_col].dt.strftime('%Y%m')

    # plate numbers are strings at this point, so plain string slicing applies
    df['sidra'] = df['mispar_rechev'].str[-2:]

    add_model(df)
    convert_make(df)
    return df

In [12]:
# Load the master file with all vehicles; read_file also adds the derived columns
# (no merge happens in this cell).
df = read_file('rechev.csv')

In [15]:
# Keep an untouched copy of the loaded frame so df can be restored without re-reading the CSV.
df_bak = df.copy()
#df = df.dropna(subset=['baalut_dt'])

In [16]:
# Manual restore switch: as written the branch never runs because the flag is set to
# False on the line just above it. Edit the first line to `revert = True` and run the
# cell to reset df from df_bak; the flag is cleared again inside the branch.
revert = False
if revert:
    df = df_bak.copy()
    revert = False

In [24]:
# Number of vehicles per (road-entry month, fuel type) for model years 2020 and later.
yearfuel = (
    df[df['year'] >= 2020]
    .groupby(['kvish_ym', 'sug_delek_nm'])
    .size()
    .reset_index(name='count')
)

In [25]:
# Market share over time, by fuel type: within each road-entry month,
# the fraction of vehicles (model years 2020+) that use each fuel.
counts = (
    df[df['year'] >= 2020]
    .groupby('kvish_ym')['sug_delek_nm']
    .value_counts(normalize=True)
    .rename('share')
    .reset_index()
)

In [143]:
# NOTE(review): big_makes is not defined in any cell visible here, so this cell relies
# on hidden kernel state and will fail on Restart & Run All.
# Judging by the output it is a per-make count Series — TODO define it explicitly above.
big_makes

יונדאי      131416
טויוטה      111824
קיה          93971
מזדה         51116
סקודה        49445
מיצובישי     36342
סיאט         34508
סוזוקי       32255
Name: make, dtype: int64

In [46]:
# add mileage data
# source: https://data.gov.il/dataset/shinui_mivne

# This file is read as UTF-8 instead of _read_file's default encoding.
mivne = _read_file(datadir + 'shinui_mivne.csv', enc='UTF-8')
# Cast the plate number to str so it matches the str-typed df['mispar_rechev']
# when the two frames are merged on that column.
for c in ['mispar_rechev']:
    mivne[c] = mivne[c].astype(str)

In [145]:
# Attach the mivne columns to every vehicle; vehicles without a match keep NaN there.
dfm = df.merge(mivne, how='left', on='mispar_rechev')

In [147]:
# Cap the odometer reading at 500,000 before deriving the daily average,
# then divide by the number of days on the road.
c = 'kilometer_test_aharon'
dfm[c] = dfm[c].clip(upper=500_000)
dfm['avg_km_day'] = dfm[c].div(dfm['days_on_road'])

In [120]:
#dfm[dfm['kilometer_test_aharon'] > 7e5][['tozeret_nm', 'kinuy_mishari', 'year', 'kilometer_test_aharon', 'baalut']]
#dfm[dfm['kilometer_test_aharon'] > 7e5]['make'].value_counts().to_dict()

In [148]:
# Popular models: "make + trade name" labels that occur more than 5000 times.
dfm['makemodel'] = dfm['make'] + ' ' + dfm['kinuy_mishari']
degem_c = dfm['makemodel'].value_counts()
degem_big = degem_c.loc[degem_c.gt(5000)].index.tolist()

In [151]:
# Median daily kilometres for each popular model, highest first.
mkm = (
    dfm.loc[dfm['makemodel'].isin(degem_big), ['avg_km_day', 'makemodel']]
    .groupby('makemodel')
    .median()
)
mkm.sort_values(by='avg_km_day', ascending=False).head(50)

Unnamed: 0_level_0,avg_km_day
makemodel,Unnamed: 1_level_1
רנו FLUENCE,83.679916
רנו GRAND COUPE,77.15936
רנו KANGOO,74.056812
סקודה RAPID,69.572314
סקודה NEW OCTAVIA,67.531488
טויוטה AURIS HYBRID,67.219465
טויוטה PRIUS PLUS,66.35641
קיה CEED,62.835685
פולקסווגן JETTA,62.5698
טויוטה VERSO,61.070046


In [150]:
# Per make, model year 2019: total kilometres divided by total ownership count.
# Only makes with an ownership count above 1000 are shown, lowest ratio first.
agg_own = (
    dfm.loc[dfm['year'] == 2019, ['kilometer_test_aharon', 'ownership_count_', 'make']]
    .groupby('make')
    .sum()
)
agg_own['avg_own_km'] = agg_own['kilometer_test_aharon'].div(agg_own['ownership_count_'])
agg_own[agg_own['ownership_count_'] > 1000].sort_values(by='avg_own_km')

Unnamed: 0_level_0,kilometer_test_aharon,ownership_count_,avg_own_km
make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
מרצדס,151183600.0,8597.0,17585.621031
BMW,186284400.0,10155.0,18344.105662
קאדילאק,20799070.0,1016.0,20471.525591
אופל,45788080.0,2035.0,22500.285995
אאודי,198823700.0,8344.0,23828.346596
רובר,46668520.0,1928.0,24205.661826
שברולט,315579000.0,11738.0,26885.241779
מזדה,587543600.0,21761.0,26999.841965
לקסוס,82374600.0,2951.0,27914.132497
קרייזלר,76215750.0,2694.0,28290.924647


In [108]:
# Per make, model year 2019: total kilometres divided by total ownership_days.
# Only numeric columns are summed. 'sidra' holds plate-suffix strings, so it is left
# out of the selection: older pandas silently dropped it from the sum (as the recorded
# output shows), while pandas >= 2.0 would concatenate the strings instead.
agg_own = (
    dfm.query('year == 2019')[['kilometer_test_aharon', 'ownership_days', 'ownership_count_', 'make']]
    .groupby('make')
    .sum()
)
agg_own['avg_km_day'] = agg_own['kilometer_test_aharon'] / agg_own['ownership_days']
agg_own.query("ownership_count_ > 1000").sort_values(by='avg_km_day')

Unnamed: 0_level_0,kilometer_test_aharon,ownership_days,ownership_count_,avg_km_day
make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
מרצדס,151183600.0,5147587.0,8597.0,29.369796
הונדה,212739600.0,6264860.0,7017.0,33.957596
פולקסווגן,154197000.0,4530011.0,4667.0,34.038989
קאדילאק,20799070.0,589873.3,1016.0,35.260234
אאודי,170677900.0,4824871.0,7353.0,35.374611
אופל,45788080.0,1292292.0,2035.0,35.431691
פיאט,32844400.0,918444.1,1127.0,35.760914
לקסוס,82374600.0,2281088.0,2951.0,36.111984
BMW,186284400.0,5129713.0,10155.0,36.314776
סוזוקי,647615500.0,17558740.0,20100.0,36.882805


In [59]:
# Mean odometer reading per plate suffix ('sidra'); note this rebinds mkm.
mkm = dfm.groupby('sidra')[['kilometer_test_aharon']].mean()

In [56]:
# Inspect the largest odometer readings next to their plate numbers.
dfm[['kilometer_test_aharon', 'mispar_rechev']].sort_values('kilometer_test_aharon', ascending=False)

Unnamed: 0,kilometer_test_aharon,mispar_rechev
3324233,9660074.0,69165801
2769277,9620000.0,45437701
2030552,9503000.0,21658502
1957171,9124985.0,9124985
1749875,9123481.0,9123481
...,...,...
3556209,,93173101
3556210,,93173601
3556211,,93174101
3556212,,93174301
