In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def read_file(fname, enc='iso8859-8'):
    """Load a pipe-delimited text file into a DataFrame.

    Parameters
    ----------
    fname : path of the file to read.
    enc : text encoding used to decode the file. Bytes that cannot be
        decoded are substituted (``errors='replace'``) instead of raising.

    Returns
    -------
    pandas.DataFrame parsed with ``sep='|'``.
    """
    handle = open(fname, encoding=enc, errors='replace')
    try:
        table = pd.read_csv(handle, sep='|', low_memory=False)
    finally:
        handle.close()
    return table

def add_model(df):
    """Add a ``model`` key column to ``df`` in place.

    The key is the values of ``tozeret_cd``, ``degem_cd``, ``shnat_yitzur``
    and ``sug_degem`` joined with underscores, e.g. ``'1_10_2020_P'``.

    All four columns must hold strings; a non-string value (including a
    missing value) makes ``str.join`` raise ``TypeError``.

    Returns
    -------
    None. ``df`` gains (or has overwritten) the ``model`` column.
    """
    key_cols = ['tozeret_cd', 'degem_cd', 'shnat_yitzur', 'sug_degem']
    # Iterate the four columns in lockstep instead of using a row-wise
    # DataFrame.apply(axis=1): apply builds a Series object for every row,
    # which dominates the runtime on multi-million-row tables.
    df['model'] = ['_'.join(parts) for parts in zip(*(df[c] for c in key_cols))]

def add_model_name(df):
    """Overwrite ``df['model']`` with a readable ``"<make> <kinuy_mishari>"`` label.

    ``df`` is modified in place and must already contain the ``make`` and
    ``kinuy_mishari`` columns. Values are formatted with an f-string, so a
    missing value is rendered as its string form (e.g. ``'nan'``).

    Returns
    -------
    None.
    """
    # Walk the two columns in lockstep rather than using a row-wise
    # DataFrame.apply(axis=1), which constructs a Series per row and is very
    # slow on multi-million-row tables.
    df['model'] = [
        f"{make} {trade_name}"
        for make, trade_name in zip(df['make'], df['kinuy_mishari'])
    ]

def get_model_name(ns, model_table=None):
    """Look up manufacturer and trade name for a sequence of model keys.

    Parameters
    ----------
    ns : list-like of model keys, matched against the ``model`` column.
    model_table : DataFrame with at least the columns ``model``,
        ``tozeret_nm`` and ``kinuy_mishari``. When omitted, the function
        falls back to a module-level variable named ``models``; a
        ``NameError`` is raised if no such global exists.

    Returns
    -------
    DataFrame with the columns ``tozeret_nm`` and ``kinuy_mishari``. The
    lookup is a left join, so every key in ``ns`` is kept and keys without a
    match get missing values.
    """
    if model_table is None:
        # Legacy behaviour: rely on a global ``models`` frame being defined
        # by the caller's session.
        model_table = models
    names = pd.DataFrame(data={'model': ns})
    joined = pd.merge(names, model_table, how='left', on='model')
    return joined[['tozeret_nm', 'kinuy_mishari']]

def read_make_dict(datadir, encoding=None):
    """Read the manufacturer-name mapping from ``makes_dict.csv``.

    Each non-blank line has the form ``pattern,replacement``. The line is
    split at the first comma only and both parts are stripped of surrounding
    whitespace. A line without a comma maps the pattern to itself.

    Parameters
    ----------
    datadir : directory prefix. It is concatenated directly with the file
        name, so it must end with a path separator.
    encoding : text encoding passed to ``open``. ``None`` (the default)
        keeps the platform default encoding.

    Returns
    -------
    list of ``(pattern, replacement)`` tuples in file order.
    """
    filename = datadir + 'makes_dict.csv'

    make_dict = []
    with open(filename, 'r', encoding=encoding) as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # A blank line would produce the entry ('', ''). An empty
                # pattern matches every value under str.contains, so
                # convert_make would blank out the whole column.
                continue
            pattern, comma, replacement = line.partition(',')
            pattern = pattern.strip()
            make_dict.append((pattern, replacement.strip() if comma else pattern))

    return make_dict

def convert_make(df, make_dict, oldcol='tozeret_nm', newcol='make'):
    """Normalise manufacturer names and store them in ``df[newcol]`` in place.

    Parameters
    ----------
    df : DataFrame holding the raw names in ``oldcol``.
    make_dict : iterable of ``(pattern, replacement)`` pairs. Every value that
        contains ``pattern`` is replaced by ``replacement``. ``pattern`` is
        handed to ``Series.str.contains`` with its default settings, i.e. it
        is interpreted as a regular expression.
    oldcol : name of the source column (left untouched).
    newcol : name of the column that receives the normalised names.

    Entries are applied in order against the progressively rewritten values,
    so a later entry can override an earlier one and can also match an
    earlier entry's replacement text.

    Returns
    -------
    None.
    """
    newdat = df[oldcol].copy()
    for (m_in, m_out) in make_dict:
        # na=False: a missing name yields NaN from str.contains, and a mask
        # containing NaN cannot be used for boolean assignment. Missing names
        # are treated as "no match" and stay missing.
        hits = newdat.str.contains(m_in, na=False)
        newdat[hits] = m_out
    # NOTE(review): matching is by substring, so a short pattern can hit
    # inside a longer, unrelated word. The notebook output lists a COROLLA
    # under the Kia label, which may be such a case - confirm against the
    # raw manufacturer strings and the order of makes_dict.csv.
    df[newcol] = newdat
    
def read_merged_data(datadir):
    """Build the per-vehicle analysis table for one data snapshot.

    Steps, in order:

    1. Read ``rechev.csv`` (vehicle registry), cast the identifier columns to
       strings, derive ``make`` via ``convert_make`` and the ``model`` join
       key via ``add_model``.
    2. Left-join ``disabled-plates.csv`` on ``mispar_rechev`` and add a boolean
       ``disabled`` column that is True where a matching tag record exists.
    3. Join the ``mehir`` price from ``vehicle_cost.csv`` on the ``model`` key.
    4. Left-join selected technical columns from ``models.csv`` on ``model``.
    5. Overwrite ``model`` with the readable label from ``add_model_name``.

    Parameters
    ----------
    datadir : directory prefix of the snapshot. File names are appended by
        plain string concatenation, so it must end with a path separator.
        ``makes_dict.csv`` is read from ``datadir + '../'``.

    Returns
    -------
    The merged DataFrame.
    """
    key_cols = ['degem_cd', 'tozeret_cd', 'shnat_yitzur']

    # source : https://data.gov.il/dataset/private-and-commercial-vehicles/
    #   (if it doesn't download fully, try with Chrome)
    cars = read_file(datadir + 'rechev.csv')
    for c in ['mispar_rechev'] + key_cols:
        cars[c] = cars[c].astype(str)

    make_dict = read_make_dict(datadir + '../')
    convert_make(cars, make_dict)
    add_model(cars)

    # source: https://data.gov.il/dataset/rechev-tag-nachim
    disabled = pd.read_csv(datadir + 'disabled-plates.csv', sep='|')
    disabled.columns = ['mispar_rechev', 'taarich_tag', 'sug_tag']
    disabled['mispar_rechev'] = disabled['mispar_rechev'].astype(str)

    # Left join keeps every registry row; the merge indicator tells which of
    # them found a tag record.
    # there are about 14,000 records which are only in the disabled set, not much to do about them
    # NOTE(review): a plate that appears more than once in the tag file would
    # duplicate its vehicle row here - confirm the file has unique plates.
    df = pd.merge(cars, disabled, on='mispar_rechev', how='left', indicator=True)
    df['disabled'] = df['_merge'] == 'both'
    df = df.drop(columns=['_merge'])

    # source : https://data.gov.il/dataset/mehir_yevuan
    # add model prices
    prices = read_file(datadir + 'vehicle_cost.csv', enc='utf-8')
    for c in key_cols:
        prices[c] = prices[c].astype(str)
    prices['mehir'] = pd.to_numeric(prices['mehir'], errors='coerce')
    add_model(prices)
    # NOTE(review): this is an inner join (pandas default), so vehicles whose
    # model key has no price row are dropped from the result - confirm that
    # is intended.
    df = pd.merge(left=df, right=prices[['model', 'mehir']], on='model')

    # read detailed tech specs on each model
    # source: https://data.gov.il/dataset/degem-rechev-wltp
    models = read_file(datadir + 'models.csv', enc='utf-8')
    cols = ['tozeret_cd', 'tozeret_nm', 'shnat_yitzur',
           'degem_cd', 'nefah_manoa',
           'mishkal_kolel', 'gova',
           'hege_koah_ind',
           'automatic_ind',
           'koah_sus',
           'mispar_moshavim', 'kinuy_mishari', 'sug_degem',
           'hanaa_nm', 'merkav'
           ]
    models = models[cols].copy()
    for c in key_cols:
        models[c] = models[c].astype(str)
    add_model(models)

    # The three name/type columns are dropped from the specs table before the
    # join; 'sug_degem' and 'kinuy_mishari' are still read from the registry
    # side by add_model / add_model_name.
    specs = models.drop(columns=['kinuy_mishari', 'sug_degem', 'tozeret_nm'])
    df = pd.merge(left=df, right=specs, on='model', how='left')

    # 'make' was derived from the registry's 'tozeret_nm' above and neither
    # column is altered by the joins, so it is not recomputed here.
    add_model_name(df)

    return df

def group_by_disabled(df, group_col):
    """Cross-tabulate ``group_col`` against the ``disabled`` flag.

    Returns a frequency table with one row per distinct value of
    ``df[group_col]`` and one column per distinct value that occurs in
    ``df['disabled']``.
    """
    row_labels = df[group_col]
    flag = df['disabled']
    counts = pd.crosstab(index=row_labels, columns=flag)
    return counts

In [3]:

# Build the merged per-vehicle table for each of the two snapshot directories.
df23 = read_merged_data('../data/apr2023/')
df24 = read_merged_data('../data/dec2024/')

In [4]:
# Workbook that the following cells add sheets to via to_excel(writer, ...).
# It stays open until writer.close() is called.
writer = pd.ExcelWriter('output.xlsx')


In [5]:
def prep_disabled(df, col = 'make'):
    """Count disabled-tagged and untagged vehicles per value of ``col``.

    Parameters
    ----------
    df : DataFrame with a boolean ``disabled`` column and the column ``col``.
    col : column to group by.

    Returns
    -------
    DataFrame indexed by the values of ``col`` with the count columns
    ``False`` and ``True``, plus ``total`` (their sum) and ``prop_disabled``
    (``True`` count divided by ``total``).
    """
    dis = group_by_disabled(df, col)
    # The cross-tabulation only has a column for flag values that actually
    # occur. Add the missing one as zero counts so the arithmetic below does
    # not raise KeyError when no (or every) vehicle is tagged.
    for flag in (False, True):
        if flag not in dis.columns:
            dis[flag] = 0
    dis['total'] = dis[True] + dis[False]
    dis['prop_disabled'] = dis[True] / dis['total']
    return dis

def abs_log_ratio(a, b):
    """Return the magnitude of ``log_ratio(a, b)``, ignoring its sign."""
    signed = log_ratio(a, b)
    return abs(signed)

def log_ratio(a, b):
    """Return ``log10(a) - log10(b)``, or 0 when either argument equals zero.

    The zero guard avoids taking the logarithm of zero; in that case the
    result is reported as 0 rather than as an infinite value.
    """
    either_is_zero = (a == 0) or (b == 0)
    return 0 if either_is_zero else (np.log10(a) - np.log10(b))

In [6]:

# Share of disabled-tagged vehicles within each make, 2023 snapshot vs 2024.
col = 'make'
dis23 = prep_disabled(df23, col=col)
dis24 = prep_disabled(df24, col=col)

# Outer join on the make index keeps makes that appear in only one snapshot.
make_merged = pd.merge(
    dis23, dis24,
    how='outer', left_index=True, right_index=True,
    suffixes=('_2023', '_2024'),
)

# diff_disabled = log10(share 2023) - log10(share 2024); a negative value
# means the share is larger in the 2024 snapshot.
make_merged['diff_disabled'] = make_merged.apply(
    lambda r: log_ratio(r['prop_disabled_2023'], r['prop_disabled_2024']),
    axis=1,
)

# Keep only makes with more than 1000 vehicles in both snapshots.
large_makes = make_merged.query('total_2024 > 1000 and total_2023 > 1000')
ranked = large_makes.sort_values(by='diff_disabled', ascending=True)
output = ranked[['prop_disabled_2023', 'prop_disabled_2024', 'total_2023', 'total_2024', 'diff_disabled']]
output.to_excel(writer, sheet_name='disabled in make')
output


disabled,prop_disabled_2023,prop_disabled_2024,total_2023,total_2024,diff_disabled
make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
צ'רי,0.092965,0.176446,5615.0,24319.0,-0.278292
סרס,0.126161,0.215412,1292.0,3361.0,-0.232345
טסלה,0.138396,0.205059,13064.0,24549.0,-0.170757
BYD,0.136639,0.190065,9075.0,34946.0,-0.143326
Aiways,0.156,0.215817,2000.0,2099.0,-0.140961
פורשה,0.120455,0.161853,1760.0,2008.0,-0.128296
Geely,0.140537,0.188048,10097.0,17788.0,-0.126479
סקיוול,0.173801,0.229075,1168.0,2043.0,-0.119924
אאודי,0.133372,0.173127,40136.0,41842.0,-0.113303
BMW,0.127559,0.165077,43282.0,46039.0,-0.111977


In [7]:

# Share of disabled-tagged vehicles within each model, 2023 snapshot vs 2024.
col = 'model'
dis23 = prep_disabled(df23, col=col)
dis24 = prep_disabled(df24, col=col)

# Outer join on the model index keeps models that appear in only one snapshot.
model_merged = pd.merge(
    dis23, dis24,
    how='outer', left_index=True, right_index=True,
    suffixes=('_2023', '_2024'),
)

# diff_disabled = log10(share 2023) - log10(share 2024); a negative value
# means the share is larger in the 2024 snapshot.
model_merged['diff_disabled'] = model_merged.apply(
    lambda r: log_ratio(r['prop_disabled_2023'], r['prop_disabled_2024']),
    axis=1,
)

# Keep only models with more than 1000 vehicles in both snapshots.
large_models = model_merged.query('total_2024 > 1000 and total_2023 > 1000')
ranked = large_models.sort_values(by='diff_disabled', ascending=True)
output = ranked[['prop_disabled_2023', 'prop_disabled_2024', 'total_2023', 'total_2024', 'diff_disabled']]
output.to_excel(writer, sheet_name='disabled in model')
output


disabled,prop_disabled_2023,prop_disabled_2024,total_2023,total_2024,diff_disabled
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
צ'רי TIGGO 8 PRO,0.087673,0.176266,3399.0,10643.0,-0.303304
צ'רי FX,0.089438,0.154873,1174.0,4494.0,-0.238455
קיה BAYON,0.094414,0.159592,2023.0,4900.0,-0.227973
פולקסווגן CADDY,0.274066,0.453696,1766.0,2354.0,-0.218910
טויוטה AYGO X,0.080816,0.130106,2549.0,3966.0,-0.206800
...,...,...,...,...,...
פורד GALAXY,0.156505,0.136826,1591.0,1279.0,0.058362
פולקסווגן CADDY MAXI,0.514160,0.440476,2048.0,1932.0,0.067176
פולקסווגן CADDY KOMBI,0.259763,0.203810,1690.0,1575.0,0.105353
קיה COROLLA SDN HSD,0.229459,0.176985,5635.0,13826.0,0.112767


In [8]:
def prep_disabled_inv(df, col = 'make'):
    """Break the disabled-tagged vehicles down by ``col``.

    Only rows where ``df['disabled']`` is True are counted.

    Returns
    -------
    DataFrame indexed by the values of ``col`` with two columns:
    ``num_disabled`` (number of tagged vehicles in the group) and
    ``prop_from_disabled`` (that number divided by the total number of
    tagged vehicles).
    """
    tagged = df[df['disabled']]
    ret = tagged.groupby(col).size().to_frame('num_disabled')
    grand_total = ret['num_disabled'].sum()
    ret['prop_from_disabled'] = ret['num_disabled'] / grand_total
    return ret

In [9]:
# Composition of the disabled-tagged fleet by make, 2023 snapshot vs 2024.
disinv23 = prep_disabled_inv(df23)
disinv24 = prep_disabled_inv(df24)
make_merged = pd.merge(
    disinv23, disinv24,
    how='outer', left_index=True, right_index=True,
    suffixes=('_2023', '_2024'),
)

# diff_disabled = log10(share 2023) - log10(share 2024); a negative value
# means the make's share of tagged vehicles is larger in the 2024 snapshot.
make_merged['diff_disabled'] = make_merged.apply(
    lambda r: log_ratio(r['prop_from_disabled_2023'], r['prop_from_disabled_2024']),
    axis=1,
)

# Keep only makes with more than 500 tagged vehicles in both snapshots.
frequent_makes = make_merged.query("num_disabled_2023 > 500 and num_disabled_2024 > 500")
ranked = frequent_makes.sort_values(by='diff_disabled', ascending=True)
# NOTE(review): the sheet is ordered by diff_disabled but that column is not
# exported here, unlike the 'disabled in make' sheet - confirm this is intended.
output = ranked[['prop_from_disabled_2023', 'prop_from_disabled_2024', 'num_disabled_2023', 'num_disabled_2024']]
output.to_excel(writer, sheet_name='make in disabled')


In [10]:
# Composition of the disabled-tagged fleet by model, 2023 snapshot vs 2024.
disinv23 = prep_disabled_inv(df23, col='model')
disinv24 = prep_disabled_inv(df24, col='model')
model_merged = pd.merge(
    disinv23, disinv24,
    how='outer', left_index=True, right_index=True,
    suffixes=('_2023', '_2024'),
)

# diff_disabled = log10(share 2023) - log10(share 2024); a negative value
# means the model's share of tagged vehicles is larger in the 2024 snapshot.
model_merged['diff_disabled'] = model_merged.apply(
    lambda r: log_ratio(r['prop_from_disabled_2023'], r['prop_from_disabled_2024']),
    axis=1,
)

# Keep only models with more than 500 tagged vehicles in both snapshots.
frequent_models = model_merged.query("(num_disabled_2023 > 500) and (num_disabled_2024 > 500)")
ranked = frequent_models.sort_values(by='diff_disabled', ascending=True)
# NOTE(review): the sheet is ordered by diff_disabled but that column is not
# exported here, unlike the 'disabled in model' sheet - confirm this is intended.
output = ranked[['prop_from_disabled_2023', 'prop_from_disabled_2024', 'num_disabled_2023', 'num_disabled_2024']]

output.to_excel(writer, sheet_name='model in disabled')


In [11]:
# Close the workbook opened earlier; no further sheets can be added after this.
writer.close()

In [15]:
# Overall split of the 2023 table by the disabled flag:
# first as fractions, then as raw row counts.
flag_2023 = df23['disabled']
for as_fraction in (True, False):
    print(flag_2023.value_counts(normalize=as_fraction))

False    0.860995
True     0.139005
Name: disabled, dtype: float64
False    3126296
True      504729
Name: disabled, dtype: int64


In [16]:
# Overall split of the 2024 table by the disabled flag:
# first as fractions, then as raw row counts.
flag_2024 = df24['disabled']
for as_fraction in (True, False):
    print(flag_2024.value_counts(normalize=as_fraction))

False    0.830236
True     0.169764
Name: disabled, dtype: float64
False    3162437
True      646646
Name: disabled, dtype: int64
