In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math

import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import statsmodels.api as sm

In [2]:
datadir = '../data/'

In [3]:
def _read_file(fname, enc='iso8859-8'):
    fd = open(fname, encoding=enc, errors='replace')
    df = pd.read_csv(fd, sep='|')
    return df

def add_model(df):
    df['model'] = df.apply(lambda x: '_'.join([x[y] for y in ['tozeret_cd', 'degem_cd', 'shnat_yitzur', 'sug_degem']]), axis=1)

def get_model_name(ns):
    names = pd.DataFrame(data={'model':ns})
    ret = pd.merge(names, models, how='left', on='model')[['tozeret_nm', 'kinuy_mishari']]
    return ret

In [4]:
def _convert_helper(df, oldcol, newcol, dictfile):
    # read dictionary
    filename=datadir + dictfile

    with open(filename, 'r') as f:
        lines = f.readlines()

    make_dict = []
    for line in lines:
        line = line.strip()
        items = line.split(',', 1)
        itm = items[0].strip()
        if len(items) > 1:
            make_dict.append((itm, items[1].strip()))
        else:
            make_dict.append((itm, itm))
    
    newdat = df[oldcol].copy()
    for (m_in, m_out) in make_dict:
        newdat[newdat.str.startswith(m_in)] = m_out
    df[newcol] = newdat
    
def convert_make(df, oldcol='tozeret_nm', newcol='make'):
    _convert_helper(df, oldcol, newcol, 'makes_dict.csv')

def convert_color(df, oldcol='tzeva_rechev', newcol='color'):
    _convert_helper(df, oldcol, newcol, 'color_dict.csv')

In [5]:
# source : https://data.gov.il/dataset/private-and-commercial-vehicles

#df = pd.read_csv('data/rechev-small.csv', encoding='iso-8859-1', sep='|')

def read_file(fname):
    fname = datadir + fname
    df = _read_file(fname)

    for c in ['mispar_rechev', 'degem_cd', 'tozeret_cd', 'shnat_yitzur']:
        df[c] = df[c].astype(str)

    df['test']= pd.to_datetime(df.mivchan_acharon_dt)
    df['test_expiry']= pd.to_datetime(df.tokef_dt)
    df['year'] = df['shnat_yitzur'].astype(int)
    c='moed_aliya_lakvish'
    df[c] = pd.to_datetime(df[c], format="%Y-%m")
    df['kvish_ym'] = df[c].dt.strftime('%Y%m')
    df['sidra'] = df['mispar_rechev'].apply(lambda k : k[-2:])
    add_model(df)
    convert_make(df)
    return df

In [6]:
# merge with the master file with all vehicles
df = read_file('rechev.csv')

convert_color(df)

df['metal'] = df['tzeva_rechev'].str.contains('מטאלי') | df['tzeva_rechev'].str.contains('מטלי')

In [7]:
palette = {
    'white': (255, 255, 255),
    'silver': (192, 192, 192),
    'grey': (128, 128, 128),
    'black': (0, 0, 0),
    'blue': (0, 0, 255),
    'beige': (245, 245, 220),
    'red': (255, 0, 0),
    'light blue': (173, 216, 230),
    'green': (0, 128, 0),
    'brown': (165, 42, 42),
    'bronze': (205, 127, 50),
    'gold': (255, 215, 0),
    'orange': (255, 165, 0),
    'eggshell': (252, 230, 201),
    'yellow': (255, 255, 0)
}

rgb_palette = {k: f"rgb{v}" for k, v in palette.items()}

color_names = {
    'לבן': 'white',
    'כסף': 'silver',
    'אפור': 'grey',
    'שחור': 'black',
    'כחול': 'blue',
    'בז': 'beige',
    'אדום': 'red',
    'תכלת': 'light blue',
    'ירוק': 'green',
    'חום': 'brown',
    'ברונזה': 'bronze',
    'זהב': 'gold',
    'כתום': 'orange',
    'קרם': 'eggshell',
    'צהוב': 'yellow'
}


heb_palette = {k: rgb_palette.get(v, None) for k, v in color_names.items()}


In [46]:
cl='חציל'
df[df['color'].str.contains(cl)]['kinuy_mishari'].value_counts().head(20)

MAZDA 2         1068
MAZDA 6          643
MAZDA 3          395
U5               319
MAZDA3           162
FOCUS            136
FIESTA           126
MAZDA3-SPORT      65
CADDY MAXI        64
MAZDA 5           51
MAZDA3 SPORT      28
GALAXY            24
MONDEO            23
CTS               21
S-MAX             19
MONDE0            19
IMPREZA           18
FORESTER          18
SONATA            11
XT5                9
Name: kinuy_mishari, dtype: int64

In [29]:
def show_colors(df, topN, title, exclude=[]):
    # Calculate color distribution by year
    color_counts = df.query('color not in @exclude').groupby(['year', 'color']).size().reset_index(name='count')
    color_counts['percent'] = color_counts.groupby('year')['count'].apply(lambda x: x / x.sum())

    # Only show the N largest colors by counts
    top_colors = color_counts.groupby('color')['count'].sum().nlargest(topN).index
    color_counts = color_counts[color_counts['color'].isin(top_colors)]

    # Map color names to color codes using the colormap
    color_counts['color_code'] = color_counts['color'].map(color_names)

    # Create stacked bar chart
    fig = px.bar(color_counts, x='year', y='percent', color='color', title=title,
             color_discrete_map=heb_palette, # Set discrete colors
             hover_data=['color', 'count'], # Show additional info on hover
             labels={'year': 'שנה', 'percent': 'שיעור', 'color': 'צבע'})
    fig.update_layout(barmode='stack') # Set stacked mode
    fig.update_yaxes(tickformat=".0%")
    fig.show()
    return color_counts

In [30]:
c=show_colors(df, 15, title="Color distribution by year")

In [36]:
exclude = ['לבן', 'שחור','כסף' ,'אפור']

In [37]:
c=show_colors(df, 15, title="Color distribution by year", exclude=exclude)

In [10]:
color_counts = df.groupby(['year', 'color']).size().reset_index(name='count')
color_counts['percent'] = color_counts.groupby('year')['count'].apply(lambda x: x / x.sum())
color_counts.query("year > 2013").groupby("color").sum().sort_values(by='count')

Unnamed: 0_level_0,year,count,percent
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
מלאנג מטאלי,2020,1,5e-06
מנדרינה מט,4030,2,8e-06
לא ידוע,8062,17,7.1e-05
חרדל,4029,20,8.9e-05
אדמדם מטאלי,14124,23,9.1e-05
טורקיז בהיר מטאלי,12112,27,0.000102
אחר,12109,30,0.000114
נחושת,20185,328,0.001468
רוז מתכתי,12112,358,0.001369
סהרה,20185,387,0.001528


In [11]:
private = "פרטי"
c=show_colors(df.query('baalut != @private'), 15, 'צבעי רכב בבעלות לא פרטית')
#c = show_colors(df.query('make == "BMW"'), 15)

In [12]:
fuel = "חשמל"
c=show_colors(df.query('year > 2016 and sug_delek_nm == @fuel'), 15, 'צבעי רכב חשמלי')

In [13]:
import numpy as np
from scipy.stats import entropy

def categorical_entropy(x):
    counts = np.unique(x, return_counts=True)[1]
    return entropy(counts)

In [14]:
entropy_df = df.groupby('make')['color'].apply(categorical_entropy).reset_index(name='entropy')

In [15]:
# select the big makes
makes = df.groupby('make').size()
big_makes = makes[makes > 10000].index.tolist()

In [16]:
entropy_df.query("entropy > 0 and make in @big_makes").sort_values(by='entropy', ascending=False)

Unnamed: 0,make,entropy
34,דייהטסו,2.065775
62,סוזוקי,2.039212
88,שברולט,2.012149
37,הונדה,1.962596
18,אופל,1.904063
54,מזדה,1.881787
61,סובארו,1.864871
63,סיאט,1.862905
67,סקודה,1.852915
44,יונדאי,1.828991


In [17]:
make = 'Geely'
c = show_colors(df.query('make == @make'), 15, make)

In [18]:
make = 'מרצדס'
c = show_colors(df.query('make == @make'), 15, make)

In [19]:
make = 'סוזוקי'
c = show_colors(df.query('make == @make'), 15, make)

In [20]:
df.query('make == @make')['year'].value_counts()

2017    16060
2018    13002
2014    12973
2016    12906
2019    12130
2015    11638
2010    10455
2021    10443
2011    10431
2008    10025
2013     9914
2020     9110
2022     8651
2012     8474
2009     5287
2023     4055
2007     3799
2006     2791
2005     1985
2004     1223
2002     1032
2003      832
2001      763
2000      442
1998      378
1999      365
1997      239
1996      120
Name: year, dtype: int64

In [21]:
df.year.value_counts()

2021    294977
2022    275542
2017    269203
2016    269137
2018    258682
2019    251181
2015    235158
2014    215421
2020    215277
2008    193776
2013    183125
2011    177434
2012    169531
2010    163916
2023    122641
2009    122041
2007     85617
2006     69763
2005     48511
2004     33996
2003     24167
2002     19046
2001     17961
2000      9738
1999      6929
1998      4501
1997      2222
1996       858
Name: year, dtype: int64

In [22]:
# metallic paints
metal = df.groupby('year').metal.mean().reset_index()

In [23]:
# create plot
fig = px.line(metal, x='year', y='metal', width=800, height=500)

# set line width and smoothing
fig.update_traces(line=dict(width=3, shape='spline', smoothing=1.3))

# set axis labels and title
fig.update_layout(xaxis_title='שנה', yaxis_title='שיעור צבע מטלי', title='צבע מטלי לפי זמן')
fig.update_yaxes(tickformat=".0%")

# show plot
fig.show()


In [24]:
df.groupby('make')['metal'].mean().sort_values()

make
Aiways       0.000000
למבורגיני    0.000000
מאן פולין    0.000000
טלקו הודו    0.000000
ואז          0.000000
               ...   
דייהטסו      0.411965
טויוטה       0.429303
רנו          0.430889
EVEASY       0.444444
קארמה        0.500000
Name: metal, Length: 89, dtype: float64

In [25]:
df[df.make == 'רנו']['tzeva_rechev'].value_counts()

שנהב לבן             48622
כסף מטלי             18312
אפור כהה מטלי        11409
שחור מטלי            10379
בז מטאלי              4374
קרם                   3446
אדום מטל              3045
כחול מטל              1516
אפור מטל              1171
כתום                  1155
כחול פחם מטלי          751
קפה מטאלי              348
ירוק מטל               307
רב גווני               284
חום                    249
כחול כהה               237
כחול                   227
צהוב                   175
אפור כחול מטלי         172
תכלת מטאלי             145
טורקיז מטאלי           115
אפור                   107
אפור כהה                77
תכלת                    71
טורקיז                  61
שחור                    60
כסף                     57
ירוק זהב מטלי           56
צהוב מטאלי              43
זהב מטאלי               43
כסף כחלחל מטלי          28
אדום                    22
בורדו מטל               19
זהוב                    18
אדום כהה (יין)          18
בורדו                   16
סגול כהה                14
י