# Import

In [35]:
import pandas as pd

# Data preview

In [36]:
df = pd.read_csv('data/car_evaluation.csv')
df

Unnamed: 0,buying_price,maintenance_cost,number_of_doors,number_of_persons,lug_boot,safety,decision
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   buying_price       1728 non-null   object
 1   maintenance_cost   1728 non-null   object
 2   number_of_doors    1728 non-null   object
 3   number_of_persons  1728 non-null   object
 4   lug_boot           1728 non-null   object
 5   safety             1728 non-null   object
 6   decision           1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [38]:
df.describe()


Unnamed: 0,buying_price,maintenance_cost,number_of_doors,number_of_persons,lug_boot,safety,decision
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,2,small,low,unacc
freq,432,432,432,576,576,576,1210


In [39]:
df.isnull().sum()

buying_price         0
maintenance_cost     0
number_of_doors      0
number_of_persons    0
lug_boot             0
safety               0
decision             0
dtype: int64

# Feature engineering

In [40]:
unique_values = {column: df[column].unique() for column in df.columns}
unique_values

{'buying_price': array(['vhigh', 'high', 'med', 'low'], dtype=object),
 'maintenance_cost': array(['vhigh', 'high', 'med', 'low'], dtype=object),
 'number_of_doors': array(['2', '3', '4', '5more'], dtype=object),
 'number_of_persons': array(['2', '4', 'more'], dtype=object),
 'lug_boot': array(['small', 'med', 'big'], dtype=object),
 'safety': array(['low', 'med', 'high'], dtype=object),
 'decision': array(['unacc', 'acc', 'vgood', 'good'], dtype=object)}

In [41]:
mappings = {
    'buying_price': {'vhigh': 3, 'high': 2, 'med': 1, 'low': 0},
    'maintenance_cost': {'vhigh': 3, 'high': 2, 'med': 1, 'low': 0},
    'number_of_doors': {'5more': '5'},
    'number_of_persons': {'more': '6'},
    'lug_boot': {'small': 0, 'med': 1, 'big': 2},
    'safety': {'low': 0, 'med': 1, 'high': 2},
    'decision': {'unacc': 0, 'acc': 1, 'good': 2, 'vgood': 3}
}

for column, mapping in mappings.items():
    df[column] = df[column].replace(mapping)

df

  df[column] = df[column].replace(mapping)


Unnamed: 0,buying_price,maintenance_cost,number_of_doors,number_of_persons,lug_boot,safety,decision
0,3,3,2,2,0,0,0
1,3,3,2,2,0,1,0
2,3,3,2,2,0,2,0
3,3,3,2,2,1,0,0
4,3,3,2,2,1,1,0
...,...,...,...,...,...,...,...
1723,0,0,5,6,1,1,2
1724,0,0,5,6,1,2,3
1725,0,0,5,6,2,0,0
1726,0,0,5,6,2,1,2


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   buying_price       1728 non-null   int64 
 1   maintenance_cost   1728 non-null   int64 
 2   number_of_doors    1728 non-null   object
 3   number_of_persons  1728 non-null   object
 4   lug_boot           1728 non-null   int64 
 5   safety             1728 non-null   int64 
 6   decision           1728 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 94.6+ KB


In [43]:
df['number_of_doors'] = df['number_of_doors'].astype(int)
df['number_of_persons'] = df['number_of_persons'].astype(int)

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   buying_price       1728 non-null   int64
 1   maintenance_cost   1728 non-null   int64
 2   number_of_doors    1728 non-null   int64
 3   number_of_persons  1728 non-null   int64
 4   lug_boot           1728 non-null   int64
 5   safety             1728 non-null   int64
 6   decision           1728 non-null   int64
dtypes: int64(7)
memory usage: 94.6 KB


In [45]:
df['decision'].value_counts()

decision
0    1210
1     384
2      69
3      65
Name: count, dtype: int64

# Find a representative row for each acceptability

In [73]:
unacceptable_cars = df[df['decision'] == 0]
acceptable_cars = df[df['decision'] == 1]
good_cars = df[df['decision'] == 2]
very_good_cars = df[df['decision'] == 3]

In [74]:
unacceptable_cars.describe()

Unnamed: 0,buying_price,maintenance_cost,number_of_doors,number_of_persons,lug_boot,safety,decision
count,1210.0,1210.0,1210.0,1210.0,1210.0,1210.0,1210.0
mean,1.649587,1.633058,3.454545,3.580165,0.932231,0.752893,0.0
std,1.118099,1.127547,1.127152,1.671719,0.819754,0.802763,0.0
min,0.0,0.0,2.0,2.0,0.0,0.0,0.0
25%,1.0,1.0,2.0,2.0,0.0,0.0,0.0
50%,2.0,2.0,3.0,4.0,1.0,1.0,0.0
75%,3.0,3.0,4.0,6.0,2.0,1.0,0.0
max,3.0,3.0,5.0,6.0,2.0,2.0,0.0


In [47]:
acceptable_cars.describe()

Unnamed: 0,buying_price,maintenance_cost,number_of_doors,number_of_persons,lug_boot,safety,decision
count,384.0,384.0,384.0,384.0,384.0,384.0,384.0
mean,1.424479,1.408854,3.585938,4.96875,1.101562,1.53125,1.0
std,1.041953,1.048199,1.094925,1.000816,0.799868,0.499674,0.0
min,0.0,0.0,2.0,4.0,0.0,1.0,1.0
25%,1.0,1.0,3.0,4.0,0.0,1.0,1.0
50%,1.0,1.0,4.0,4.0,1.0,2.0,1.0
75%,2.0,2.0,5.0,6.0,2.0,2.0,1.0
max,3.0,3.0,5.0,6.0,2.0,2.0,1.0


In [75]:
good_cars.describe()

Unnamed: 0,buying_price,maintenance_cost,number_of_doors,number_of_persons,lug_boot,safety,decision
count,69.0,69.0,69.0,69.0,69.0,69.0,69.0
mean,0.333333,0.333333,3.565217,4.956522,1.043478,1.434783,2.0
std,0.474858,0.474858,1.104513,1.006374,0.812309,0.49936,0.0
min,0.0,0.0,2.0,4.0,0.0,1.0,2.0
25%,0.0,0.0,3.0,4.0,0.0,1.0,2.0
50%,0.0,0.0,4.0,4.0,1.0,1.0,2.0
75%,1.0,1.0,5.0,6.0,2.0,2.0,2.0
max,1.0,1.0,5.0,6.0,2.0,2.0,2.0


In [76]:
very_good_cars.describe()

Unnamed: 0,buying_price,maintenance_cost,number_of_doors,number_of_persons,lug_boot,safety,decision
count,65.0,65.0,65.0,65.0,65.0,65.0,65.0
mean,0.4,0.8,3.769231,5.076923,1.615385,2.0,3.0
std,0.49371,0.754155,1.057255,1.004796,0.49029,0.0,0.0
min,0.0,0.0,2.0,4.0,1.0,2.0,3.0
25%,0.0,0.0,3.0,4.0,1.0,2.0,3.0
50%,0.0,1.0,4.0,6.0,2.0,2.0,3.0
75%,1.0,1.0,5.0,6.0,2.0,2.0,3.0
max,1.0,2.0,5.0,6.0,2.0,2.0,3.0


In [77]:
unacceptable_medians = unacceptable_cars.describe().loc['50%']
acceptable_medians = acceptable_cars.describe().loc['50%']
good_medians = good_cars.describe().loc['50%']
very_good_medians = very_good_cars.describe().loc['50%']

In [89]:
unacceptable_medians

buying_price         2.0
maintenance_cost     2.0
number_of_doors      3.0
number_of_persons    4.0
lug_boot             1.0
safety               1.0
decision             0.0
Name: 50%, dtype: float64

In [90]:
acceptable_medians

buying_price         1.0
maintenance_cost     1.0
number_of_doors      4.0
number_of_persons    4.0
lug_boot             1.0
safety               2.0
decision             1.0
Name: 50%, dtype: float64

In [92]:
good_medians

buying_price         0.0
maintenance_cost     0.0
number_of_doors      4.0
number_of_persons    4.0
lug_boot             1.0
safety               1.0
decision             2.0
Name: 50%, dtype: float64

In [93]:
very_good_medians

buying_price         0.0
maintenance_cost     1.0
number_of_doors      4.0
number_of_persons    6.0
lug_boot             2.0
safety               2.0
decision             3.0
Name: 50%, dtype: float64

In [81]:
def get_distance_from_median(row, medians):
    return ((row - medians).abs()).sum()

In [83]:
unacceptable_cars['distance_from_median'] = unacceptable_cars.apply(lambda row: get_distance_from_median(row, unacceptable_medians), axis=1)
acceptable_cars['distance_from_median'] = acceptable_cars.apply(lambda row: get_distance_from_median(row, acceptable_medians), axis=1)
good_cars['distance_from_median'] = good_cars.apply(lambda row: get_distance_from_median(row, good_medians), axis=1)
very_good_cars['distance_from_median'] = very_good_cars.apply(lambda row: get_distance_from_median(row, very_good_medians), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unacceptable_cars['distance_from_median'] = unacceptable_cars.apply(lambda row: get_distance_from_median(row, unacceptable_medians), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  acceptable_cars['distance_from_median'] = acceptable_cars.apply(lambda row: get_distance_from_median(row, acceptable_medians), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas

In [84]:
unacceptable_row = unacceptable_cars.loc[unacceptable_cars['distance_from_median'].idxmin()]
acceptable_row = acceptable_cars.loc[acceptable_cars['distance_from_median'].idxmin()]
good_row = good_cars.loc[good_cars['distance_from_median'].idxmin()]
very_good_row = very_good_cars.loc[very_good_cars['distance_from_median'].idxmin()]

In [85]:
unacceptable_row

buying_price            2.0
maintenance_cost        2.0
number_of_doors         3.0
number_of_persons       4.0
lug_boot                1.0
safety                  1.0
decision                0.0
distance_from_median    0.0
Name: 580, dtype: float64

In [86]:
acceptable_row

buying_price            2.0
maintenance_cost        1.0
number_of_doors         4.0
number_of_persons       4.0
lug_boot                1.0
safety                  2.0
decision                1.0
distance_from_median    1.0
Name: 716, dtype: float64

In [87]:
good_row

buying_price            0.0
maintenance_cost        0.0
number_of_doors         4.0
number_of_persons       4.0
lug_boot                1.0
safety                  1.0
decision                2.0
distance_from_median    0.0
Name: 1687, dtype: float64

In [88]:
very_good_row

buying_price            0.0
maintenance_cost        1.0
number_of_doors         4.0
number_of_persons       6.0
lug_boot                2.0
safety                  2.0
decision                3.0
distance_from_median    0.0
Name: 1592, dtype: float64