# Cleaning Car Data

## Columns:

**Categorical Columns**  
- [] make_model
- [] body_type
- [] vat
- [] registration_year
- [] previous_owners
- [] type
- [] next_inspection
- [] inspection_new
- [] body_color
- [] paint_type
- [] upholstery_color
- [] upholstery_material
- [] gearing_type
- [] fuel_type
- [] particulate
- [] co2_emission
- [] emission_class
- [] drive_chain
- [] consumption_country
- [] consumption_city
- [] consumption_combined
- [] entertainment_media
- [] safety_security
- [] comfort_convenience
- [] extras
 
**Quantitative Columns**
- [] price
- [] km
- [] hp
- [] displacement
- [] warranty
- [] weight
- [] nr_of_doors
- [] nr_of_seats
- [] cylinders
- [] gears


---

In [912]:
import pandas as pd
import numpy as np

df = pd.read_csv("cleaned_car_data.csv")

In [913]:
df.head(3).T

Unnamed: 0,0,1,2
make_model,Audi A1,Audi A1,Audi A1
body_type,Sedans,Sedans,Sedans
price,15770,14500,14640
vat,VAT deductible,Price negotiable,VAT deductible
km,56013.0,80000.0,83450.0
hp,66.0,141.0,85.0
type,Used,Used,Used
previous_owners,2.0,,1.0
next_inspection,2021.0,,
inspection_new,Yes,,


In [914]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15919 entries, 0 to 15918
Data columns (total 42 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   make_model            15919 non-null  object 
 1   body_type             15859 non-null  object 
 2   price                 15919 non-null  int64  
 3   vat                   11406 non-null  object 
 4   km                    14895 non-null  float64
 5   hp                    15831 non-null  float64
 6   type                  15917 non-null  object 
 7   previous_owners       9254 non-null   float64
 8   next_inspection       2825 non-null   float64
 9   inspection_new        3570 non-null   object 
 10  warranty              4853 non-null   float64
 11  full_service          8215 non-null   object 
 12  non-smoking_vehicle   7177 non-null   object 
 13  null                  15919 non-null  object 
 14  offer_number          12744 non-null  object 
 15  first_registration 

---

## Fill Missing Values Function


### use q cut on numerical to turn into categorical.
### also use other methods

In [915]:
def fill(method, df, column, group_cols=None):
    """
    Fills NaN values in `df[column]` either using the overall mean, median or mode (no grouping)
    or group-specific mean, median or mode (group_col provided).
    Prints stats about how many NaNs were filled and the final distribution.
    """
    # Debug prints: which column is being filled, and grouping info.
    print('Filling column:', column)
    print('Grouping by:', group_cols)

    # 1. Count NaNs before filling
    nan_before = df[column].isnull().sum()

    # 2. Fill logic
    if method == 'mean':
            if group_cols is None:
                # Fill all NaN in 'column' with the overall mean
                df[column].fillna(df[column].mean(), inplace=True)
            else:
                # Calculate groupwise mean for each row
                group = df.groupby(group_cols)[column].transform(lambda x: x.mean())
                # Fill missing values in df[column] with corresponding group mean
                df[column].fillna(group, inplace=True)
    
    elif method == 'median':
        if group_cols is None:
            # Fill all NaN in 'column' with the overall median
            df[column].fillna(df[column].median(), inplace=True)
        else:
            # Calculate groupwise median for each row
            group = df.groupby(group_cols)[column].transform(lambda x: x.median())
            # Fill xmissing values in df[column] with corresponding group median
            df[column].fillna(group, inplace=True)
    
    elif method == 'mode':
        if group_cols is None:
            # Fill all NaN in 'column' with the overall mode
            df[column].fillna(df[column].mode().iloc[0], inplace=True)            
        else:
            # Calculate groupwise mode for each row
            group = df.groupby(group_cols)[column]\
            .transform(lambda x: x.mode()[0] if list(x.mode()) != [] else np.nan)
            # Fill missing values in df[column] with corresponding group mode
            df[column].fillna(group, inplace=True)

    # 3. Count NaNs after filling
    nan_after = df[column].isnull().sum()
    nan_filled = nan_before - nan_after

    # 4. Print final stats
    print("Number of NaN before filling:", nan_before)
    print("Number of NaN filled:", nan_filled)
    print("Number of NaN after filling:", nan_after)
    print("------------------")
    print(df[column].value_counts(dropna=False))

---

## Provided Functions

In [916]:
def fill_most_freq(df, group_col, col_name):
    
    '''Fills the missing values with the most existing value (mode) in the relevant column according to single-stage grouping'''
    
    for group in list(df[group_col].unique()):
        cond = df[group_col]==group
        mode = list(df[cond][col_name].mode())
        if mode != [] :
            df.loc[cond, col_name] = df.loc[cond, col_name].fillna(df[cond][col_name].mode()[0])
        else:
            df.loc[cond, col_name] = df.loc[cond, col_name].fillna(df[col_name].mode()[0])
    print("Number of NaN : ", df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [917]:
def fill_prop(df, group_col, col_name):
    
    '''Fills the missing values with "ffill and bfill method" according to single-stage grouping'''
    
    for group in list(df[group_col].unique()):
        cond = df[group_col]==group
        df.loc[cond, col_name] = df.loc[cond, col_name].fillna(method="ffill").fillna(method="bfill")
    df[col_name] = df[col_name].fillna(method="ffill").fillna(method="bfill")
    print("Number of NaN : ", df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [918]:
def double_stage(df, group_col1, group_col2, col_name, method): # method can be either "mode" or "mean" or "median" or "ffill"
    
    '''Fills the missing values with "mode/mean/median/ffill/bfill method" according to double-stage grouping'''
    
    if method == "mode":
        for group1 in list(df[group_col1].unique()):
            for group2 in list(df[group_col2].unique()):
                cond1 = df[group_col1]==group1
                cond2 = (df[group_col1]==group1) & (df[group_col2]==group2)
                mode1 = list(df[cond1][col_name].mode())
                mode2 = list(df[cond2][col_name].mode())
                if mode2 != [] :
                    df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(df[cond2][col_name].mode()[0])
                elif mode1 != [] :
                    df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(df[cond1][col_name].mode()[0])
                else:
                    df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(df[col_name].mode()[0])

    elif method == "mean":
        df[col_name].fillna(df.groupby([group_col1, group_col2])[col_name].transform("mean"), inplace = True)
        df[col_name].fillna(df.groupby(group_col1)[col_name].transform("mean"), inplace = True)
        df[col_name].fillna(df[col_name].mean(), inplace = True)
        
    elif method == "median":
        df[col_name].fillna(df.groupby([group_col1, group_col2])[col_name].transform("median"), inplace = True)
        df[col_name].fillna(df.groupby(group_col1)[col_name].transform("median"), inplace = True)
        df[col_name].fillna(df[col_name].median(), inplace = True)
        
    elif method == "ffill":           
        for group1 in list(df[group_col1].unique()):
            for group2 in list(df[group_col2].unique()):
                cond2 = (df[group_col1]==group1) & (df[group_col2]==group2)
                df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(method="ffill").fillna(method="bfill")
                
        for group1 in list(df[group_col1].unique()):
            cond1 = df[group_col1]==group1
            df.loc[cond1, col_name] = df.loc[cond1, col_name].fillna(method="ffill").fillna(method="bfill")            
           
        df[col_name] = df[col_name].fillna(method="ffill").fillna(method="bfill")
    
    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

---

## Categorical Columns
- [x] make_model
- [x] body_type
- [x] vat
- [x] registration_year
- [x] previous_owners
- [] type
- [x] next_inspection
- [x] inspection_new
- [x] body_color
- [x] paint_type
- [] upholstery_color
- [] upholstery_material
- [x] gearing_type
- [] fuel_type
- [] particulate
- [] co2_emission
- [] emission_class
- [] drive_chain
- [] consumption_country
- [] consumption_city
- [] consumption_combined
- [] entertainment_media
- [] safety_security
- [] comfort_convenience
- [] extras

### make_model

In [919]:
df['make_model'].value_counts(dropna=False)

make_model
Audi A3           3097
Audi A1           2614
Opel Insignia     2598
Opel Astra        2526
Opel Corsa        2219
Renault Clio      1839
Renault Espace     991
Renault Duster      34
Audi A2              1
Name: count, dtype: int64

### body_type

In [920]:
df['body_type'].value_counts(dropna=False)

body_type
Sedans           7903
Station wagon    3553
Compact          3153
Van               783
Other             290
Transporter        88
NaN                60
Off-Road           56
Coupe              25
Convertible         8
Name: count, dtype: int64

In [921]:
df['make_model'].value_counts(dropna=False)

make_model
Audi A3           3097
Audi A1           2614
Opel Insignia     2598
Opel Astra        2526
Opel Corsa        2219
Renault Clio      1839
Renault Espace     991
Renault Duster      34
Audi A2              1
Name: count, dtype: int64

#### same make_model same body_type so we can use fill()

In [922]:
fill('mode', df, 'body_type', 'make_model')

Filling column: body_type
Grouping by: make_model
Number of NaN before filling: 60
Number of NaN filled: 60
Number of NaN after filling: 0
------------------
body_type
Sedans           7925
Station wagon    3563
Compact          3155
Van               809
Other             290
Transporter        88
Off-Road           56
Coupe              25
Convertible         8
Name: count, dtype: int64


### vat

In [923]:
df.vat.value_counts(dropna=False)

vat
VAT deductible      10980
NaN                  4513
Price negotiable      426
Name: count, dtype: int64

In [924]:
df['vat'] = df['vat'].fillna('VAT undeductible') 

* All null values of vat column assigned as 'VAT undeductible'

### registration_year (converted to age)

In [925]:
df.loc[((df['type'] == 'New') & df.registration_year.isnull()), 'registration_year']= 2019

 - #### new cars become 2019

In [926]:
df.groupby('registration_year').km.mean()

registration_year
2016.0    77442.520958
2017.0    41754.940709
2018.0    18035.239072
2019.0     1653.107634
Name: km, dtype: float64

In [927]:
df.loc[(df['registration_year'].isnull()) & (df.km <= 5000),'registration_year'] =2019

In [928]:
df.loc[(df['registration_year'].isnull()) & (df.km > 50000),'registration_year'] =2016

In [929]:
df.loc[(df['registration_year'].isnull()) & (df.km < 19000),'registration_year'] =2018

In [930]:
df.loc[(df['registration_year'].isnull()) & (df.km < 40000),'registration_year'] =2017

- #### otherwise filled based on mileage

In [931]:
df[(df['make_model'] == 'Audi A3') & (df['body_type'] == 'Sedans')].groupby('registration_year').price.mean()

registration_year
2016.0    16702.052388
2017.0    19970.030631
2018.0    22544.715092
2019.0    24859.973913
Name: price, dtype: float64

In [932]:
df[(df['make_model'] == 'Opel Insignia')].groupby('registration_year').price.mean()

registration_year
2016.0    13606.709507
2017.0    16995.140917
2018.0    21390.167750
2019.0    32103.197015
Name: price, dtype: float64

In [933]:
df.loc[(df['registration_year'].isnull())]

Unnamed: 0,make_model,body_type,price,vat,km,hp,type,previous_owners,next_inspection,inspection_new,...,safety_security,gears,registration_year,upholstery_material,upholstery_color,fuel_type,particulate,combined_consumption,city_consumption,country_consumption
5237,Audi A3,Sedans,25400,VAT undeductible,,85.0,,,,,...,"['ABS', 'Central door lock', 'Driver-side airb...",7.0,,Cloth,,,unparticulate,3.9,4.1,3.7
5329,Audi A3,Sedans,24900,VAT undeductible,,85.0,Pre-registered,,,,...,"['ABS', 'Central door lock', 'Driver-side airb...",7.0,,Cloth,,-,unparticulate,3.9,4.1,3.7
12550,Opel Insignia,Sedans,33800,VAT undeductible,,100.0,Employee's car,,,,...,"['ABS', 'Blind spot monitor', 'Central door lo...",6.0,,Part leather,Black,l,unparticulate,5.4,7.0,4.6
12882,Opel Insignia,Station wagon,31318,VAT deductible,,100.0,Used,,,,...,"['ABS', 'Central door lock', 'Driver-side airb...",6.0,,Full leather,Black,d,unparticulate,4.3,,


In [934]:
df['registration_year']=df['registration_year'].fillna(2019)

In [935]:
df['registration_year'] = 2019 - df['registration_year']

df = df.rename(columns = {'registration_year': 'age'})

In [936]:
df['age'].value_counts(dropna=False)

age
1.0    4525
0.0    4434
3.0    3679
2.0    3281
Name: count, dtype: int64

- #### year column converted to age column

### previous owners

In [937]:
index=df[df["km"]<10].index

In [938]:
df.loc[index,"previous_owners"]=0.0

In [939]:
df.loc[index,"previous_owners"]=0.0

In [940]:
df.loc[(df["type"]=="New") & (df["previous_owners"].isnull()),"previous_owners"]=1.0

In [941]:
df.loc[((df["age"]==0) | (df["km"]<5000)) & (df["previous_owners"].isnull()),"previous_owners"]=1.0

In [942]:
mode_group_po = df.groupby(['age'])['previous_owners'].transform(lambda x: x.mode()[0] if list(x.mode()) != [] else np.nan)

In [943]:
df["previous_owners"].fillna(mode_group_po, inplace=True)

In [944]:
df["previous_owners"].value_counts(dropna=False)

previous_owners
1.0    14287
0.0      835
2.0      778
3.0       17
4.0        2
Name: count, dtype: int64

### type

In [945]:
df['type'].value_counts(dropna=False)

type
Used              11096
New                1650
Pre-registered     1364
Employee's car     1011
Demonstration       796
NaN                   2
Name: count, dtype: int64

#### Only 2 NaN so replaced with mode

In [946]:
fill('mode', df, 'type')

Filling column: type
Grouping by: None
Number of NaN before filling: 2
Number of NaN filled: 2
Number of NaN after filling: 0
------------------
type
Used              11098
New                1650
Pre-registered     1364
Employee's car     1011
Demonstration       796
Name: count, dtype: int64


### next_inspection (dropped)

In [947]:
df['next_inspection'].value_counts(dropna=False)

next_inspection
NaN       13094
2021.0     1401
2020.0      557
2022.0      483
2019.0      336
2018.0       26
2017.0        7
2023.0        5
2001.0        5
2016.0        3
2014.0        1
1921.0        1
Name: count, dtype: int64

In [948]:
df.drop(["next_inspection"],axis=1,inplace=True)

### inspection_new

In [949]:
df['inspection_new'].fillna('No', inplace=True)

In [950]:
df['inspection_new'].value_counts(dropna=False)

inspection_new
No     12349
Yes     3570
Name: count, dtype: int64

### body_color

In [951]:
df['body_color'].value_counts(dropna=False)

body_color
Black     3745
Grey      3505
White     3406
Silver    1647
Blue      1431
Red        957
NaN        597
Brown      289
Green      154
Beige      108
Yellow      51
Violet      18
Bronze       6
Orange       3
Gold         2
Name: count, dtype: int64

In [952]:
fill('mode', df, 'body_color', ['make_model', 'body_type'])

Filling column: body_color
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 597
Number of NaN filled: 594
Number of NaN after filling: 3
------------------
body_color
Black     3941
Grey      3836
White     3469
Silver    1647
Blue      1431
Red        957
Brown      289
Green      158
Beige      108
Yellow      51
Violet      18
Bronze       6
Orange       3
NaN          3
Gold         2
Name: count, dtype: int64


In [953]:
fill('mode', df, 'body_color', ['make_model'])

Filling column: body_color
Grouping by: ['make_model']
Number of NaN before filling: 3
Number of NaN filled: 3
Number of NaN after filling: 0
------------------
body_color
Black     3941
Grey      3839
White     3469
Silver    1647
Blue      1431
Red        957
Brown      289
Green      158
Beige      108
Yellow      51
Violet      18
Bronze       6
Orange       3
Gold         2
Name: count, dtype: int64


### paint_type

In [954]:
fill('mode', df, 'paint_type', ['make_model', 'body_type'])

Filling column: paint_type
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 5772
Number of NaN filled: 5761
Number of NaN after filling: 11
------------------
paint_type
Metallic       15512
Uni/basic        390
NaN               11
Perl effect        6
Name: count, dtype: int64


In [955]:
fill('mode', df, 'paint_type', 'make_model')

Filling column: paint_type
Grouping by: make_model
Number of NaN before filling: 11
Number of NaN filled: 11
Number of NaN after filling: 0
------------------
paint_type
Metallic       15523
Uni/basic        390
Perl effect        6
Name: count, dtype: int64


### upholstery_color

In [956]:
df.drop(['upholstery_color'], axis=1, inplace=True)

### upholstery_material

In [957]:
df['upholstery_material'].value_counts(dropna=False)

upholstery_material
Cloth           8423
NaN             4503
Part leather    1499
Full leather    1009
Other            368
Velour            60
alcantara         57
Name: count, dtype: int64

### gearing_type

In [958]:
df['gearing_type'].value_counts(dropna=False)

gearing_type
Manual            8153
Automatic         7297
Semi-automatic     469
Name: count, dtype: int64

### fuel

In [None]:
df['fuel']

### co2_emission

In [126]:
df['co2_emission'].value_counts(dropna=False)

co2_emission
NaN      2436
120.0     740
99.0      545
97.0      537
104.0     501
         ... 
51.0        1
165.0       1
331.0       1
80.0        1
193.0       1
Name: count, Length: 120, dtype: int64

- [] ERROR GOES AWAY WHEN CONVERT TO CATEGORICAL

In [127]:
fill("median", df, 'co2_emission', ['combined_emissions'])

Filling column: co2_emission
Grouping by: ['combined_emissions']
Number of NaN before filling: 2436
Number of NaN filled: 523
Number of NaN after filling: 1913
------------------
co2_emission
NaN      1913
120.0     796
97.0      551
99.0      548
104.0     514
         ... 
253.0       1
331.0       1
51.0        1
165.0       1
193.0       1
Name: count, Length: 120, dtype: int64


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


### emission_class

In [128]:
df['emission_class'].value_counts(dropna=False)

emission_class
Euro 6    12173
NaN        3628
Euro 5       78
Euro 4       40
Name: count, dtype: int64

### drive_chain

In [129]:
df['drive_chain'].value_counts(dropna=False)

drive_chain
front    8886
NaN      6858
4WD       171
rear        4
Name: count, dtype: int64

### consumption l/100 km 

In [130]:
df['combined_emissions'].value_counts(dropna=False).index

Index([ nan,  3.9,  4.0,  5.4,  5.1,  4.4,  3.8,  5.6,  4.7,  4.8,  5.0,  4.5,
        5.2,  4.6,  4.2,  5.3,  3.7,  4.9,  5.5,  4.1,  5.9,  3.3,  5.7,  4.3,
        3.5,  6.0,  3.6,  6.2,  5.8,  6.3,  6.1,  6.8,  6.6,  3.4,  3.0,  6.4,
        7.4,  7.1,  6.5, 10.0,  6.7,  3.2,  6.9,  8.3,  7.6,  7.0,  3.1,  7.2,
        7.8,  8.0, 51.0,  8.7,  8.6,  7.3,  7.9,  8.1, 40.0, 38.0,  0.0, 11.0,
       43.0,  7.5, 13.8, 55.0, 54.0,  1.2, 32.0, 33.0, 50.0,  1.0, 46.0,  9.1],
      dtype='float64', name='combined_emissions')

In [131]:
df['city_emissions'].value_counts(dropna=False).index

Index([ nan,  5.0,  5.8,  4.5,  4.3,  4.0,  5.1,  6.0,  6.8,  4.6,  7.2,  5.7,
        7.3,  4.2,  5.9,  7.8,  6.6,  5.2,  4.1,  6.3,  5.4,  4.7,  6.7,  3.9,
        3.5,  7.6,  7.1,  7.5,  6.9,  5.5,  7.0,  6.2,  7.4,  7.7,  6.5,  8.7,
        6.1,  4.4,  8.2,  8.0,  5.3,  6.4,  5.6,  7.9,  4.8,  4.9,  3.7,  3.4,
        9.6,  9.2,  3.3,  8.5,  8.6,  8.3,  3.8, 10.2,  8.1, 11.3, 10.0,  9.9,
        9.4,  9.1,  3.0,  0.0,  8.4,  9.8,  1.0, 62.0, 11.2,  8.9, 11.0, 10.8,
       11.5,  8.8, 10.1, 45.0,  9.5, 43.0,  3.6, 16.1, 66.0, 10.4, 10.5,  9.0,
       64.0, 19.9,  9.7],
      dtype='float64', name='city_emissions')

In [132]:
df['country_emissions'].value_counts(dropna=False).index

Index([ nan,  4.2,  3.7,  4.4,  4.5,  3.8,  3.9,  4.1,  4.7,  4.0,  3.5,  4.3,
        3.6,  3.1,  3.3,  4.6,  4.9,  3.4,  4.8,  5.3,  5.1,  5.7,  5.4,  3.2,
        3.0,  5.6,  5.0,  5.2,  6.3,  6.0, 10.0,  5.8,  5.5,  7.7,  6.6,  2.9,
        6.4,  2.8,  0.0,  7.3, 44.0,  6.5,  7.1,  6.7,  7.0, 35.0,  5.9,  6.9,
        7.8, 37.0, 10.3,  7.6, 42.0,  8.6,  6.1,  8.0,  2.0,  1.0],
      dtype='float64', name='country_emissions')

### country_version

In [133]:
df['country_version'].value_counts(dropna=False)

country_version
NaN               8333
Germany           4502
Italy             1038
European Union     507
Netherlands        464
Spain              325
Belgium            314
Austria            208
Czech Republic      52
Poland              49
France              38
Denmark             33
Hungary             28
Japan                8
Slovakia             4
Croatia              4
Sweden               3
Romania              2
Bulgaria             2
Luxembourg           1
Switzerland          1
Slovenia             1
Egypt                1
Serbia               1
Name: count, dtype: int64

### entertainment_media

In [134]:
df['entertainment_media']

0        'Bluetooth', 'Hands-free equipment', 'On-board...
1        'Bluetooth', 'Hands-free equipment', 'On-board...
2                               'MP3', 'On-board computer'
3        'Bluetooth', 'CD player', 'Hands-free equipmen...
4        'Bluetooth', 'CD player', 'Hands-free equipmen...
                               ...                        
15914    'Bluetooth', 'Digital radio', 'Hands-free equi...
15915    'Bluetooth', 'Digital radio', 'Hands-free equi...
15916    'Bluetooth', 'Hands-free equipment', 'On-board...
15917         'Bluetooth', 'Digital radio', 'Radio', 'USB'
15918                                                'USB'
Name: entertainment_media, Length: 15919, dtype: object

In [135]:
df['entertainment_media'] = df['entertainment_media'].astype('str').str.replace('[','').str.replace("]",'')

#### This column was not changed as it will be transformed with getdummy function later

### safety_security

In [136]:
df['safety_security']

0        ['ABS', 'Central door lock', 'Daytime running ...
1        ['ABS', 'Central door lock', 'Central door loc...
2        ['ABS', 'Central door lock', 'Daytime running ...
3        ['ABS', 'Alarm system', 'Central door lock wit...
4        ['ABS', 'Central door lock', 'Driver-side airb...
                               ...                        
15914    ['ABS', 'Central door lock', 'Central door loc...
15915    ['ABS', 'Adaptive Cruise Control', 'Blind spot...
15916    ['ABS', 'Adaptive Cruise Control', 'Blind spot...
15917    ['ABS', 'Blind spot monitor', 'Driver-side air...
15918    ['ABS', 'Blind spot monitor', 'Daytime running...
Name: safety_security, Length: 15919, dtype: object

#### This column was not changed as it will be transformed with getdummy function later

### comfort_convenience

In [137]:
df['comfort_convenience']

0        'Air conditioning', 'Armrest', 'Automatic clim...
1        'Air conditioning', 'Automatic climate control...
2        'Air conditioning', 'Cruise control', 'Electri...
3        'Air suspension', 'Armrest', 'Auxiliary heatin...
4        'Air conditioning', 'Armrest', 'Automatic clim...
                               ...                        
15914    'Air conditioning', 'Automatic climate control...
15915    'Air conditioning', 'Automatic climate control...
15916    'Air conditioning', 'Armrest', 'Automatic clim...
15917    'Air conditioning', 'Automatic climate control...
15918    'Air conditioning', 'Automatic climate control...
Name: comfort_convenience, Length: 15919, dtype: object

### extras

In [138]:
df['extras']

0        ['Alloy wheels', 'Catalytic Converter', 'Voice...
1        ['Alloy wheels', 'Sport seats', 'Sport suspens...
2                        ['Alloy wheels', 'Voice Control']
3         ['Alloy wheels', 'Sport seats', 'Voice Control']
4        ['Alloy wheels', 'Sport package', 'Sport suspe...
                               ...                        
15914                     ['Alloy wheels', 'Touch screen']
15915    ['Alloy wheels', 'Touch screen', 'Voice Control']
15916                                     ['Alloy wheels']
15917                     ['Alloy wheels', 'Touch screen']
15918                     ['Alloy wheels', 'Touch screen']
Name: extras, Length: 15919, dtype: object

---

## Quantitative Columns
- [] price
- [] km
- [] hp
- [] displacement
- [] warranty
- [] weight
- [] nr_of_doors
- [] nr_of_seats
- [] cylinders
- [] gears

### price

In [139]:
df['price'].value_counts(dropna=False)

price
14990    154
15990    151
10990    139
15900    106
17990    102
        ... 
17559      1
17560      1
17570      1
17575      1
39875      1
Name: count, Length: 2956, dtype: int64

In [140]:
df['price'].isnull().values.any()

False

### km

In [141]:
df['km'].value_counts(dropna=False)

km
10.0       1045
NaN        1024
1.0         367
5.0         170
50.0        148
           ... 
67469.0       1
43197.0       1
10027.0       1
35882.0       1
57.0          1
Name: count, Length: 6690, dtype: int64

In [142]:
fill('median', df, 'km', ['make_model', 'body_type', 'age'])

Filling column: km
Grouping by: ['make_model', 'body_type', 'age']


KeyError: 'age'

### hp

In [None]:
df['hp'].isnull().sum()

In [None]:
fill('median', df, 'hp')

### displacement

In [528]:
df['displacement'].isnull().sum()

496

In [529]:
fill("median", df, 'displacement', 'weight')

Filling column: displacement
Grouping by: weight
Number of NaN before filling: 496
Number of NaN filled: 10
Number of NaN after filling: 486
------------------
displacement
1598.0    4761
999.0     2440
1398.0    1315
1399.0     749
1229.0     677
          ... 
122.0        1
1198.0       1
1195.0       1
2967.0       1
1800.0       1
Name: count, Length: 78, dtype: int64


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


### warranty

In [531]:
df['warranty'].isnull().sum()

11066

In [534]:
fill('median', df, 'warranty')

Filling column: warranty
Grouping by: None
Number of NaN before filling: 11066
Number of NaN filled: 11066
Number of NaN after filling: 0
------------------
warranty
12.0    13660
24.0     1118
60.0      401
36.0      279
48.0      149
6.0       125
72.0       59
3.0        33
23.0       11
18.0       10
20.0        7
25.0        6
2.0         5
50.0        4
26.0        4
16.0        4
4.0         3
1.0         3
19.0        3
34.0        3
13.0        3
28.0        2
22.0        2
14.0        2
11.0        2
46.0        2
21.0        2
9.0         2
17.0        2
45.0        2
33.0        1
40.0        1
65.0        1
10.0        1
15.0        1
7.0         1
8.0         1
56.0        1
49.0        1
47.0        1
30.0        1
Name: count, dtype: int64


### weight

In [532]:
df['weight'].isnull().sum()

6974

In [533]:
fill("median", df, 'weight', 'displacement')

Filling column: weight
Grouping by: displacement
Number of NaN before filling: 6974
Number of NaN filled: 6395
Number of NaN after filling: 579
------------------
weight
1403.0    1759
1163.0    1422
1199.0    1262
NaN        579
1308.0     523
          ... 
1363.0       1
1507.0       1
2115.0       1
1764.0       1
2037.0       1
Name: count, Length: 437, dtype: int64


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

### nr_of_doors

In [34]:
df['nr_of_doors'].value_counts()

nr_of_doors
5.0    11575
4.0     3079
3.0      832
2.0      219
1.0        1
7.0        1
Name: count, dtype: int64

### nr_of_seats

In [35]:
df['nr_of_seats'].value_counts()

nr_of_seats
5.0    13336
4.0     1125
7.0      362
2.0      116
6.0        2
3.0        1
Name: count, dtype: int64

### cylinders

In [36]:
df['cylinders'].value_counts(dropna=False)

cylinders
4.0    8105
NaN    5680
3.0    2104
5.0      22
6.0       3
8.0       2
2.0       2
1.0       1
Name: count, dtype: int64

### gears

In [37]:
df['gears'].value_counts(dropna=False)

gears
6.0     5822
NaN     4712
5.0     3239
7.0     1908
8.0      224
9.0        6
1.0        2
3.0        2
4.0        2
2.0        1
50.0       1
Name: count, dtype: int64

---

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15919 entries, 0 to 15918
Data columns (total 45 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   make_model           15919 non-null  object 
 1   body_type            15859 non-null  object 
 2   price                15919 non-null  int64  
 3   vat                  11406 non-null  object 
 4   km                   14895 non-null  float64
 5   registration         14322 non-null  object 
 6   hp                   15831 non-null  float64
 7   type                 15917 non-null  object 
 8   previous_owners      9254 non-null   float64
 9   next_inspection      2825 non-null   float64
 10  inspection_new       3570 non-null   object 
 11  warranty             4853 non-null   float64
 12  full_service         8215 non-null   object 
 13  non-smoking_vehicle  7177 non-null   object 
 14  null                 15919 non-null  object 
 15  offer_number         12744 non-null 

In [39]:
# df.to_csv("", index=False)