# Phase 2 - Filling Missing Values

## Columns:

**Categorical Columns**  
- make_model
- body_type
- vat
- registration_year
- type
- next_inspection
- inspection_new
- body_color
- paint_type
- upholstery_color
- upholstery_material
- gearing_type
- fuel
- emission_class
- drive_chain
- entertainment_media
- safety_security
- comfort_convenience
- extras
 
**Quantitative Columns**
- price
- km
- co2_emission
- consumption_country
- consumption_city
- consumption_combined
- hp
- displacement
- warranty
- previous_owners
- weight
- nr_of_doors
- nr_of_seats
- cylinders
- gears


---

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("car_data_1_cleaned.csv")

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15919 entries, 0 to 15918
Data columns (total 40 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   make_model            15919 non-null  object 
 1   body_type             15859 non-null  object 
 2   price                 15919 non-null  int64  
 3   vat                   11406 non-null  object 
 4   km                    14895 non-null  float64
 5   hp                    15831 non-null  float64
 6   type                  15917 non-null  object 
 7   previous_owners       9254 non-null   float64
 8   next_inspection       2825 non-null   float64
 9   inspection_new        3570 non-null   object 
 10  warranty              4853 non-null   float64
 11  full_service          8215 non-null   object 
 12  non-smoking_vehicle   7177 non-null   object 
 13  null                  15919 non-null  object 
 14  offer_number          12744 non-null  object 
 15  first_registration 

---

## fill() Function


- TODO: eventually use `pd.qcut` to bin numerical columns into categorical ones?

In [3]:
def fill(method, df, column, group_cols=None):
    """
    Fill NaN values in `df[column]` with the mean, median or mode.

    Without `group_cols` the statistic is computed over the whole column;
    with `group_cols` it is computed per group and each row receives the
    statistic of its own group. Prints how many NaNs were filled and the
    final value distribution.

    Parameters
    ----------
    method : str
        One of 'mean', 'median' or 'mode'.
    df : pandas.DataFrame
        Input frame. It is not modified; a filled copy is returned.
    column : str
        Name of the column whose NaN values are filled.
    group_cols : str or list of str, optional
        Column(s) to group by before computing the statistic.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with `column` filled. Rows whose group has no non-NaN
        value stay NaN, so a coarser grouping can be applied in a follow-up
        call.

    Raises
    ------
    ValueError
        If `method` is not 'mean', 'median' or 'mode'.
    """
    if method not in ('mean', 'median', 'mode'):
        raise ValueError(
            "method must be 'mean', 'median' or 'mode', got %r" % (method,)
        )

    def _first_mode(values):
        # mode() ignores NaN and returns an empty Series when nothing is left.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    # Work on a copy so the caller's frame is never modified.
    working_df = df.copy()

    # Debug prints: which column is being filled, and grouping info.
    print('Filling column:', column)
    print('Grouping by:', group_cols)

    # 1. Count NaNs before filling
    nan_before = working_df[column].isnull().sum()

    # 2. Work out the fill value(s): a scalar (no grouping) or a Series
    #    aligned with the frame's index (grouping).
    target = working_df[column]
    if group_cols is None:
        if method == 'mode':
            fill_values = _first_mode(target)
        else:
            fill_values = getattr(target, method)()
        # An all-NaN column has no usable statistic; leave it untouched.
        if pd.isna(fill_values):
            fill_values = None
    else:
        grouped = working_df.groupby(group_cols)[column]
        fill_values = grouped.transform(_first_mode if method == 'mode' else method)

    # Assign the result back instead of using fillna(inplace=True) on a column
    # selection, which is chained assignment and does not update the frame
    # under pandas Copy-on-Write.
    if fill_values is not None:
        working_df[column] = target.fillna(fill_values)

    # 3. Count NaNs after filling
    nan_after = working_df[column].isnull().sum()
    nan_filled = nan_before - nan_after

    # 4. Print final stats
    print("Number of NaN before filling:", nan_before)
    print("Number of NaN filled:", nan_filled)
    print("Number of NaN after filling:", nan_after)
    print("------------------")
    print(working_df[column].value_counts(dropna=False))

    return working_df

---

## Provided Functions

In [4]:
def fill_most_freq(df, group_col, col_name):

    '''Fills the missing values of `col_name` in place with the most frequent
    value (mode) of each `group_col` group (single-stage grouping).

    A group without any non-NaN value falls back to the mode of the whole
    column as it stands at that point. If the whole column has no non-NaN
    value either, the group is left unfilled instead of raising.
    Prints the remaining NaN count and the value distribution; returns None.
    '''

    def _first_mode(values):
        # mode() ignores NaN; an empty result means there is nothing to use.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else None

    for group in df[group_col].unique():
        cond = df[group_col] == group
        group_values = df.loc[cond, col_name]
        if not group_values.isnull().any():
            continue  # nothing to fill in this group
        fill_value = _first_mode(group_values)
        if fill_value is None:
            fill_value = _first_mode(df[col_name])
        if fill_value is not None:
            df.loc[cond, col_name] = group_values.fillna(fill_value)
    print("Number of NaN : ", df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [5]:
def fill_prop(df, group_col, col_name):

    '''Fills the missing values of `col_name` in place by forward fill, then
    backward fill, within each `group_col` group (single-stage grouping).

    Values still missing afterwards (groups with no non-NaN value) are filled
    by a final forward/backward fill over the whole column.
    Prints the remaining NaN count and the value distribution; returns None.
    '''

    # .ffill()/.bfill() replace fillna(method=...), which is deprecated in
    # recent pandas releases.
    for group in df[group_col].unique():
        cond = df[group_col] == group
        df.loc[cond, col_name] = df.loc[cond, col_name].ffill().bfill()
    df[col_name] = df[col_name].ffill().bfill()
    print("Number of NaN : ", df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [6]:
def double_stage(df, group_col1, group_col2, col_name, method): # method can be either "mode" or "mean" or "median" or "ffill"

    '''Fills the missing values of `col_name` in place using double-stage grouping.

    The statistic is taken from the (`group_col1`, `group_col2`) group first,
    then from the `group_col1` group, and finally from the whole column.

    method:
        "mode"   - most frequent value
        "mean"   - arithmetic mean
        "median" - median
        "ffill"  - forward fill followed by backward fill

    Prints the remaining NaN count and the value distribution; returns None.
    Raises ValueError for any other `method`.
    '''

    if method not in ("mode", "mean", "median", "ffill"):
        raise ValueError(
            'method must be "mode", "mean", "median" or "ffill", got %r' % (method,)
        )

    def _first_mode(values):
        # mode() ignores NaN; an empty result means there is nothing to use.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else None

    def _ffill_bfill(mask):
        df.loc[mask, col_name] = df.loc[mask, col_name].ffill().bfill()

    groups1 = df[group_col1].unique()
    groups2 = df[group_col2].unique()

    if method == "mode":
        for group1 in groups1:
            cond1 = df[group_col1] == group1
            for group2 in groups2:
                cond2 = cond1 & (df[group_col2] == group2)
                subset = df.loc[cond2, col_name]
                if not subset.isnull().any():
                    continue  # empty combination or nothing to fill
                # Fall back from the finest grouping to the whole column.
                fill_value = _first_mode(subset)
                if fill_value is None:
                    fill_value = _first_mode(df.loc[cond1, col_name])
                if fill_value is None:
                    fill_value = _first_mode(df[col_name])
                if fill_value is not None:
                    df.loc[cond2, col_name] = subset.fillna(fill_value)

    elif method in ("mean", "median"):
        # Assign back rather than fillna(inplace=True) on a column selection,
        # which does not update the frame under pandas Copy-on-Write.
        for keys in ([group_col1, group_col2], group_col1):
            group_stat = df.groupby(keys)[col_name].transform(method)
            df[col_name] = df[col_name].fillna(group_stat)
        overall = getattr(df[col_name], method)()
        if not pd.isna(overall):
            df[col_name] = df[col_name].fillna(overall)

    else:  # "ffill"
        for group1 in groups1:
            cond1 = df[group_col1] == group1
            for group2 in groups2:
                _ffill_bfill(cond1 & (df[group_col2] == group2))

        for group1 in groups1:
            _ffill_bfill(df[group_col1] == group1)

        df[col_name] = df[col_name].ffill().bfill()

    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

### make_model

---

## Categorical Columns
- [x] make_model
- [x] body_type
- [x] vat
- [x] registration_year
- [x] type
- [x] next_inspection
- [x] inspection_new
- [x] body_color
- [x] paint_type
- [x] upholstery_color
- [x] upholstery_material
- [x] gearing_type
- [x] fuel
- [x] emission_class
- [x] drive_chain
- [x] entertainment_media
- [x] safety_security
- [x] comfort_convenience
- [x] extras

### make_model

In [7]:
df['make_model'].value_counts(dropna=False)

make_model
Audi A3           3097
Audi A1           2614
Opel Insignia     2598
Opel Astra        2526
Opel Corsa        2219
Renault Clio      1839
Renault Espace     991
Renault Duster      34
Audi A2              1
Name: count, dtype: int64

### body_type

In [8]:
df['body_type'].value_counts(dropna=False)

body_type
Sedans           7903
Station wagon    3553
Compact          3153
Van               783
Other             290
Transporter        88
NaN                60
Off-Road           56
Coupe              25
Convertible         8
Name: count, dtype: int64

In [9]:
df['make_model'].value_counts(dropna=False)

make_model
Audi A3           3097
Audi A1           2614
Opel Insignia     2598
Opel Astra        2526
Opel Corsa        2219
Renault Clio      1839
Renault Espace     991
Renault Duster      34
Audi A2              1
Name: count, dtype: int64

#### Cars with the same make_model tend to have the same body_type, so we can use fill() grouped by make_model

In [10]:
df = fill('mode', df, 'body_type', 'make_model')

Filling column: body_type
Grouping by: make_model
Number of NaN before filling: 60
Number of NaN filled: 60
Number of NaN after filling: 0
------------------
body_type
Sedans           7925
Station wagon    3563
Compact          3155
Van               809
Other             290
Transporter        88
Off-Road           56
Coupe              25
Convertible         8
Name: count, dtype: int64


### vat

In [11]:
df.vat.value_counts(dropna=False)

vat
VAT deductible      10980
NaN                  4513
Price negotiable      426
Name: count, dtype: int64

In [12]:
df['vat'] = df['vat'].fillna('VAT undeductible') 

* All null values of vat column assigned as 'VAT undeductible'

In [13]:
df.vat.value_counts(dropna=False)

vat
VAT deductible      10980
VAT undeductible     4513
Price negotiable      426
Name: count, dtype: int64

### registration_year (converted to age)

In [14]:
df.loc[((df['type'] == 'New') & df.registration_year.isnull()), 'registration_year']= 2019

 - #### New cars with a missing registration year are assigned 2019

In [15]:
df.groupby('registration_year').km.mean()

registration_year
2016.0    77442.520958
2017.0    41754.940709
2018.0    18035.239072
2019.0     1653.107634
Name: km, dtype: float64

In [16]:
df.loc[(df['registration_year'].isnull()) & (df.km <= 5000),'registration_year'] =2019

In [17]:
df.loc[(df['registration_year'].isnull()) & (df.km > 50000),'registration_year'] =2016

In [18]:
df.loc[(df['registration_year'].isnull()) & (df.km < 19000),'registration_year'] =2018

In [19]:
# Upper bound is 50,000 km (inclusive) so that the 40,000-50,000 km band, which
# the "> 50000" rule for 2016 does not match, is assigned 2017 instead of
# staying NaN.
df.loc[(df['registration_year'].isnull()) & (df.km <= 50000),'registration_year'] =2017

- #### Otherwise the registration year is filled based on mileage (km)

In [20]:
df[(df['make_model'] == 'Audi A3') & (df['body_type'] == 'Sedans')].groupby('registration_year').price.mean()

registration_year
2016.0    16702.052388
2017.0    19970.030631
2018.0    22544.715092
2019.0    24859.973913
Name: price, dtype: float64

In [21]:
df[(df['make_model'] == 'Opel Insignia')].groupby('registration_year').price.mean()

registration_year
2016.0    13606.709507
2017.0    16995.140917
2018.0    21390.167750
2019.0    32103.197015
Name: price, dtype: float64

In [22]:
df.loc[(df['registration_year'].isnull())]

Unnamed: 0,make_model,body_type,price,vat,km,hp,type,previous_owners,next_inspection,inspection_new,...,entertainment_media,extras,safety_security,gears,registration_year,upholstery_material,upholstery_color,consumption_combined,consumption_city,consumption_country
5237,Audi A3,Sedans,25400,VAT undeductible,,85.0,,,,,...,"'Bluetooth', 'Hands-free equipment'",['Alloy wheels'],"['ABS', 'Central door lock', 'Driver-side airb...",7.0,,Cloth,,3.9,4.1,3.7
5329,Audi A3,Sedans,24900,VAT undeductible,,85.0,Pre-registered,,,,...,"'Bluetooth', 'Hands-free equipment'",['Alloy wheels'],"['ABS', 'Central door lock', 'Driver-side airb...",7.0,,Cloth,,3.9,4.1,3.7
12550,Opel Insignia,Sedans,33800,VAT undeductible,,100.0,Employee's car,,,,...,"'Bluetooth', 'Hands-free equipment', 'MP3', 'O...","['Alloy wheels', 'Sport package', 'Touch scree...","['ABS', 'Blind spot monitor', 'Central door lo...",6.0,,Part leather,Black,5.4,7.0,4.6
12882,Opel Insignia,Station wagon,31318,VAT deductible,,100.0,Used,,,,...,"'Bluetooth', 'On-board computer'","['Alloy wheels', 'Sport seats']","['ABS', 'Central door lock', 'Driver-side airb...",6.0,,Full leather,Black,4.3,,


In [23]:
df['registration_year']=df['registration_year'].fillna(2019)

In [24]:
# Age in years is measured relative to 2019, the most recent registration year
# in the groupby output shown earlier in this section.
df['registration_year'] = 2019 - df['registration_year']

# The column now holds an age rather than a year, so rename it accordingly.
df = df.rename(columns = {'registration_year': 'age'})

In [25]:
df['age'].value_counts(dropna=False)

age
1.0    4525
0.0    4434
3.0    3679
2.0    3281
Name: count, dtype: int64

- #### year column converted to age column

### type

In [26]:
df['type'].value_counts(dropna=False)

type
Used              11096
New                1650
Pre-registered     1364
Employee's car     1011
Demonstration       796
NaN                   2
Name: count, dtype: int64

#### Only 2 NaN so replaced with mode

In [27]:
df = fill('mode', df, 'type')

Filling column: type
Grouping by: None
Number of NaN before filling: 2
Number of NaN filled: 2
Number of NaN after filling: 0
------------------
type
Used              11098
New                1650
Pre-registered     1364
Employee's car     1011
Demonstration       796
Name: count, dtype: int64


### next_inspection (dropped)

In [28]:
df['next_inspection'].value_counts(dropna=False)

next_inspection
NaN       13094
2021.0     1401
2020.0      557
2022.0      483
2019.0      336
2018.0       26
2017.0        7
2023.0        5
2001.0        5
2016.0        3
2014.0        1
1921.0        1
Name: count, dtype: int64

In [29]:
# Drop the column by rebinding `df` rather than mutating it in place.
df = df.drop(columns=["next_inspection"])

### inspection_new

In [30]:
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update `df` under pandas Copy-on-Write.
df['inspection_new'] = df['inspection_new'].fillna('No')

In [31]:
df['inspection_new'].value_counts(dropna=False)

inspection_new
No     12349
Yes     3570
Name: count, dtype: int64

### body_color

In [32]:
df['body_color'].value_counts(dropna=False)

body_color
Black     3745
Grey      3505
White     3406
Silver    1647
Blue      1431
Red        957
NaN        597
Brown      289
Green      154
Beige      108
Yellow      51
Violet      18
Bronze       6
Orange       3
Gold         2
Name: count, dtype: int64

In [33]:
df = fill('mode', df, 'body_color', ['make_model', 'body_type'])

Filling column: body_color
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 597
Number of NaN filled: 594
Number of NaN after filling: 3
------------------
body_color
Black     3941
Grey      3836
White     3469
Silver    1647
Blue      1431
Red        957
Brown      289
Green      158
Beige      108
Yellow      51
Violet      18
Bronze       6
Orange       3
NaN          3
Gold         2
Name: count, dtype: int64


In [34]:
df = fill('mode', df, 'body_color', ['make_model'])

Filling column: body_color
Grouping by: ['make_model']
Number of NaN before filling: 3
Number of NaN filled: 3
Number of NaN after filling: 0
------------------
body_color
Black     3941
Grey      3839
White     3469
Silver    1647
Blue      1431
Red        957
Brown      289
Green      158
Beige      108
Yellow      51
Violet      18
Bronze       6
Orange       3
Gold         2
Name: count, dtype: int64


### paint_type

In [35]:
df = fill('mode', df, 'paint_type', ['make_model', 'body_type'])

Filling column: paint_type
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 5772
Number of NaN filled: 5761
Number of NaN after filling: 11
------------------
paint_type
Metallic       15512
Uni/basic        390
NaN               11
Perl effect        6
Name: count, dtype: int64


In [36]:
df = fill('mode', df, 'paint_type', 'make_model')

Filling column: paint_type
Grouping by: make_model
Number of NaN before filling: 11
Number of NaN filled: 11
Number of NaN after filling: 0
------------------
paint_type
Metallic       15523
Uni/basic        390
Perl effect        6
Name: count, dtype: int64


### upholstery_color (dropped)

In [37]:
# Drop the column by rebinding `df` rather than mutating it in place.
df = df.drop(columns=['upholstery_color'])

### upholstery_material

In [38]:
df['upholstery_material'].value_counts(dropna=False)

upholstery_material
Cloth           8423
NaN             4503
Part leather    1499
Full leather    1009
Other            368
Velour            60
alcantara         57
Name: count, dtype: int64

In [39]:
df = fill('mode', df, 'upholstery_material', ['make_model', 'body_type'])

Filling column: upholstery_material
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 4503
Number of NaN filled: 4494
Number of NaN after filling: 9
------------------
upholstery_material
Cloth           12457
Part leather     1552
Full leather     1380
Other             404
Velour             60
alcantara          57
NaN                 9
Name: count, dtype: int64


In [40]:
df = fill('mode', df, 'upholstery_material')

Filling column: upholstery_material
Grouping by: None
Number of NaN before filling: 9
Number of NaN filled: 9
Number of NaN after filling: 0
------------------
upholstery_material
Cloth           12466
Part leather     1552
Full leather     1380
Other             404
Velour             60
alcantara          57
Name: count, dtype: int64


### gearing_type

In [41]:
df['gearing_type'].value_counts(dropna=False)

gearing_type
Manual            8153
Automatic         7297
Semi-automatic     469
Name: count, dtype: int64

### fuel

In [42]:
df['fuel'].value_counts(dropna=False)

fuel
Benzine     8551
Diesel      7299
LPG/CNG       64
Electric       5
Name: count, dtype: int64

### emission_class

In [43]:
df = fill("mode", df, 'emission_class', 'co2_emission')

Filling column: emission_class
Grouping by: co2_emission
Number of NaN before filling: 3628
Number of NaN filled: 1636
Number of NaN after filling: 1992
------------------
emission_class
Euro 6    13809
NaN        1992
Euro 5       78
Euro 4       40
Name: count, dtype: int64


In [44]:
df = fill("mode", df, 'emission_class', 'age')

Filling column: emission_class
Grouping by: age
Number of NaN before filling: 1992
Number of NaN filled: 1992
Number of NaN after filling: 0
------------------
emission_class
Euro 6    15801
Euro 5       78
Euro 4       40
Name: count, dtype: int64


### drive_chain

In [45]:
df = fill("mode", df, 'drive_chain', ['make_model', 'body_type'])

Filling column: drive_chain
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 6858
Number of NaN filled: 6846
Number of NaN after filling: 12
------------------
drive_chain
front    15699
4WD        204
NaN         12
rear         4
Name: count, dtype: int64


In [46]:
df = fill("mode", df, 'drive_chain', ['make_model'])

Filling column: drive_chain
Grouping by: ['make_model']
Number of NaN before filling: 12
Number of NaN filled: 12
Number of NaN after filling: 0
------------------
drive_chain
front    15711
4WD        204
rear         4
Name: count, dtype: int64


### entertainment_media

In [47]:
df['entertainment_media']

0        'Bluetooth', 'Hands-free equipment', 'On-board...
1        'Bluetooth', 'Hands-free equipment', 'On-board...
2                               'MP3', 'On-board computer'
3        'Bluetooth', 'CD player', 'Hands-free equipmen...
4        'Bluetooth', 'CD player', 'Hands-free equipmen...
                               ...                        
15914    'Bluetooth', 'Digital radio', 'Hands-free equi...
15915    'Bluetooth', 'Digital radio', 'Hands-free equi...
15916    'Bluetooth', 'Hands-free equipment', 'On-board...
15917         'Bluetooth', 'Digital radio', 'Radio', 'USB'
15918                                                'USB'
Name: entertainment_media, Length: 15919, dtype: object

In [48]:
# Strip the list brackets as literal characters (regex=False). The column is not
# cast with astype('str') first, because that would turn missing values into the
# literal string 'nan'; .str.replace leaves NaN entries as NaN.
# NOTE(review): assumes every non-missing entry is already a string, as read from the CSV.
df['entertainment_media'] = df['entertainment_media'].str.replace('[', '', regex=False).str.replace(']', '', regex=False)

#### Apart from stripping the square brackets, this column is not imputed, as it will be transformed with the `get_dummies` function later

### safety_security

In [49]:
df = fill("mode", df, 'safety_security', ['make_model', 'body_type'])

Filling column: safety_security
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 982
Number of NaN filled: 982
Number of NaN after filling: 0
------------------
safety_security
['ABS', 'Central door lock', 'Daytime running lights', 'Driver-side airbag', 'Electronic stability control', 'Fog lights', 'Immobilizer', 'Isofix', 'Passenger-side airbag', 'Power steering', 'Side airbag', 'Tire pressure monitoring system', 'Traction control']                                                                                                                                                     729
['ABS', 'Central door lock', 'Daytime running lights', 'Driver-side airbag', 'Electronic stability control', 'Immobilizer', 'Isofix', 'Passenger-side airbag', 'Power steering', 'Side airbag', 'Tire pressure monitoring system', 'Traction control']                                                                                                                                              

#### Apart from filling missing values with the group mode, this column is left unchanged, as it will be transformed with the `get_dummies` function later

### comfort_convenience

In [50]:
df = fill("mode", df, 'comfort_convenience', ['make_model', 'body_type'])

Filling column: comfort_convenience
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 920
Number of NaN filled: 920
Number of NaN after filling: 0
------------------
comfort_convenience
'Air conditioning', 'Electrical side mirrors', 'Hill Holder', 'Power windows'                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            315
'Air conditioning', 'Armrest', 'Automatic climate control', 'Cruise control', 'Electrical side mirrors', 'Leather steering wheel', 'Light sensor', 'Lumbar support', 'Multi-function steering wheel', 'Navigation system', 'Park Distance Co

### extras

In [51]:
df = fill("mode", df, 'extras', ['make_model', 'body_type'])

Filling column: extras
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 2962
Number of NaN filled: 2951
Number of NaN after filling: 11
------------------
extras
['Alloy wheels']                                                                                                                 5870
['Alloy wheels', 'Touch screen']                                                                                                  697
['Alloy wheels', 'Voice Control']                                                                                                 582
['Alloy wheels', 'Touch screen', 'Voice Control']                                                                                 544
['Roof rack']                                                                                                                     538
                                                                                                                                 ... 
['Alloy whee

In [52]:
df =fill("mode", df, 'extras', 'make_model')

Filling column: extras
Grouping by: make_model
Number of NaN before filling: 11
Number of NaN filled: 11
Number of NaN after filling: 0
------------------
extras
['Alloy wheels']                                                                                                                 5881
['Alloy wheels', 'Touch screen']                                                                                                  697
['Alloy wheels', 'Voice Control']                                                                                                 582
['Alloy wheels', 'Touch screen', 'Voice Control']                                                                                 544
['Roof rack']                                                                                                                     538
                                                                                                                                 ... 
['Alloy wheels', 'Catalytic Conver

---

## Quantitative Columns
- [x] price
- [x] km
- [x] co2_emission
- [x] consumption_country
- [x] consumption_city
- [x] consumption_combined
- [x] hp
- [x] displacement
- [x] warranty
- [x] previous_owners
- [x] weight
- [x] nr_of_doors
- [x] nr_of_seats
- [x] cylinders
- [x] gears

### price

In [53]:
df['price'].value_counts(dropna=False)

price
14990    154
15990    151
10990    139
15900    106
17990    102
        ... 
17559      1
17560      1
17570      1
17575      1
39875      1
Name: count, Length: 2956, dtype: int64

In [54]:
df['price'].isnull().values.any()

False

### km

In [55]:
df['km'].value_counts(dropna=False)

km
10.0       1045
NaN        1024
1.0         367
5.0         170
50.0        148
           ... 
67469.0       1
43197.0       1
10027.0       1
35882.0       1
57.0          1
Name: count, Length: 6690, dtype: int64

In [56]:
df = fill('median', df, 'km', ['make_model', 'body_type', 'age'])

Filling column: km
Grouping by: ['make_model', 'body_type', 'age']
Number of NaN before filling: 1024
Number of NaN filled: 1020
Number of NaN after filling: 4
------------------
km
10.0        1277
1.0          367
100.0        197
750.0        181
5.0          174
            ... 
199000.0       1
157391.0       1
180573.0       1
87195.0        1
57.0           1
Name: count, Length: 6695, dtype: int64


In [57]:
df = fill('median', df, 'km', ['make_model', 'age'])

Filling column: km
Grouping by: ['make_model', 'age']
Number of NaN before filling: 4
Number of NaN filled: 4
Number of NaN after filling: 0
------------------
km
10.0       1280
1.0         367
100.0       197
750.0       181
5.0         174
           ... 
36020.0       1
53433.0       1
67469.0       1
43197.0       1
57.0          1
Name: count, Length: 6694, dtype: int64


### co2_emission

In [58]:
df['co2_emission'].value_counts(dropna=False)

co2_emission
NaN      2436
120.0     740
99.0      545
97.0      537
104.0     501
         ... 
51.0        1
165.0       1
331.0       1
80.0        1
193.0       1
Name: count, Length: 120, dtype: int64

In [59]:
df = fill("mode", df, 'co2_emission', ['make_model', 'displacement'])

Filling column: co2_emission
Grouping by: ['make_model', 'displacement']
Number of NaN before filling: 2436
Number of NaN filled: 1950
Number of NaN after filling: 486
------------------
co2_emission
120.0    984
97.0     773
99.0     692
114.0    577
119.0    556
        ... 
51.0       1
165.0      1
160.0      1
80.0       1
193.0      1
Name: count, Length: 120, dtype: int64


In [60]:
df = fill("mode", df, 'co2_emission', 'make_model')

Filling column: co2_emission
Grouping by: make_model
Number of NaN before filling: 486
Number of NaN filled: 485
Number of NaN after filling: 1
------------------
co2_emission
120.0    1141
97.0      793
99.0      762
114.0     653
119.0     622
         ... 
990.0       1
80.0        1
160.0       1
331.0       1
193.0       1
Name: count, Length: 120, dtype: int64


In [61]:
df = fill("mode", df, 'co2_emission', 'body_type')

Filling column: co2_emission
Grouping by: body_type
Number of NaN before filling: 1
Number of NaN filled: 1
Number of NaN after filling: 0
------------------
co2_emission
120.0    1141
97.0      793
99.0      762
114.0     653
119.0     622
         ... 
165.0       1
80.0        1
160.0       1
331.0       1
193.0       1
Name: count, Length: 119, dtype: int64


### consumption_city (mode for domain reasons)

In [62]:
df = fill("mode", df, 'consumption_city', 'co2_emission')

Filling column: consumption_city
Grouping by: co2_emission
Number of NaN before filling: 2422
Number of NaN filled: 2421
Number of NaN after filling: 1
------------------
consumption_city
5.0     1509
6.8      826
4.3      738
5.8      689
4.5      608
        ... 
19.9       1
64.0       1
10.5       1
66.0       1
9.7        1
Name: count, Length: 87, dtype: int64


In [63]:
df = fill("mode", df, 'consumption_city', 'displacement')

Filling column: consumption_city
Grouping by: displacement
Number of NaN before filling: 1
Number of NaN filled: 1
Number of NaN after filling: 0
------------------
consumption_city
5.0     1509
6.8      826
4.3      738
5.8      689
4.5      608
        ... 
9.0        1
66.0       1
16.1       1
10.4       1
9.7        1
Name: count, Length: 86, dtype: int64


### consumption_country (mode for domain reasons)

In [64]:
df = fill("mode", df, 'consumption_country', 'co2_emission')

Filling column: consumption_country
Grouping by: co2_emission
Number of NaN before filling: 3212
Number of NaN filled: 3206
Number of NaN after filling: 6
------------------
consumption_country
3.7     1336
4.2     1290
4.4     1260
4.5     1096
3.8     1085
3.9     1038
3.5      731
4.7      724
3.1      690
3.6      604
4.0      578
4.1      574
4.9      560
4.3      526
4.6      500
3.3      470
3.4      373
4.8      342
5.3      303
5.7      296
5.1      262
5.4      235
5.6      194
3.2      180
3.0      142
5.0      106
5.2       84
6.3       55
6.0       41
5.5       36
10.0      33
5.8       33
7.7       20
6.6       19
6.4       17
2.9       16
0.0       15
2.8        9
7.1        6
NaN        6
6.5        4
44.0       4
7.3        4
35.0       2
7.8        2
6.7        2
5.9        2
7.0        2
6.9        2
37.0       2
2.0        1
8.0        1
8.6        1
6.1        1
7.6        1
42.0       1
10.3       1
1.0        1
Name: count, dtype: int64


In [65]:
df = fill("mode", df, 'consumption_country', 'displacement')

Filling column: consumption_country
Grouping by: displacement
Number of NaN before filling: 6
Number of NaN filled: 6
Number of NaN after filling: 0
------------------
consumption_country
3.7     1336
4.2     1293
4.4     1260
4.5     1097
3.8     1085
3.9     1038
3.5      731
4.7      724
3.1      690
3.6      604
4.0      578
4.1      574
4.9      560
4.3      526
4.6      500
3.3      470
3.4      373
4.8      342
5.3      304
5.7      296
5.1      262
5.4      235
5.6      194
3.2      180
3.0      142
5.0      106
5.2       84
6.3       55
6.0       41
5.5       36
10.0      34
5.8       33
7.7       20
6.6       19
6.4       17
2.9       16
0.0       15
2.8        9
7.1        6
7.3        4
44.0       4
6.5        4
6.9        2
7.0        2
35.0       2
37.0       2
5.9        2
7.8        2
6.7        2
2.0        1
8.0        1
8.6        1
10.3       1
6.1        1
7.6        1
42.0       1
1.0        1
Name: count, dtype: int64


### consumption_combined (city/country average first, then mode for domain reasons)

- #### fill consumption_combined with the mean of consumption_city and consumption_country

In [66]:
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update `df` under pandas Copy-on-Write.
df['consumption_combined'] = df['consumption_combined'].fillna((df['consumption_city'] + df['consumption_country']) / 2)

In [67]:
df['consumption_combined'].value_counts(dropna=False)

consumption_combined
3.90     962
5.60     891
4.00     730
5.40     664
5.10     638
        ... 
55.00      1
7.50       1
3.35       1
43.00      1
4.25       1
Name: count, Length: 105, dtype: int64

In [68]:
df = fill("mode", df, 'consumption_combined', 'co2_emission')

Filling column: consumption_combined
Grouping by: co2_emission
Number of NaN before filling: 0
Number of NaN filled: 0
Number of NaN after filling: 0
------------------
consumption_combined
3.90     962
5.60     891
4.00     730
5.40     664
5.10     638
        ... 
55.00      1
7.50       1
3.35       1
43.00      1
4.25       1
Name: count, Length: 105, dtype: int64


In [69]:
df = fill("mode", df, 'consumption_combined', 'displacement')

Filling column: consumption_combined
Grouping by: displacement
Number of NaN before filling: 0
Number of NaN filled: 0
Number of NaN after filling: 0
------------------
consumption_combined
3.90     962
5.60     891
4.00     730
5.40     664
5.10     638
        ... 
55.00      1
7.50       1
3.35       1
43.00      1
4.25       1
Name: count, Length: 105, dtype: int64


### hp (mode for domain reasons)

In [70]:
# First pass for hp: most specific grouping (model, body type and engine displacement).
df = fill(
    method="mode",
    df=df,
    column="hp",
    group_cols=["make_model", "body_type", "displacement"],
)

Filling column: hp
Grouping by: ['make_model', 'body_type', 'displacement']
Number of NaN before filling: 88
Number of NaN filled: 3
Number of NaN after filling: 85
------------------
hp
85.0     2542
66.0     2122
81.0     1402
100.0    1309
110.0    1113
         ... 
84.0        1
195.0       1
44.0        1
239.0       1
9.0         1
Name: count, Length: 81, dtype: int64


In [71]:
# Second pass for hp: same model/body grouping, keyed on cylinders instead of displacement.
df = fill(
    method="mode",
    df=df,
    column="hp",
    group_cols=["make_model", "body_type", "cylinders"],
)

Filling column: hp
Grouping by: ['make_model', 'body_type', 'cylinders']
Number of NaN before filling: 85
Number of NaN filled: 8
Number of NaN after filling: 77
------------------
hp
85.0     2542
66.0     2122
81.0     1402
100.0    1309
110.0    1113
         ... 
84.0        1
195.0       1
44.0        1
239.0       1
9.0         1
Name: count, Length: 81, dtype: int64


In [72]:
# Final pass for hp: coarser grouping (model and body type only) for whatever is still missing.
df = fill(
    method="mode",
    df=df,
    column="hp",
    group_cols=["make_model", "body_type"],
)

Filling column: hp
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 77
Number of NaN filled: 77
Number of NaN after filling: 0
------------------
hp
85.0     2543
66.0     2124
81.0     1402
100.0    1315
110.0    1114
         ... 
123.0       1
44.0        1
195.0       1
75.0        1
9.0         1
Name: count, Length: 80, dtype: int64


### displacement (mode for domain reasons)

In [73]:
# First pass for displacement: most specific grouping (model, body type and hp).
df = fill(
    method="mode",
    df=df,
    column="displacement",
    group_cols=["make_model", "body_type", "hp"],
)

Filling column: displacement
Grouping by: ['make_model', 'body_type', 'hp']
Number of NaN before filling: 496
Number of NaN filled: 480
Number of NaN after filling: 16
------------------
displacement
1598.0    4978
999.0     2467
1398.0    1346
1399.0     766
1229.0     709
          ... 
122.0        1
1195.0       1
2967.0       1
1856.0       1
1800.0       1
Name: count, Length: 78, dtype: int64


In [74]:
# Second pass for displacement: drop hp from the grouping for the rows still missing.
df = fill(
    method="mode",
    df=df,
    column="displacement",
    group_cols=["make_model", "body_type"],
)

Filling column: displacement
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 16
Number of NaN filled: 13
Number of NaN after filling: 3
------------------
displacement
1598.0    4984
999.0     2468
1398.0    1347
1399.0     768
1229.0     709
          ... 
2967.0       1
1369.0       1
1390.0       1
122.0        1
1800.0       1
Name: count, Length: 78, dtype: int64


In [75]:
# Final pass for displacement: group by model only.
df = fill(method="mode", df=df, column="displacement", group_cols="make_model")

Filling column: displacement
Grouping by: make_model
Number of NaN before filling: 3
Number of NaN filled: 3
Number of NaN after filling: 0
------------------
displacement
1598.0    4987
999.0     2468
1398.0    1347
1399.0     768
1229.0     709
          ... 
1390.0       1
122.0        1
1198.0       1
2967.0       1
1800.0       1
Name: count, Length: 77, dtype: int64


### warranty

In [76]:
# A missing warranty value is recorded as 0.0.
df.loc[df['warranty'].isna(), 'warranty'] = 0.0

### previous_owners

In [77]:
# Row labels of the cars with fewer than 10 km on the odometer.
index = df.loc[df["km"] < 10].index

In [78]:
# Cars under 10 km get zero previous owners; this overwrites any existing value on those rows.
df.loc[index, "previous_owners"] = 0.0

In [79]:
# The previous cell already set previous_owners to 0.0 for the rows in `index`.
# Repeating the assignment was a copy-paste leftover; verify the invariant instead.
assert df.loc[index, "previous_owners"].eq(0.0).all()

In [80]:
# Cars of type "New" whose owner count is still missing are set to 1.0.
new_without_owner = df["type"].eq("New") & df["previous_owners"].isna()
df.loc[new_without_owner, "previous_owners"] = 1.0

In [81]:
# Cars with age 0 or under 5000 km whose owner count is still missing are set to 1.0.
young_or_low_km = df["age"].eq(0) | df["km"].lt(5000)
df.loc[young_or_low_km & df["previous_owners"].isna(), "previous_owners"] = 1.0

In [82]:
def _first_mode_or_nan(values):
    """Return the most frequent value in `values`, or NaN when there is no mode."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan

# Per-row mode of previous_owners within the row's age group, aligned to df's index.
mode_group_po = df.groupby(['age'])['previous_owners'].transform(_first_mode_or_nan)

In [83]:
# Fill the remaining gaps with the per-age-group mode computed above.
# The result is assigned back to the column: fillna(inplace=True) on a column selection
# is chained assignment, which raises a FutureWarning in pandas 2.x and silently does
# nothing once Copy-on-Write is enabled.
df["previous_owners"] = df["previous_owners"].fillna(mode_group_po)

In [84]:
# Inspect the distribution after the fills; dropna=False keeps any remaining NaN in the counts.
df["previous_owners"].value_counts(dropna=False)

previous_owners
1.0    14278
0.0      844
2.0      778
3.0       17
4.0        2
Name: count, dtype: int64

### weight (mode for domain reasons)

In [85]:
# First pass for weight: most specific grouping (model, body type and hp).
df = fill(
    method="mode",
    df=df,
    column="weight",
    group_cols=["make_model", "body_type", "hp"],
)

Filling column: weight
Grouping by: ['make_model', 'body_type', 'hp']
Number of NaN before filling: 6974
Number of NaN filled: 6703
Number of NaN after filling: 271
------------------
weight
1163.0    1483
1360.0     944
1165.0     586
1135.0     453
1350.0     394
          ... 
840.0        1
1591.0       1
1137.0       1
1507.0       1
2037.0       1
Name: count, Length: 435, dtype: int64


In [86]:
# Second pass for weight: drop hp from the grouping for the rows still missing.
df = fill(
    method="mode",
    df=df,
    column="weight",
    group_cols=["make_model", "body_type"],
)

Filling column: weight
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 271
Number of NaN filled: 228
Number of NaN after filling: 43
------------------
weight
1163.0    1493
1360.0     960
1165.0     643
1135.0     455
1734.0     409
          ... 
840.0        1
1591.0       1
1137.0       1
1507.0       1
2037.0       1
Name: count, Length: 435, dtype: int64


In [87]:
# Third pass for weight: group by model and hp, ignoring body type.
df = fill(
    method="mode",
    df=df,
    column="weight",
    group_cols=["make_model", "hp"],
)

Filling column: weight
Grouping by: ['make_model', 'hp']
Number of NaN before filling: 43
Number of NaN filled: 8
Number of NaN after filling: 35
------------------
weight
1163.0    1493
1360.0     960
1165.0     643
1135.0     455
1734.0     410
          ... 
840.0        1
1591.0       1
1137.0       1
1507.0       1
2037.0       1
Name: count, Length: 435, dtype: int64


In [88]:
# Final pass for weight: group by body type only.
df = fill(method="mode", df=df, column="weight", group_cols=["body_type"])

Filling column: weight
Grouping by: ['body_type']
Number of NaN before filling: 35
Number of NaN filled: 35
Number of NaN after filling: 0
------------------
weight
1163.0    1493
1360.0     960
1165.0     643
1135.0     455
1734.0     445
          ... 
840.0        1
1507.0       1
2115.0       1
1764.0       1
2037.0       1
Name: count, Length: 434, dtype: int64


### nr_of_doors

In [89]:
# First pass for nr_of_doors: group by model and body type.
df = fill(
    method="mode",
    df=df,
    column="nr_of_doors",
    group_cols=["make_model", "body_type"],
)

Filling column: nr_of_doors
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 212
Number of NaN filled: 210
Number of NaN after filling: 2
------------------
nr_of_doors
5.0    11785
4.0     3079
3.0      832
2.0      219
NaN        2
1.0        1
7.0        1
Name: count, dtype: int64


In [90]:
# Final pass for nr_of_doors: group by model only.
df = fill(method="mode", df=df, column="nr_of_doors", group_cols="make_model")

Filling column: nr_of_doors
Grouping by: make_model
Number of NaN before filling: 2
Number of NaN filled: 2
Number of NaN after filling: 0
------------------
nr_of_doors
5.0    11787
4.0     3079
3.0      832
2.0      219
1.0        1
7.0        1
Name: count, dtype: int64


### nr_of_seats

In [91]:
# First pass for nr_of_seats: group by model and body type.
df = fill(
    method="mode",
    df=df,
    column="nr_of_seats",
    group_cols=["make_model", "body_type"],
)

Filling column: nr_of_seats
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 977
Number of NaN filled: 971
Number of NaN after filling: 6
------------------
nr_of_seats
5.0    14302
4.0     1127
7.0      362
2.0      119
NaN        6
6.0        2
3.0        1
Name: count, dtype: int64


In [92]:
# Final pass for nr_of_seats: group by model only.
df = fill(method="mode", df=df, column="nr_of_seats", group_cols="make_model")

Filling column: nr_of_seats
Grouping by: make_model
Number of NaN before filling: 6
Number of NaN filled: 6
Number of NaN after filling: 0
------------------
nr_of_seats
5.0    14308
4.0     1127
7.0      362
2.0      119
6.0        2
3.0        1
Name: count, dtype: int64


### cylinders

In [93]:
# First pass for cylinders: group by model and engine displacement.
df = fill(
    method="mode",
    df=df,
    column="cylinders",
    group_cols=["make_model", "displacement"],
)

Filling column: cylinders
Grouping by: ['make_model', 'displacement']
Number of NaN before filling: 5680
Number of NaN filled: 5609
Number of NaN after filling: 71
------------------
cylinders
4.0    12404
3.0     3414
NaN       71
5.0       22
6.0        3
8.0        2
2.0        2
1.0        1
Name: count, dtype: int64


In [94]:
# Second pass for cylinders: group by model and weight.
df = fill(
    method="mode",
    df=df,
    column="cylinders",
    group_cols=["make_model", "weight"],
)

Filling column: cylinders
Grouping by: ['make_model', 'weight']
Number of NaN before filling: 71
Number of NaN filled: 65
Number of NaN after filling: 6
------------------
cylinders
4.0    12460
3.0     3423
5.0       22
NaN        6
6.0        3
8.0        2
2.0        2
1.0        1
Name: count, dtype: int64


In [95]:
# Third pass for cylinders: group by model only.
df = fill(method="mode", df=df, column="cylinders", group_cols=["make_model"])

Filling column: cylinders
Grouping by: ['make_model']
Number of NaN before filling: 6
Number of NaN filled: 5
Number of NaN after filling: 1
------------------
cylinders
4.0    12465
3.0     3423
5.0       22
6.0        3
8.0        2
2.0        2
NaN        1
1.0        1
Name: count, dtype: int64


In [96]:
# Last resort for cylinders: no grouping, use the overall mode.
df = fill(method="mode", df=df, column="cylinders")

Filling column: cylinders
Grouping by: None
Number of NaN before filling: 1
Number of NaN filled: 1
Number of NaN after filling: 0
------------------
cylinders
4.0    12466
3.0     3423
5.0       22
6.0        3
8.0        2
2.0        2
1.0        1
Name: count, dtype: int64


### gears

In [97]:
# Fill gears with the mode within each gearing_type group.
df = fill(method="mode", df=df, column="gears", group_cols=["gearing_type"])

Filling column: gears
Grouping by: ['gearing_type']
Number of NaN before filling: 4712
Number of NaN filled: 4712
Number of NaN after filling: 0
------------------
gears
6.0     10509
5.0      3239
7.0      1933
8.0       224
9.0         6
1.0         2
3.0         2
4.0         2
2.0         1
50.0        1
Name: count, dtype: int64


---

## Export csv


In [98]:
# Review non-null counts and dtypes after the fills, before dropping columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15919 entries, 0 to 15918
Data columns (total 38 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   make_model            15919 non-null  object 
 1   body_type             15919 non-null  object 
 2   price                 15919 non-null  int64  
 3   vat                   15919 non-null  object 
 4   km                    15919 non-null  float64
 5   hp                    15919 non-null  float64
 6   type                  15919 non-null  object 
 7   previous_owners       15919 non-null  float64
 8   inspection_new        15919 non-null  object 
 9   warranty              15919 non-null  float64
 10  full_service          8215 non-null   object 
 11  non-smoking_vehicle   7177 non-null   object 
 12  null                  15919 non-null  object 
 13  offer_number          12744 non-null  object 
 14  first_registration    14322 non-null  object 
 15  body_color         

In [99]:
# Columns removed from the frame before export.
columns_to_drop = [
    'null',
    'full_service',
    'non-smoking_vehicle',
    'model_code',
    'offer_number',
    'first_registration',
]
df.drop(columns=columns_to_drop, inplace=True)

In [100]:
# Confirm the remaining columns and their non-null counts after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15919 entries, 0 to 15918
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   make_model            15919 non-null  object 
 1   body_type             15919 non-null  object 
 2   price                 15919 non-null  int64  
 3   vat                   15919 non-null  object 
 4   km                    15919 non-null  float64
 5   hp                    15919 non-null  float64
 6   type                  15919 non-null  object 
 7   previous_owners       15919 non-null  float64
 8   inspection_new        15919 non-null  object 
 9   warranty              15919 non-null  float64
 10  body_color            15919 non-null  object 
 11  paint_type            15919 non-null  object 
 12  nr_of_doors           15919 non-null  float64
 13  nr_of_seats           15919 non-null  float64
 14  gearing_type          15919 non-null  object 
 15  displacement       

In [101]:
# Write the filled dataset to disk without the index column.
df.to_csv(path_or_buf="car_data_2_missing_values_filled.csv", index=False)