# Cleaning Car Data

## Columns:

**Categorical Columns**  
- [ ] make_model
- [ ] body_type
- [ ] vat
- [ ] registration_year
- [ ] previous_owners
- [ ] type
- [ ] next_inspection
- [ ] inspection_new
- [ ] body_color
- [ ] paint_type
- [ ] upholstery_color
- [ ] upholstery_material
- [ ] gearing_type
- [ ] fuel_type
- [ ] particulate
- [ ] co2_emission
- [ ] emission_class
- [ ] drive_chain
- [ ] consumption_country
- [ ] consumption_city
- [ ] consumption_combined
- [ ] entertainment_media
- [ ] safety_security
- [ ] comfort_convenience
- [ ] extras
 
**Quantitative Columns**
- [ ] price
- [ ] km
- [ ] hp
- [ ] displacement
- [ ] warranty
- [ ] weight
- [ ] nr_of_doors
- [ ] nr_of_seats
- [ ] cylinders
- [ ] gears


---

In [1008]:
import pandas as pd
import numpy as np

df = pd.read_csv("cleaned_car_data.csv")

In [1009]:
df.head(3).T

Unnamed: 0,0,1,2
make_model,Audi A1,Audi A1,Audi A1
body_type,Sedans,Sedans,Sedans
price,15770,14500,14640
vat,VAT deductible,Price negotiable,VAT deductible
km,56013.0,80000.0,83450.0
hp,66.0,141.0,85.0
type,Used,Used,Used
previous_owners,2.0,,1.0
next_inspection,2021.0,,
inspection_new,Yes,,


In [1010]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15919 entries, 0 to 15918
Data columns (total 42 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   make_model            15919 non-null  object 
 1   body_type             15859 non-null  object 
 2   price                 15919 non-null  int64  
 3   vat                   11406 non-null  object 
 4   km                    14895 non-null  float64
 5   hp                    15831 non-null  float64
 6   type                  15917 non-null  object 
 7   previous_owners       9254 non-null   float64
 8   next_inspection       2825 non-null   float64
 9   inspection_new        3570 non-null   object 
 10  warranty              4853 non-null   float64
 11  full_service          8215 non-null   object 
 12  non-smoking_vehicle   7177 non-null   object 
 13  null                  15919 non-null  object 
 14  offer_number          12744 non-null  object 
 15  first_registration 

---

## Fill Missing Values Function


### TODO: use `pd.qcut` on numerical columns to turn them into categorical bins.
### TODO: also try other methods.

In [1011]:
def _first_mode(values):
    """Return the first mode of *values*, or NaN when there is no non-null value."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


def fill(method, df, column, group_cols=None):
    """
    Fill NaN values in `df[column]` in place and print a before/after summary.

    Parameters
    ----------
    method : {'mean', 'median', 'mode'}
        Statistic used as the replacement value. For 'mode', the first value
        returned by `Series.mode()` is used; if there is no non-null value
        the replacement is NaN and the rows stay missing.
    df : pandas.DataFrame
        Frame to modify; `df[column]` is reassigned with the filled values.
    column : label
        Column whose missing values are filled.
    group_cols : label or list of labels, optional
        When given, the statistic is computed per group of these column(s)
        and each missing row receives the statistic of its own group.
        When None, the statistic of the whole column is used.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `method` is not one of 'mean', 'median' or 'mode'.
    """
    aggregators = {
        'mean': lambda values: values.mean(),
        'median': lambda values: values.median(),
        'mode': _first_mode,
    }
    if method not in aggregators:
        # Previously an unknown method fell through and silently filled nothing.
        raise ValueError(
            f"Unknown fill method {method!r}; expected one of {sorted(aggregators)}"
        )
    aggregate = aggregators[method]

    # Debug prints: which column is being filled, and grouping info.
    print('Filling column:', column)
    print('Grouping by:', group_cols)

    # 1. Count NaNs before filling
    nan_before = df[column].isnull().sum()

    # 2. Work out the replacement: one scalar for the whole column, or a
    #    row-aligned Series holding each row's group statistic.
    if group_cols is None:
        replacement = aggregate(df[column])
    else:
        replacement = df.groupby(group_cols)[column].transform(aggregate)

    # Assign the result back instead of calling fillna(inplace=True) on
    # df[column]: the in-place call acts on an intermediate object and is
    # not guaranteed to update df (it does not under pandas Copy-on-Write).
    df[column] = df[column].fillna(replacement)

    # 3. Count NaNs after filling
    nan_after = df[column].isnull().sum()
    nan_filled = nan_before - nan_after

    # 4. Print final stats
    print("Number of NaN before filling:", nan_before)
    print("Number of NaN filled:", nan_filled)
    print("Number of NaN after filling:", nan_after)
    print("------------------")
    print(df[column].value_counts(dropna=False))

---

## Provided Functions

In [1012]:
def fill_most_freq(df, group_col, col_name):
    """Fill NaNs in `col_name` with the most frequent value of each `group_col` group.

    The groups are visited one at a time. A group without any non-null
    value borrows the mode of the whole column as it stands at that point
    of the loop. Afterwards the remaining NaN count and the value
    distribution of the column are printed.
    """
    for key in df[group_col].unique():
        members = df[group_col] == key
        candidates = df.loc[members, col_name].mode()
        if len(candidates) == 0:
            # Nothing to learn from this group: use the column-wide mode.
            candidates = df[col_name].mode()
        df.loc[members, col_name] = df.loc[members, col_name].fillna(candidates[0])

    remaining = df[col_name].isnull().sum()
    print("Number of NaN : ", remaining)
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [1013]:
def fill_prop(df, group_col, col_name):
    """Fill NaNs in `col_name` by forward- then backward-filling within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; `df[col_name]` is reassigned with the filled values.
    group_col : label
        Column whose values define the groups.
    col_name : label
        Column whose missing values are filled.

    Missing values are first propagated inside each `group_col` group
    (forward fill, then backward fill, in row order). Whatever is still
    missing afterwards is forward/backward filled across the whole column.
    Finally the remaining NaN count and the value distribution are printed.
    """
    # Series.ffill()/bfill() replace fillna(method=...), which is deprecated
    # in pandas 2.1 and removed in pandas 3.0.
    within_group = df.groupby(group_col)[col_name].transform(
        lambda values: values.ffill().bfill()
    )
    # fillna() keeps existing values untouched, including rows whose group
    # key is NaN (transform yields NaN for those rows).
    df[col_name] = df[col_name].fillna(within_group)

    # Last resort: propagate across group boundaries.
    df[col_name] = df[col_name].ffill().bfill()

    print("Number of NaN : ", df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [1014]:
def _mode_or_nan(values):
    """Return the first mode of *values*, or NaN when every value is null."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


def _ffill_then_bfill(values):
    """Forward-fill *values*, then backward-fill whatever is still null."""
    return values.ffill().bfill()


def double_stage(df, group_col1, group_col2, col_name, method):
    """Fill NaNs in `col_name` using progressively coarser groupings.

    The same three stages are applied for every method:

    1. rows grouped by (`group_col1`, `group_col2`),
    2. rows grouped by `group_col1` only,
    3. the whole column.

    Each stage only touches values that are still missing after the
    previous one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; `df[col_name]` is reassigned with the filled values.
    group_col1, group_col2 : label
        Primary and secondary grouping columns.
    col_name : label
        Column whose missing values are filled.
    method : {'mode', 'mean', 'median', 'ffill'}
        'mode', 'mean' and 'median' fill with that statistic of the group;
        'ffill' forward-fills and then backward-fills within the group.

    Raises
    ------
    ValueError
        If `method` is not one of the four supported values.
    """
    group_fillers = {
        "mode": _mode_or_nan,
        "mean": "mean",
        "median": "median",
        "ffill": _ffill_then_bfill,
    }
    if method not in group_fillers:
        # Previously an unsupported method skipped every branch and only
        # printed the (unchanged) statistics.
        raise ValueError(
            f"Unknown method {method!r}; expected one of {sorted(group_fillers)}"
        )
    filler = group_fillers[method]

    # Stages 1 and 2. The result is assigned back rather than using
    # fillna(inplace=True) on df[col_name], which is not guaranteed to
    # update df (it does not under pandas Copy-on-Write).
    for keys in ([group_col1, group_col2], [group_col1]):
        per_group = df.groupby(keys)[col_name].transform(filler)
        df[col_name] = df[col_name].fillna(per_group)

    # Stage 3: fall back to the whole column.
    if method == "mode":
        overall = _mode_or_nan(df[col_name])
    elif method == "ffill":
        overall = _ffill_then_bfill(df[col_name])
    else:
        overall = getattr(df[col_name], method)()
    df[col_name] = df[col_name].fillna(overall)

    print("Number of NaN : ", df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

---

## Categorical Columns
- [x] make_model
- [x] body_type
- [x] vat
- [x] registration_year
- [x] previous_owners
- [ ] type
- [x] next_inspection
- [x] inspection_new
- [x] body_color
- [x] paint_type
- [ ] upholstery_color
- [ ] upholstery_material
- [x] gearing_type
- [ ] fuel_type
- [ ] particulate
- [ ] co2_emission
- [ ] emission_class
- [ ] drive_chain
- [ ] consumption_country
- [ ] consumption_city
- [ ] consumption_combined
- [ ] entertainment_media
- [ ] safety_security
- [ ] comfort_convenience
- [ ] extras

### make_model

In [1015]:
df['make_model'].value_counts(dropna=False)

make_model
Audi A3           3097
Audi A1           2614
Opel Insignia     2598
Opel Astra        2526
Opel Corsa        2219
Renault Clio      1839
Renault Espace     991
Renault Duster      34
Audi A2              1
Name: count, dtype: int64

### body_type

In [1016]:
df['body_type'].value_counts(dropna=False)

body_type
Sedans           7903
Station wagon    3553
Compact          3153
Van               783
Other             290
Transporter        88
NaN                60
Off-Road           56
Coupe              25
Convertible         8
Name: count, dtype: int64

In [1017]:
df['make_model'].value_counts(dropna=False)

make_model
Audi A3           3097
Audi A1           2614
Opel Insignia     2598
Opel Astra        2526
Opel Corsa        2219
Renault Clio      1839
Renault Espace     991
Renault Duster      34
Audi A2              1
Name: count, dtype: int64

#### Cars with the same make_model mostly share the same body_type, so missing values are filled with the per-model mode using fill()

In [1018]:
fill('mode', df, 'body_type', 'make_model')

Filling column: body_type
Grouping by: make_model
Number of NaN before filling: 60
Number of NaN filled: 60
Number of NaN after filling: 0
------------------
body_type
Sedans           7925
Station wagon    3563
Compact          3155
Van               809
Other             290
Transporter        88
Off-Road           56
Coupe              25
Convertible         8
Name: count, dtype: int64


### vat

In [1019]:
df.vat.value_counts(dropna=False)

vat
VAT deductible      10980
NaN                  4513
Price negotiable      426
Name: count, dtype: int64

In [1020]:
df['vat'] = df['vat'].fillna('VAT undeductible') 

* All null values in the vat column were assigned 'VAT undeductible'

### registration_year (converted to age)

In [1021]:
df.loc[((df['type'] == 'New') & df.registration_year.isnull()), 'registration_year']= 2019

 - #### New cars with a missing registration year are assigned 2019

In [1022]:
df.groupby('registration_year').km.mean()

registration_year
2016.0    77442.520958
2017.0    41754.940709
2018.0    18035.239072
2019.0     1653.107634
Name: km, dtype: float64

In [1023]:
df.loc[(df['registration_year'].isnull()) & (df.km <= 5000),'registration_year'] =2019

In [1024]:
df.loc[(df['registration_year'].isnull()) & (df.km > 50000),'registration_year'] =2016

In [1025]:
df.loc[(df['registration_year'].isnull()) & (df.km < 19000),'registration_year'] =2018

In [1026]:
# Rows above 50,000 km and below 19,000 km were assigned in the cells above;
# everything still missing up to 50,000 km is treated as 2017 so that the
# 40,000-50,000 km band is not left without a year.
df.loc[(df['registration_year'].isnull()) & (df.km <= 50000),'registration_year'] =2017

- #### otherwise filled based on mileage

In [1027]:
df[(df['make_model'] == 'Audi A3') & (df['body_type'] == 'Sedans')].groupby('registration_year').price.mean()

registration_year
2016.0    16702.052388
2017.0    19970.030631
2018.0    22544.715092
2019.0    24859.973913
Name: price, dtype: float64

In [1028]:
df[(df['make_model'] == 'Opel Insignia')].groupby('registration_year').price.mean()

registration_year
2016.0    13606.709507
2017.0    16995.140917
2018.0    21390.167750
2019.0    32103.197015
Name: price, dtype: float64

In [1029]:
df.loc[(df['registration_year'].isnull())]

Unnamed: 0,make_model,body_type,price,vat,km,hp,type,previous_owners,next_inspection,inspection_new,...,extras,safety_security,gears,registration_year,upholstery_material,upholstery_color,particulate,combined_consumption,city_consumption,country_consumption
5237,Audi A3,Sedans,25400,VAT undeductible,,85.0,,,,,...,['Alloy wheels'],"['ABS', 'Central door lock', 'Driver-side airb...",7.0,,Cloth,,unparticulate,3.9,4.1,3.7
5329,Audi A3,Sedans,24900,VAT undeductible,,85.0,Pre-registered,,,,...,['Alloy wheels'],"['ABS', 'Central door lock', 'Driver-side airb...",7.0,,Cloth,,unparticulate,3.9,4.1,3.7
12550,Opel Insignia,Sedans,33800,VAT undeductible,,100.0,Employee's car,,,,...,"['Alloy wheels', 'Sport package', 'Touch scree...","['ABS', 'Blind spot monitor', 'Central door lo...",6.0,,Part leather,Black,unparticulate,5.4,7.0,4.6
12882,Opel Insignia,Station wagon,31318,VAT deductible,,100.0,Used,,,,...,"['Alloy wheels', 'Sport seats']","['ABS', 'Central door lock', 'Driver-side airb...",6.0,,Full leather,Black,unparticulate,4.3,,


In [1030]:
df['registration_year']=df['registration_year'].fillna(2019)

In [1031]:
df['registration_year'] = 2019 - df['registration_year']

df = df.rename(columns = {'registration_year': 'age'})

In [1032]:
df['age'].value_counts(dropna=False)

age
1.0    4525
0.0    4434
3.0    3679
2.0    3281
Name: count, dtype: int64

- #### year column converted to age column

### previous owners

In [1033]:
index=df[df["km"]<10].index

In [1034]:
df.loc[index,"previous_owners"]=0.0

In [1035]:
df.loc[index,"previous_owners"]=0.0

In [1036]:
df.loc[(df["type"]=="New") & (df["previous_owners"].isnull()),"previous_owners"]=1.0

In [1037]:
df.loc[((df["age"]==0) | (df["km"]<5000)) & (df["previous_owners"].isnull()),"previous_owners"]=1.0

In [1038]:
mode_group_po = df.groupby(['age'])['previous_owners'].transform(lambda x: x.mode()[0] if list(x.mode()) != [] else np.nan)

In [1039]:
# Assign back instead of fillna(inplace=True) on a column selection, which
# is not guaranteed to update df (it does not under pandas Copy-on-Write).
df["previous_owners"] = df["previous_owners"].fillna(mode_group_po)

In [1040]:
df["previous_owners"].value_counts(dropna=False)

previous_owners
1.0    14287
0.0      835
2.0      778
3.0       17
4.0        2
Name: count, dtype: int64

### type

In [1041]:
df['type'].value_counts(dropna=False)

type
Used              11096
New                1650
Pre-registered     1364
Employee's car     1011
Demonstration       796
NaN                   2
Name: count, dtype: int64

#### Only 2 NaN so replaced with mode

In [1042]:
fill('mode', df, 'type')

Filling column: type
Grouping by: None
Number of NaN before filling: 2
Number of NaN filled: 2
Number of NaN after filling: 0
------------------
type
Used              11098
New                1650
Pre-registered     1364
Employee's car     1011
Demonstration       796
Name: count, dtype: int64


### next_inspection (dropped)

In [1043]:
df['next_inspection'].value_counts(dropna=False)

next_inspection
NaN       13094
2021.0     1401
2020.0      557
2022.0      483
2019.0      336
2018.0       26
2017.0        7
2023.0        5
2001.0        5
2016.0        3
2014.0        1
1921.0        1
Name: count, dtype: int64

In [1044]:
df.drop(["next_inspection"],axis=1,inplace=True)

### inspection_new

In [1045]:
# Assign back instead of fillna(inplace=True) on a column selection, which
# is not guaranteed to update df (it does not under pandas Copy-on-Write).
df['inspection_new'] = df['inspection_new'].fillna('No')

In [1046]:
df['inspection_new'].value_counts(dropna=False)

inspection_new
No     12349
Yes     3570
Name: count, dtype: int64

### body_color

In [1047]:
df['body_color'].value_counts(dropna=False)

body_color
Black     3745
Grey      3505
White     3406
Silver    1647
Blue      1431
Red        957
NaN        597
Brown      289
Green      154
Beige      108
Yellow      51
Violet      18
Bronze       6
Orange       3
Gold         2
Name: count, dtype: int64

In [1048]:
fill('mode', df, 'body_color', ['make_model', 'body_type'])

Filling column: body_color
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 597
Number of NaN filled: 594
Number of NaN after filling: 3
------------------
body_color
Black     3941
Grey      3836
White     3469
Silver    1647
Blue      1431
Red        957
Brown      289
Green      158
Beige      108
Yellow      51
Violet      18
Bronze       6
Orange       3
NaN          3
Gold         2
Name: count, dtype: int64


In [1049]:
fill('mode', df, 'body_color', ['make_model'])

Filling column: body_color
Grouping by: ['make_model']
Number of NaN before filling: 3
Number of NaN filled: 3
Number of NaN after filling: 0
------------------
body_color
Black     3941
Grey      3839
White     3469
Silver    1647
Blue      1431
Red        957
Brown      289
Green      158
Beige      108
Yellow      51
Violet      18
Bronze       6
Orange       3
Gold         2
Name: count, dtype: int64


### paint_type

In [1050]:
fill('mode', df, 'paint_type', ['make_model', 'body_type'])

Filling column: paint_type
Grouping by: ['make_model', 'body_type']
Number of NaN before filling: 5772
Number of NaN filled: 5761
Number of NaN after filling: 11
------------------
paint_type
Metallic       15512
Uni/basic        390
NaN               11
Perl effect        6
Name: count, dtype: int64


In [1051]:
fill('mode', df, 'paint_type', 'make_model')

Filling column: paint_type
Grouping by: make_model
Number of NaN before filling: 11
Number of NaN filled: 11
Number of NaN after filling: 0
------------------
paint_type
Metallic       15523
Uni/basic        390
Perl effect        6
Name: count, dtype: int64


### upholstery_color

In [1052]:
df.drop(['upholstery_color'], axis=1, inplace=True)

### upholstery_material

In [1053]:
df['upholstery_material'].value_counts(dropna=False)

upholstery_material
Cloth           8423
NaN             4503
Part leather    1499
Full leather    1009
Other            368
Velour            60
alcantara         57
Name: count, dtype: int64

### gearing_type

In [1054]:
df['gearing_type'].value_counts(dropna=False)

gearing_type
Manual            8153
Automatic         7297
Semi-automatic     469
Name: count, dtype: int64

### fuel

In [1055]:
df['fuel']

0          d
1          d
2          d
3          d
4          d
        ... 
15914    NaN
15915      d
15916      -
15917      -
15918      o
Name: fuel, Length: 15919, dtype: object

### co2_emission

In [1056]:
df['co2_emission'].value_counts(dropna=False)

co2_emission
NaN      2436
120.0     740
99.0      545
97.0      537
104.0     501
         ... 
51.0        1
165.0       1
331.0       1
80.0        1
193.0       1
Name: count, Length: 120, dtype: int64

- [ ] TODO: the error raised by the cell below goes away when converting to categorical

In [None]:
# NOTE(review): the frame printed earlier in this notebook lists
# 'combined_consumption', not 'combined_emissions' - confirm the column name.
fill("median", df, 'co2_emission', ['combined_consumption'])

### emission_class

In [None]:
df['emission_class'].value_counts(dropna=False)

### drive_chain

In [None]:
df['drive_chain'].value_counts(dropna=False)

### consumption l/100 km 

In [None]:
# NOTE(review): column name taken from the frame printed earlier in this
# notebook ('combined_consumption') - confirm.
df['combined_consumption'].value_counts(dropna=False).index

In [None]:
# NOTE(review): column name taken from the frame printed earlier in this
# notebook ('city_consumption') - confirm.
df['city_consumption'].value_counts(dropna=False).index

In [None]:
# NOTE(review): column name taken from the frame printed earlier in this
# notebook ('country_consumption') - confirm.
df['country_consumption'].value_counts(dropna=False).index

### country_version

In [None]:
df['country_version'].value_counts(dropna=False)

### entertainment_media

In [None]:
df['entertainment_media']

In [None]:
# regex=False makes the brackets literal on every pandas version; as a
# regular expression a lone '[' is an invalid pattern.
df['entertainment_media'] = df['entertainment_media'].astype('str').str.replace('[', '', regex=False).str.replace("]", '', regex=False)

#### Only the list brackets were stripped from this column; it will be transformed with the `get_dummies` function later

### safety_security

In [None]:
df['safety_security']

#### This column was not changed, as it will be transformed with the `get_dummies` function later

### comfort_convenience

In [None]:
df['comfort_convenience']

### extras

In [None]:
df['extras']

---

## Quantitative Columns
- [ ] price
- [ ] km
- [ ] hp
- [ ] displacement
- [ ] warranty
- [ ] weight
- [ ] nr_of_doors
- [ ] nr_of_seats
- [ ] cylinders
- [ ] gears

### price

In [None]:
df['price'].value_counts(dropna=False)

In [None]:
df['price'].isnull().values.any()

### km

In [None]:
df['km'].value_counts(dropna=False)

In [None]:
fill('median', df, 'km', ['make_model', 'body_type', 'age'])

### hp

In [None]:
df['hp'].isnull().sum()

In [None]:
fill('median', df, 'hp')

### displacement

In [None]:
df['displacement'].isnull().sum()

In [None]:
fill("median", df, 'displacement', 'weight')

### warranty

In [None]:
df['warranty'].isnull().sum()

In [None]:
fill('median', df, 'warranty')

### weight

In [None]:
df['weight'].isnull().sum()

In [None]:
fill("median", df, 'weight', 'displacement')

### nr_of_doors

In [None]:
df['nr_of_doors'].value_counts()

### nr_of_seats

In [None]:
df['nr_of_seats'].value_counts()

### cylinders

In [None]:
df['cylinders'].value_counts(dropna=False)

### gears

In [None]:
df['gears'].value_counts(dropna=False)

---

In [None]:
df.info()

In [None]:
# df.to_csv("", index=False)