In [1]:
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

# Report the versions of the core libraries so the outputs below can be reproduced.
for label, module in (('Pandas', pd), ('Numpy', np), ('Scikit learn', skl)):
    print(f'{label} version: {module.__version__}')

Pandas version: 2.3.2
Numpy version: 2.3.3
Scikit learn version: 1.7.2


In [2]:
# Load the raw records; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Car Insurance.csv')
# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,Make,Age,Mileage,Fuel,Gearbox,Colour,Claimed
0,Toyota,2.0,27000.0,P,A,Red,Yes
1,Ford,4.0,30500.0,P,M,Black,Yes
2,Toyota,15.0,120000.0,D,A,White,No
3,Nissan,13.0,53000.0,D,M,White,No
4,Nissan,2.0,,D,M,Black,No
5,Ford,8.0,73000.0,,M,Green,No
6,Toyota,,138000.0,P,M,,Yes
7,Nissan,20.0,38000.0,D,,Green,Yes
8,Toyota,13.0,67000.0,D,A,Blue,No
9,Nissan,7.0,36000.0,P,M,Black,Yes


In [7]:
# Numeric columns: every NaN is replaced by the mean of its own column.
numeric_cols = ['Age', 'Mileage']
num_inputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns the column means and returns the filled array in one call;
# `num_inputer` remains the fitted imputer afterwards.
num_series = num_inputer.fit_transform(df[numeric_cols])
print('num_series\n', num_series)

# Text columns: every NaN is replaced by the constant placeholder '**'.
categorical_cols = ['Fuel', 'Gearbox', 'Colour']
cat_inputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='**')
cat_series = cat_inputer.fit_transform(df[categorical_cols])
print('cat_series\n', cat_series)

num_series
 [[2.00000000e+00 2.70000000e+04]
 [4.00000000e+00 3.05000000e+04]
 [1.50000000e+01 1.20000000e+05]
 [1.30000000e+01 5.30000000e+04]
 [2.00000000e+00 6.47222222e+04]
 [8.00000000e+00 7.30000000e+04]
 [9.33333333e+00 1.38000000e+05]
 [2.00000000e+01 3.80000000e+04]
 [1.30000000e+01 6.70000000e+04]
 [7.00000000e+00 3.60000000e+04]]
cat_series
 [['P' 'A' 'Red']
 ['P' 'M' 'Black']
 ['D' 'A' 'White']
 ['D' 'M' 'White']
 ['D' 'M' 'Black']
 ['**' 'M' 'Green']
 ['P' 'M' '**']
 ['D' '**' 'Green']
 ['D' 'A' 'Blue']
 ['P' 'M' 'Black']]


[['P' 'A' 'Red']
 ['P' 'M' 'Black']
 ['D' 'A' 'White']
 ['D' 'M' 'White']
 ['D' 'M' 'Black']
 ['**' 'M' 'Green']
 ['P' 'M' '**']
 ['D' '**' 'Green']
 ['D' 'A' 'Blue']
 ['P' 'M' 'Black']]


In [31]:
# Write the imputed values back into `df`, one already-fitted imputer per column group.
for imputer, columns in (
    (num_inputer, ['Age', 'Mileage']),
    (cat_inputer, ['Fuel', 'Gearbox', 'Colour']),
):
    df[columns] = imputer.transform(df[columns])

# Show the frame with the missing values filled in.
df

Unnamed: 0,Make,Age,Mileage,Fuel,Gearbox,Colour,Claimed
0,Toyota,2.0,27000.0,P,A,Red,Yes
1,Ford,4.0,30500.0,P,M,Black,Yes
2,Toyota,15.0,120000.0,D,A,White,No
3,Nissan,13.0,53000.0,D,M,White,No
4,Nissan,2.0,64722.222222,D,M,Black,No
5,Ford,8.0,73000.0,**,M,Green,No
6,Toyota,9.333333,138000.0,P,M,**,Yes
7,Nissan,20.0,38000.0,D,**,Green,Yes
8,Toyota,13.0,67000.0,D,A,Blue,No
9,Nissan,7.0,36000.0,P,M,Black,Yes


In [32]:
# The text-valued columns to encode.
encode_cols = ['Make', 'Fuel', 'Gearbox', 'Colour', 'Claimed']
categorical_frame = df[encode_cols]

# The transformed values are not kept; only the fitted encoder is retained.
ord_enc = preprocessing.OrdinalEncoder()
ord_enc.fit(categorical_frame)

# One-hot encode, dropping the first category of every column, then convert the
# sparse result to a dense array.
hot_enc = preprocessing.OneHotEncoder(drop='first')
hot_enc.fit(categorical_frame)
one_hots = hot_enc.transform(categorical_frame).toarray()
print(hot_enc.categories_)

[array(['Ford', 'Nissan', 'Toyota'], dtype=object), array(['**', 'D', 'P'], dtype=object), array(['**', 'A', 'M'], dtype=object), array(['**', 'Black', 'Blue', 'Green', 'Red', 'White'], dtype=object), array(['No', 'Yes'], dtype=object)]


In [33]:
# Build the labels for the one-hot columns from the encoder that actually produced
# `one_hots`, and remove exactly the category that encoder dropped. This keeps the
# labels aligned with the matrix columns even if the encoder's `drop` setting changes.
dropped_idx = hot_enc.drop_idx_
if dropped_idx is None:
    # `drop_idx_` is None when the encoder retained every category.
    dropped_idx = [None] * len(hot_enc.categories_)

cols = []
for categories, dropped in zip(hot_enc.categories_, dropped_idx):
    print('before: ', categories)
    kept = categories if dropped is None else np.delete(categories, int(dropped))
    print('after: ', kept)
    cols.extend(kept)

# Give the indicator frame df's own index so the join pairs rows by position
# whatever index df carries.
df = df.join(pd.DataFrame(one_hots, columns=cols, index=df.index))
# NOTE(review): this prints only the label; no frame is shown at this point.
print('df before')
# NOTE(review): 'Claimed' is not dropped, so it stays as text next to its encoded
# 'Yes' column — confirm both are wanted downstream.
# This cell rebinds `df`, so it is meant to run once after the encoding cell.
df = df.drop(['Make', 'Fuel', 'Gearbox', 'Colour'], axis=1)
df

before:  ['Ford' 'Nissan' 'Toyota']
after:  ['Nissan' 'Toyota']
before:  ['**' 'D' 'P']
after:  ['D' 'P']
before:  ['**' 'A' 'M']
after:  ['A' 'M']
before:  ['**' 'Black' 'Blue' 'Green' 'Red' 'White']
after:  ['Black' 'Blue' 'Green' 'Red' 'White']
before:  ['No' 'Yes']
after:  ['Yes']
df before


Unnamed: 0,Age,Mileage,Claimed,Nissan,Toyota,D,P,A,M,Black,Blue,Green,Red,White,Yes
0,2.0,27000.0,Yes,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,4.0,30500.0,Yes,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,15.0,120000.0,No,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,13.0,53000.0,No,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,2.0,64722.222222,No,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
5,8.0,73000.0,No,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
6,9.333333,138000.0,Yes,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
7,20.0,38000.0,Yes,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8,13.0,67000.0,No,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,7.0,36000.0,Yes,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
