In [1]:
import pandas as pd
import numpy as np
import sklearn as skl
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

print(f'Pandas version: {pd.__version__}')
print(f'Numpy version: {np.__version__}')
print(f'Scikit learn version: {skl.__version__}')

Pandas version: 2.3.2
Numpy version: 2.3.3
Scikit learn version: 1.7.2


In [None]:
df = pd.read_csv('data/Car Insurance.csv')
df

Unnamed: 0,Make,Age,Mileage,Fuel,Gearbox,Colour,Claimed
0,Toyota,2.0,27000.0,P,A,Red,Yes
1,Ford,4.0,30500.0,P,M,Black,Yes
2,Toyota,15.0,120000.0,D,A,White,No
3,Nissan,13.0,53000.0,D,M,White,No
4,Nissan,2.0,,D,M,Black,No
5,Ford,8.0,73000.0,,M,Green,No
6,Toyota,,138000.0,P,M,,Yes
7,Nissan,20.0,38000.0,D,,Green,Yes
8,Toyota,13.0,67000.0,D,A,Blue,No
9,Nissan,7.0,36000.0,P,M,Black,Yes


In [8]:
num_inputer = SimpleImputer(missing_values=np.nan, strategy='mean')
num_inputer = num_inputer.fit(df[['Age', 'Mileage']])
num_series = num_inputer.transform(df[['Age', 'Mileage']])
print('num_series\n', num_series)

cat_inputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='**')
cat_inputer = cat_inputer.fit(df[['Fuel', 'Gearbox', 'Colour']])
cat_series = cat_inputer.transform(df[['Fuel', 'Gearbox', 'Colour']])
print('cat_series\n', cat_series)

num_series
 [[2.00000000e+00 2.70000000e+04]
 [4.00000000e+00 3.05000000e+04]
 [1.50000000e+01 1.20000000e+05]
 [1.30000000e+01 5.30000000e+04]
 [2.00000000e+00 6.47222222e+04]
 [8.00000000e+00 7.30000000e+04]
 [9.33333333e+00 1.38000000e+05]
 [2.00000000e+01 3.80000000e+04]
 [1.30000000e+01 6.70000000e+04]
 [7.00000000e+00 3.60000000e+04]]
cat_series
 [['P' 'A' 'Red']
 ['P' 'M' 'Black']
 ['D' 'A' 'White']
 ['D' 'M' 'White']
 ['D' 'M' 'Black']
 ['**' 'M' 'Green']
 ['P' 'M' '**']
 ['D' '**' 'Green']
 ['D' 'A' 'Blue']
 ['P' 'M' 'Black']]


In [9]:
df[['Age', 'Mileage']] = num_inputer.transform(df[['Age', 'Mileage']])
df[['Fuel', 'Gearbox', 'Colour']] = cat_inputer.transform(df[['Fuel', 'Gearbox', 'Colour']])

df

Unnamed: 0,Make,Age,Mileage,Fuel,Gearbox,Colour,Claimed
0,Toyota,2.0,27000.0,P,A,Red,Yes
1,Ford,4.0,30500.0,P,M,Black,Yes
2,Toyota,15.0,120000.0,D,A,White,No
3,Nissan,13.0,53000.0,D,M,White,No
4,Nissan,2.0,64722.222222,D,M,Black,No
5,Ford,8.0,73000.0,**,M,Green,No
6,Toyota,9.333333,138000.0,P,M,**,Yes
7,Nissan,20.0,38000.0,D,**,Green,Yes
8,Toyota,13.0,67000.0,D,A,Blue,No
9,Nissan,7.0,36000.0,P,M,Black,Yes


In [10]:
ord_encoder = preprocessing.OrdinalEncoder()
ords = ord_encoder.fit_transform(df[['Make', 'Fuel', 'Gearbox', 'Colour', 'Claimed']])
print('ords: \n', ords)

hot_encoder = preprocessing.OneHotEncoder(drop='first')
one_hots = hot_encoder.fit_transform(df[['Make', 'Fuel', 'Gearbox', 'Colour', 'Claimed']]).toarray()
print('hot encoder categories: \n', hot_encoder.categories_)

ords: 
 [[2. 2. 1. 4. 1.]
 [0. 2. 2. 1. 1.]
 [2. 1. 1. 5. 0.]
 [1. 1. 2. 5. 0.]
 [1. 1. 2. 1. 0.]
 [0. 0. 2. 3. 0.]
 [2. 2. 2. 0. 1.]
 [1. 1. 0. 3. 1.]
 [2. 1. 1. 2. 0.]
 [1. 2. 2. 1. 1.]]
hot encoder categories: 
 [array(['Ford', 'Nissan', 'Toyota'], dtype=object), array(['**', 'D', 'P'], dtype=object), array(['**', 'A', 'M'], dtype=object), array(['**', 'Black', 'Blue', 'Green', 'Red', 'White'], dtype=object), array(['No', 'Yes'], dtype=object)]


In [11]:
df['Claimed'] = ord_encoder.fit_transform(df['Claimed'].values.reshape(-1, 1))
cols = []
for i in hot_encoder.categories_:
    print('before: ', i)
    i = np.delete(i, 0)
    print('after: ', i)
    cols.extend(i)

df = df.join(pd.DataFrame(one_hots, columns=cols))
print('df before\n', df)
df = df.drop(['Make', 'Fuel', 'Gearbox', 'Colour'], axis=1)
print('df after\n', df)
df

before:  ['Ford' 'Nissan' 'Toyota']
after:  ['Nissan' 'Toyota']
before:  ['**' 'D' 'P']
after:  ['D' 'P']
before:  ['**' 'A' 'M']
after:  ['A' 'M']
before:  ['**' 'Black' 'Blue' 'Green' 'Red' 'White']
after:  ['Black' 'Blue' 'Green' 'Red' 'White']
before:  ['No' 'Yes']
after:  ['Yes']
df before
      Make        Age        Mileage Fuel Gearbox Colour  Claimed  Nissan  \
0  Toyota   2.000000   27000.000000    P       A    Red      1.0     0.0   
1    Ford   4.000000   30500.000000    P       M  Black      1.0     0.0   
2  Toyota  15.000000  120000.000000    D       A  White      0.0     0.0   
3  Nissan  13.000000   53000.000000    D       M  White      0.0     1.0   
4  Nissan   2.000000   64722.222222    D       M  Black      0.0     1.0   
5    Ford   8.000000   73000.000000   **       M  Green      0.0     0.0   
6  Toyota   9.333333  138000.000000    P       M     **      1.0     0.0   
7  Nissan  20.000000   38000.000000    D      **  Green      1.0     1.0   
8  Toyota  13.00000

Unnamed: 0,Age,Mileage,Claimed,Nissan,Toyota,D,P,A,M,Black,Blue,Green,Red,White,Yes
0,2.0,27000.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,4.0,30500.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,15.0,120000.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,13.0,53000.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,2.0,64722.222222,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
5,8.0,73000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
6,9.333333,138000.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
7,20.0,38000.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8,13.0,67000.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,7.0,36000.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0


In [12]:
X = df.iloc[:,:-1].values
print('X \n', X)
Y = df.iloc[:, -1].values
print('Y \n', Y)

X 
 [[2.00000000e+00 2.70000000e+04 1.00000000e+00 0.00000000e+00
  1.00000000e+00 0.00000000e+00 1.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  1.00000000e+00 0.00000000e+00]
 [4.00000000e+00 3.05000000e+04 1.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
  1.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.50000000e+01 1.20000000e+05 0.00000000e+00 0.00000000e+00
  1.00000000e+00 1.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 1.00000000e+00]
 [1.30000000e+01 5.30000000e+04 0.00000000e+00 1.00000000e+00
  0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 1.00000000e+00]
 [2.00000000e+00 6.47222222e+04 0.00000000e+00 1.00000000e+00
  0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e

In [17]:
scaler = preprocessing.StandardScaler().fit(X)
print('scaler: \n', scaler)

scaler: 
 StandardScaler()


In [18]:
scaler.mean_

array([9.33333333e+00, 6.47222222e+04, 5.00000000e-01, 4.00000000e-01,
       4.00000000e-01, 5.00000000e-01, 4.00000000e-01, 3.00000000e-01,
       6.00000000e-01, 3.00000000e-01, 1.00000000e-01, 2.00000000e-01,
       1.00000000e-01, 2.00000000e-01])

In [19]:
scaler.scale_

array([5.62138773e+00, 3.57345149e+04, 5.00000000e-01, 4.89897949e-01,
       4.89897949e-01, 5.00000000e-01, 4.89897949e-01, 4.58257569e-01,
       4.89897949e-01, 4.58257569e-01, 3.00000000e-01, 4.00000000e-01,
       3.00000000e-01, 4.00000000e-01])

In [20]:
X_scaled = scaler.transform(X)
print('X_scaled \n', X_scaled)

X_scaled 
 [[-1.30454146e+00 -1.05562430e+00  1.00000000e+00 -8.16496581e-01
   1.22474487e+00 -1.00000000e+00  1.22474487e+00  1.52752523e+00
  -1.22474487e+00 -6.54653671e-01 -3.33333333e-01 -5.00000000e-01
   3.00000000e+00 -5.00000000e-01]
 [-9.48757423e-01 -9.57679776e-01  1.00000000e+00 -8.16496581e-01
  -8.16496581e-01 -1.00000000e+00  1.22474487e+00 -6.54653671e-01
   8.16496581e-01  1.52752523e+00 -3.33333333e-01 -5.00000000e-01
  -3.33333333e-01 -5.00000000e-01]
 [ 1.00805476e+00  1.54690159e+00 -1.00000000e+00 -8.16496581e-01
   1.22474487e+00  1.00000000e+00 -8.16496581e-01  1.52752523e+00
  -1.22474487e+00 -6.54653671e-01 -3.33333333e-01 -5.00000000e-01
  -3.33333333e-01  2.00000000e+00]
 [ 6.52270728e-01 -3.28036417e-01 -1.00000000e+00  1.22474487e+00
  -8.16496581e-01  1.00000000e+00 -8.16496581e-01 -6.54653671e-01
   8.16496581e-01 -6.54653671e-01 -3.33333333e-01 -5.00000000e-01
  -3.33333333e-01  2.00000000e+00]
 [-1.30454146e+00 -2.03611484e-16 -1.00000000e+00  1.2247

In [21]:
min_max_scaler = preprocessing.MinMaxScaler()
X_minmax = min_max_scaler.fit_transform(X)
X_minmax

array([[0.        , 0.        , 1.        , 0.        , 1.        ,
        0.        , 1.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 0.        ],
       [0.11111111, 0.03153153, 1.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.72222222, 0.83783784, 0.        , 0.        , 1.        ,
        1.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.        ],
       [0.61111111, 0.23423423, 0.        , 1.        , 0.        ,
        1.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 1.        ],
       [0.        , 0.33983984, 0.        , 1.        , 0.        ,
        1.        , 0.        , 0.        , 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.33333333, 0.41441441, 0. 