In [3]:
pip list

Package                       Version
----------------------------- ---------------------
absl-py                       1.0.0
alabaster                     0.7.12
albumentations                0.1.12
altair                        4.2.0
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
arviz                         0.11.4
astor                         0.8.1
astropy                       4.3.1
astunparse                    1.6.3
atari-py                      0.2.9
atomicwrites                  1.4.0
attrs                         21.4.0
audioread                     2.1.9
autograd                      1.3
Babel                         2.9.1
backcall                      0.2.0
beautifulsoup4                4.6.3
bleach                        4.1.0
blis                          0.4.1
bokeh                         2.3.3
Bottleneck                    1.3.4
branca                        0.4.2
bs4                           0.0.1
CacheC

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [6]:
dataset = pd.read_csv('dataset.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [7]:
print(x)

[['M' 1972.0 8.0 1]
 ['M' 1981.0 14.0 2]
 ['F' 1977.0 6.0 2]
 ['F' 1961.0 8.0 6]
 ['M' nan 2.0 3]
 ['M' 1982.0 nan 4]
 ['M' 1981.0 11.0 2]
 ['M' nan 3.0 3]
 ['F' 1969.0 6.0 2]
 ['M' 1987.0 nan 1]
 ['F' 1959.0 12.0 5]]


In [8]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, 1:4])
x[:, 1:4] = imputer.transform(x[:, 1:4])

In [9]:
print(x)

[['M' 1972.0 8.0 1.0]
 ['M' 1981.0 14.0 2.0]
 ['F' 1977.0 6.0 2.0]
 ['F' 1961.0 8.0 6.0]
 ['M' 1974.3333333333333 2.0 3.0]
 ['M' 1982.0 7.777777777777778 4.0]
 ['M' 1981.0 11.0 2.0]
 ['M' 1974.3333333333333 3.0 3.0]
 ['F' 1969.0 6.0 2.0]
 ['M' 1987.0 7.777777777777778 1.0]
 ['F' 1959.0 12.0 5.0]]


In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

In [11]:
print(x)

[[0.0 1.0 1972.0 8.0 1.0]
 [0.0 1.0 1981.0 14.0 2.0]
 [1.0 0.0 1977.0 6.0 2.0]
 [1.0 0.0 1961.0 8.0 6.0]
 [0.0 1.0 1974.3333333333333 2.0 3.0]
 [0.0 1.0 1982.0 7.777777777777778 4.0]
 [0.0 1.0 1981.0 11.0 2.0]
 [0.0 1.0 1974.3333333333333 3.0 3.0]
 [1.0 0.0 1969.0 6.0 2.0]
 [0.0 1.0 1987.0 7.777777777777778 1.0]
 [1.0 0.0 1959.0 12.0 5.0]]


In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [13]:
print(y)

[0 0 1 0 0 1 1 1 0 1 0]


In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

In [15]:
print(x_train)

[[0.0 1.0 1987.0 7.777777777777778 1.0]
 [0.0 1.0 1981.0 14.0 2.0]
 [0.0 1.0 1981.0 11.0 2.0]
 [0.0 1.0 1972.0 8.0 1.0]
 [0.0 1.0 1974.3333333333333 3.0 3.0]
 [1.0 0.0 1959.0 12.0 5.0]
 [1.0 0.0 1969.0 6.0 2.0]
 [0.0 1.0 1982.0 7.777777777777778 4.0]]


In [16]:
print(x_test)

[[1.0 0.0 1977.0 6.0 2.0]
 [1.0 0.0 1961.0 8.0 6.0]
 [0.0 1.0 1974.3333333333333 2.0 3.0]]


In [17]:
print(y_train)

[1 0 1 0 1 0 0 1]


In [18]:
print(y_test)

[1 0 0]


In [19]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:, 4:] = sc.fit_transform(x_train[:, 4:])
x_test[:, 4:] = sc.transform(x_test[:,4:])

In [20]:
print(x_train)

[[0.0 1.0 1987.0 7.777777777777778 -1.1338934190276817]
 [0.0 1.0 1981.0 14.0 -0.3779644730092272]
 [0.0 1.0 1981.0 11.0 -0.3779644730092272]
 [0.0 1.0 1972.0 8.0 -1.1338934190276817]
 [0.0 1.0 1974.3333333333333 3.0 0.3779644730092272]
 [1.0 0.0 1959.0 12.0 1.889822365046136]
 [1.0 0.0 1969.0 6.0 -0.3779644730092272]
 [0.0 1.0 1982.0 7.777777777777778 1.1338934190276817]]


In [21]:
print(x_test)

[[1.0 0.0 1977.0 6.0 -0.3779644730092272]
 [1.0 0.0 1961.0 8.0 2.6457513110645903]
 [0.0 1.0 1974.3333333333333 2.0 0.3779644730092272]]
