In [66]:
import numpy as np
import pandas as pd

### Dataset

In [67]:
dataset = pd.read_csv("employee_data.csv")

print(dataset.head())

   Unnamed: 0  id groups  age  healthy_eating  active_lifestyle  salary
0           0   0      A   36               5                 5    2297
1           1   1      A   55               3                 5    1134
2           2   2      A   61               8                 1    4969
3           3   3      O   29               3                 6     902
4           4   4      O   34               6                 2    3574


In [68]:
dataset = dataset.drop("Unnamed: 0", 1)
dataset = dataset.drop("id", 1)

print(dataset.head())

  groups  age  healthy_eating  active_lifestyle  salary
0      A   36               5                 5    2297
1      A   55               3                 5    1134
2      A   61               8                 1    4969
3      O   29               3                 6     902
4      O   34               6                 2    3574


In [69]:
dataset = dataset.to_numpy()
print(dataset)

[['A' 36 5 5 2297]
 ['A' 55 3 5 1134]
 ['A' 61 8 1 4969]
 ...
 ['O' 49 9 7 4158]
 ['AB' 56 6 7 2414]
 ['B' 64 4 9 788]]


In [70]:
x = dataset[:, :-1]
y = dataset[:, -1]

print(x[0], y[0])
print(x.shape, y.shape)

['A' 36 5 5] 2297
(1000, 4) (1000,)


### Model

In [71]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

In [72]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [73]:
regr = RandomForestRegressor(n_estimators=100)

In [74]:
categorical_columns = [0]
numerical_columns = [1, 2, 3]

numeric_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler())
    ]
)
categorical_transformer = Pipeline(
    steps=[
        ('onehot', OrdinalEncoder())
    ]
)

preprocessor_ordinal  = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    )
)

In [75]:
preprocessor_ordinal.fit(x_train)
x_trans = preprocessor_ordinal.transform(x_train)

print(x_train)
print(x_trans)

[['AB' 37 8 5]
 ['A' 42 5 6]
 ['O' 30 3 5]
 ...
 ['A' 33 3 6]
 ['A' 26 7 8]
 ['O' 54 6 5]]
[[-0.31331871  1.48039997 -0.29590067  1.        ]
 [ 0.05665952  0.00771407  0.19492982  0.        ]
 [-0.83128823 -0.97407653 -0.29590067  3.        ]
 ...
 [-0.60930129 -0.97407653  0.19492982  0.        ]
 [-1.12727081  0.98950467  1.17659081  0.        ]
 [ 0.94460728  0.49860937 -0.29590067  3.        ]]


In [76]:
pipe_ordinal = Pipeline(
    steps=[
        ('preprocessor_ordinal', preprocessor_ordinal),
        ('regressor', regr)
    ]
)

In [77]:
pipe_ordinal.fit(x_train, y_train)
score = pipe_ordinal.score(x_test, y_test)

print(f"Score: {score}")

Score: 0.997092780711884


In [78]:
import pickle

pickle.dump(regr, open("model.pkl", "wb"))
regr = pickle.load(open("model.pkl", "rb"))

sample = np.array([0, 40, 10, 10])
print(regr.predict([sample]))

[4267.59]
