In [1]:
!pip install tensorflow scikeras scikit-learn

Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0


In [2]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasRegressor
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [15]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
# load dataset
dataset=pd.read_csv("https://raw.githubusercontent.com/enuguru/aiandml/master/machine_learning_algorithms_using_frameworks/python_files/regression/salary_regression/multiple_linear_regression/Salary%20Data.csv")
print(dataset.head())

print(type(dataset))
print(dataset.size)

    Age  Gender Education Level          Job Title  Years of Experience  \
0  32.0    Male      Bachelor's  Software Engineer                  5.0   
1  28.0  Female        Master's       Data Analyst                  3.0   
2  45.0    Male             PhD     Senior Manager                 15.0   
3  36.0  Female      Bachelor's    Sales Associate                  7.0   
4  52.0    Male        Master's           Director                 20.0   

     Salary  
0   90000.0  
1   65000.0  
2  150000.0  
3   60000.0  
4  200000.0  
<class 'pandas.core.frame.DataFrame'>
2250


In [16]:
pd.unique(dataset["Job Title"])
dataset=dataset.drop("Job Title",axis=1)
pd.unique(dataset["Education Level"])

new_dataset=pd.get_dummies(dataset,columns=["Gender","Education Level"],dtype=int)
print(new_dataset.head())

new_dataset.isnull().sum()
new_dataset=new_dataset.dropna()
new_dataset.isnull().sum()

    Age  Years of Experience    Salary  Gender_Female  Gender_Male  \
0  32.0                  5.0   90000.0              0            1   
1  28.0                  3.0   65000.0              1            0   
2  45.0                 15.0  150000.0              0            1   
3  36.0                  7.0   60000.0              1            0   
4  52.0                 20.0  200000.0              0            1   

   Education Level_Bachelor's  Education Level_Master's  Education Level_PhD  
0                           1                         0                    0  
1                           0                         1                    0  
2                           0                         0                    1  
3                           1                         0                    0  
4                           0                         1                    0  


Age                           0
Years of Experience           0
Salary                        0
Gender_Female                 0
Gender_Male                   0
Education Level_Bachelor's    0
Education Level_Master's      0
Education Level_PhD           0
dtype: int64

In [5]:
#Input features and output
y=new_dataset["Salary"]
X=new_dataset.drop("Salary",axis=1)
print(X.shape)
print(y.shape)

(373, 7)
(373,)


In [7]:
#Train and Test Split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.1)
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(335, 7)
(335, 1)
(38, 7)
(38, 1)


In [8]:
#Scaling input and output
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
print(X_train_scaled)
y_train = y_train.reshape(-1, 1)
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
print(y_train_scaled)
X_test_scaled = scaler_X.transform(X_test)
y_test = y_test.reshape(-1, 1)
y_test_scaled = scaler_y.transform(y_test)

[[ 0.78565268  1.21779517  1.02723645 ... -1.25575598 -0.59228916
   2.60589671]
 [-0.91328426 -0.91772898 -0.97348571 ...  0.79633306 -0.59228916
  -0.38374506]
 [-1.05486233 -1.22280386  1.02723645 ...  0.79633306 -0.59228916
  -0.38374506]
 ...
 [-0.06381579 -0.30757922 -0.97348571 ...  0.79633306 -0.59228916
  -0.38374506]
 [ 0.92723075  0.91272029  1.02723645 ... -1.25575598 -0.59228916
   2.60589671]
 [ 1.63512114  1.37033261  1.02723645 ... -1.25575598  1.68836451
  -0.38374506]]
[[ 0.8207572 ]
 [-1.03708664]
 [-1.24351373]
 [-0.00495117]
 [ 3.09145523]
 [ 1.44003848]
 [-1.24351373]
 [ 0.8207572 ]
 [ 0.40790302]
 [-0.727446  ]
 [-0.93387309]
 [-1.24351373]
 [ 1.64646558]
 [-0.21137826]
 [ 0.61433011]
 [-0.00495117]
 [ 0.20147592]
 [ 0.61433011]
 [ 1.44003848]
 [-0.83065954]
 [ 1.64646558]
 [-0.00495117]
 [-0.00495117]
 [ 1.0271843 ]
 [ 1.64646558]
 [-0.5210189 ]
 [-1.14030018]
 [-1.24351373]
 [-1.14030018]
 [ 0.20147592]
 [-1.24351373]
 [-1.44994082]
 [-0.5210189 ]
 [-0.21137826

In [9]:
#Baseline model
# define baseline model
def baseline_model():
  # create model
  model = Sequential()
  model.add(Dense(16, input_dim=7, activation='relu'))
  model.add(Dense(8, activation='relu'))
  model.add(Dense(4, activation='relu'))
  model.add(Dense(1, kernel_initializer='normal', activation='linear'))
  # Compile model
  model.compile(loss='mean_squared_error', optimizer='adam')
  return model

In [10]:
#Estimator
estimator = KerasRegressor(model=baseline_model, epochs=200, batch_size=5, verbose=0)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
cv_results = cross_validate(estimator, X_train_scaled, y_train_scaled, cv=kfold, return_estimator=True, scoring='neg_mean_squared_error')
print(cv_results['test_score'].mean())

-0.10263064348037274


In [11]:
#Validation
yhat = []
for i in range(len(cv_results['estimator'])):
  yhat.append(cv_results['estimator'][i].predict(X_test_scaled))

yhat_decoded = []
for estimates in yhat:
  #print(estimates)
  yhat_decoded.append(scaler_y.inverse_transform(estimates))

In [13]:
pred_final = []
for i in range(len(y_test)):
  sum = 0
  for j in range(0, 10):
    sum += yhat_decoded[j][i]
  pred_final.append(sum/10)

for index in range(len(y_test)):
  print(y_test[index], pred_final[index])

[180000.] [177976.73]
[65000.] [98376.19]
[125000.] [129249.3]
[80000.] [90105.805]
[140000.] [153553.78]
[160000.] [154801.86]
[160000.] [134692.25]
[120000.] [109809.61]
[50000.] [49902.5]
[95000.] [95987.08]
[140000.] [139731.27]
[160000.] [165116.42]
[35000.] [42040.664]
[95000.] [98473.49]
[110000.] [131376.72]
[50000.] [49902.5]
[90000.] [65719.28]
[50000.] [50937.914]
[90000.] [83627.36]
[95000.] [104319.81]
[110000.] [115126.625]
[105000.] [110952.14]
[190000.] [158390.2]
[70000.] [59484.395]
[100000.] [90105.805]
[50000.] [50401.008]
[160000.] [165120.92]
[40000.] [40478.945]
[120000.] [105094.46]
[170000.] [157651.47]
[45000.] [45291.133]
[100000.] [115572.586]
[60000.] [48829.65]
[150000.] [150310.72]
[110000.] [97493.28]
[35000.] [35377.99]
[180000.] [175030.89]
[50000.] [41602.965]


In [14]:
#RMS SCORE
from sklearn.metrics import r2_score
print(r2_score(y_test, pred_final))

0.927387944903478
