<a href="https://colab.research.google.com/github/elyorbek8/ML_journey/blob/main/step7_training_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Machine** learning training process

In [40]:
import pandas as pd
import numpy as np
import sklearn

In [41]:
df = pd.read_csv('https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [43]:
# data set spliting
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size= 0.2, random_state= 42)

In [44]:
# data, labels, and numerical data
x_train = train_set.drop('median_house_value', axis= 1)
y_train = train_set['median_house_value'].copy()

x_test = test_set.drop('median_house_value', axis= 1)
y_test = test_set['median_house_value'].copy()

x_num = x_train.drop('ocean_proximity', axis= 1)

In [45]:
# creating a transformer, which adds extra attributes
from sklearn.base import BaseEstimator, TransformerMixin

# indices
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class AttributeAdder(BaseEstimator, TransformerMixin):
  def __init__(self, add_bedrooms_per_household = True):
    self.add_bedrooms_per_household = add_bedrooms_per_household

  def fit(self, X, y= None):
    return self

  def transform(self, X):
    rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
    population_per_household = X[:, population_ix] / X[:, households_ix]
    if self.add_bedrooms_per_household:
      bedrooms_per_household = X[:, bedrooms_ix] / X[:, households_ix]
      return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_household]
    else:
      return np.c_[X, rooms_per_household, population_per_household]

In [46]:
# creating a pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [47]:
# numeric pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy= 'median')),
    ('attr_adder', AttributeAdder()),
    ('std_scaler', StandardScaler())
])

num_pipeline.fit_transform(x_num)

array([[ 1.27258656, -1.3728112 ,  0.34849025, ..., -0.17491646,
         0.05137609, -0.20836543],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.40283542,
        -0.11736222, -0.12853018],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.08821601,
        -0.03227969, -0.25753771],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ..., -0.60675918,
         0.02030568, -0.03921583],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.40217517,
         0.00707608, -0.06626528],
       [-1.41489815,  0.99543676,  1.85617335, ..., -0.85144571,
        -0.08535429, -0.08750798]])

In [49]:
# nested pipeline
from sklearn.compose import ColumnTransformer

num_attr = list(x_num)
cat_attr = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attr),
    ('cat', OneHotEncoder(), cat_attr)
])

In [50]:
x_prepared = full_pipeline.fit_transform(x_train)

In [51]:
x_prepared[:2, :]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646,  0.05137609,
        -0.20836543,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542, -0.11736222,
        -0.12853018,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ]])

In [52]:
# training a model
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()

In [53]:
LR_model.fit(x_prepared, y_train)

In [54]:
# testing manually
x_test.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.8,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.725,NEAR OCEAN


In [55]:
y_test.head()

Unnamed: 0,median_house_value
20046,47700.0
3024,45800.0
15663,500001.0
20484,218600.0
9814,278000.0


In [56]:
x_test_prepared = full_pipeline.fit_transform(x_test)
x_test_prepared

array([[ 0.25541734,  0.22194113, -0.30073951, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.02976613, -0.20947715,  0.098724  , ...,  0.        ,
         0.        ,  0.        ],
       [-1.46454628,  1.03788441,  1.85636346, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.2689819 ,  0.80810728, -0.30073951, ...,  0.        ,
         0.        ,  0.        ],
       [-0.120668  ,  0.5548835 ,  0.57808022, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.57634349, -0.64089543, -0.93988113, ...,  0.        ,
         0.        ,  0.        ]])

In [58]:
predicted_labels = LR_model.predict(x_test_prepared)
predicted_labels

array([ 54691.46757953, 130233.08980728, 280993.4082424 , ...,
       450595.94517116, 122343.20085418, 187294.90340789])

In [59]:
# printing predicted and real values
pred_show = pd.DataFrame({'Predicted': predicted_labels.round(), 'Real': y_test})
pred_show.head()

Unnamed: 0,Predicted,Real
20046,54691.0,47700.0
3024,130233.0,45800.0
15663,280993.0,500001.0
20484,272440.0,218600.0
9814,264856.0,278000.0


# Evaluating the model by the metrics - **MAE** (Mean Absolute Error) and **RMSE** (Root Mean Square Error).

In [60]:
from sklearn.metrics import mean_absolute_error

mea = mean_absolute_error(y_true= y_test, y_pred= predicted_labels)
print('MAE = ', mea)

MAE =  51119.612648721195


In [61]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true= y_test, y_pred= predicted_labels)
print('RMSE = ', np.sqrt(mse))

RMSE =  70999.91233993605


# Developing a Random Forest model

In [62]:
from sklearn.ensemble import RandomForestRegressor

RF_model = RandomForestRegressor()
RF_model.fit(x_prepared, y_train)

In [65]:
predicted_labels = RF_model.predict(x_test_prepared)

In [66]:
# evaluation
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true= y_test, y_pred= predicted_labels)
print('RMSE = ', np.sqrt(mse))

RMSE =  78364.75639241589


In [67]:
mea = mean_absolute_error(y_true= y_test, y_pred= predicted_labels)
print('MAE = ', mea)

MAE =  55392.24813468992


# Cross Validation: LR and RF model

In [68]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [69]:
x = df.drop('median_house_value', axis= 1)
y = df['median_house_value'].copy()

x_prepared = full_pipeline.transform(x)

In [70]:
# LR_model cross validation
from sklearn.model_selection import cross_val_score

LR_mse_scores = cross_val_score(LR_model, x_prepared, y, scoring= 'neg_mean_squared_error', cv= 5)

In [71]:
def show_scores(scores):
  print('Score:', scores)
  print('Mean:', scores.mean())
  print('std:', scores.std())

In [72]:
show_scores(np.sqrt(-LR_mse_scores))

Score: [74099.47976417 75575.12285366 75674.92999548 77182.92383357
 66365.53985717]
Mean: 73779.59926080857
std: 3833.2597639866744


In [73]:
# RF_model cross validation
RF_mse_scores = cross_val_score(RF_model, x_prepared, y, scoring= 'neg_mean_squared_error', cv= 10)
RF_rmse_scores = np.sqrt(-RF_mse_scores)

show_scores(RF_rmse_scores)

Score: [96958.25086372 47403.65578902 65394.23191558 56104.84806279
 60792.51868345 59869.55691408 47954.68219557 78619.66922465
 73893.76324866 49402.62421071]
Mean: 63639.380110823724
std: 14947.130695176176


# Saving the models

# 1. Saving as a pickle file

In [77]:
import pickle

filename = 'RF_model.pkl'
with open(filename, 'wb') as file:
  pickle.dump(RF_model, file)

In [78]:
filename = 'LR_model.pkl'
with open(filename, 'wb') as file:
  pickle.dump(LR_model, file)

In [80]:
# opening the file
with open('LR_model.pkl', 'rb') as file:
  model = pickle.load(file)

In [81]:
# cross validation as a check
from sklearn.model_selection import cross_val_score

LR_mse_scores = cross_val_score(model, x_prepared, y, scoring= 'neg_mean_squared_error', cv= 5)

In [82]:
show_scores(np.sqrt(-LR_mse_scores))

Score: [74099.47976417 75575.12285366 75674.92999548 77182.92383357
 66365.53985717]
Mean: 73779.59926080857
std: 3833.2597639866744


# 2. Saving as a joblib file

In [83]:
import joblib

filename = 'RF_model.jbl'
joblib.dump(RF_model, filename)

['RF_model.jbl']

In [84]:
model = joblib.load(filename)

In [85]:
RF_mse_scores = cross_val_score(model, x_prepared, y, scoring= 'neg_mean_squared_error', cv= 5)
RF_rmse_scores = np.sqrt(-RF_mse_scores)

show_scores(RF_rmse_scores)

Score: [77130.08120166 64373.16353415 60845.46306636 79515.29648693
 62380.93312134]
Mean: 68848.98748208681
std: 7852.028926063619
