In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil import parser
import opendatasets as od
from datetime import datetime, date, time
from matplotlib.image import imread
%matplotlib inline

###  **How to determine what model to use = Sample dataset from sklearn website**

### Importing California housing dataset from  https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html#sklearn.datasets.fetch_california_housing

In [2]:
# get california housing dataset
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [3]:
# turn it to a df
housing_df = pd.DataFrame(housing['data'], columns =housing['feature_names'])
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [4]:
housing_df['target'] = housing['target']
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [5]:
# checking NaN values in the data set
housing_df.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
target        0
dtype: int64

In [6]:
# import algorithm / estimator
from sklearn.linear_model import Ridge

#set random seed
np.random.seed(42)

# create the data X and y
X= housing_df.drop('target', axis = 1)
y = housing['target']  # the median house price in $100,000s

# Train model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.2)

# fit the model
model = Ridge()
model.fit(X_train, y_train)

# evaluate score
print ( f'train score = [{model.score(X_train, y_train)}], and test score = [ {model.score(X_test, y_test)}]')

train score = [0.6125511245209703], and test score = [ 0.5758549611440126]


# **what if Ridge didnt work or dint fit our needs?, we could try other models..**

# lets try ensemble regression model

In [7]:
# import algorithm / estimator
from sklearn.ensemble import RandomForestRegressor

# create the data X and y
X=  housing_df.drop('target', axis = 1)
y = housing['target']  # the median house price in $100,000s

# Train model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size = 0.2)

# fit the model (training the machine learning model)
model = RandomForestRegressor(n_estimators = 100, n_jobs = 1, random_state = 42)
model.fit(X_train, y_train)

# evaluate score
print(f'Train score = [{model.score(X_train, y_train)}] \nTest score =[{model.score(X_test, y_test)}]')

Train score = [0.9734970506814189] 
Test score =[0.8054365101722766]


In [8]:
# Make predictions
# 1. predict()    => returns single value for each prediction
y_preds = model.predict(X_test) # always predict on X_test
y_preds[:20]

array([0.66244  , 2.68722  , 5.00001  , 3.4963003, 2.9893002, 0.94271  ,
       1.26244  , 1.5465   , 3.4764602, 0.84571  , 2.46224  , 3.13458  ,
       1.12451  , 1.59883  , 1.61441  , 2.3809401, 2.78821  , 2.9550301,
       3.1108601, 2.46785  ])

In [9]:
np.array(y_test[:20])

array([0.696  , 3.356  , 5.00001, 3.606  , 2.766  , 0.835  , 1.22   ,
       1.693  , 3.153  , 0.789  , 2.182  , 3.24   , 1.292  , 1.313  ,
       1.316  , 2.148  , 2.896  , 3.476  , 5.00001, 2.725  ])

In [10]:
# Compare predictions to truth labels to evaluate the model
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_preds)

0.3222107616279071

## **Evaluating a Model with a Score** 

There are three ways of achieving this
1. The estimator's inbuilt score() method
2. The scoring parameter
3. Problem - specific metric functions

In [11]:
# import the prediction model
from sklearn.ensemble import RandomForestRegressor

# create seed
np.random.seed(42)

#create X and y
X = housing_df.drop('target', axis = 1)
y = housing_df['target']

#train and split model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# fit the model
model = RandomForestRegressor()
model.fit(X_test, y_test)



# Evaluation of the model
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error

# inbuilt score 
print(f'Training Score:{model.score(X_train, y_train)}') # inbuilt accuracy score
print(f'Test Score: {model.score(X_test, y_test)}')     # inbuilt accuracy score

print()
# cross validation score
from sklearn.model_selection import cross_val_score
print(f"Classifier Cross Validation_Score: {cross_val_score(model,X,y,cv= 5)}") # cross validation
print(f"Classifier Cross Validation_Score_Mean: {np.mean(cross_val_score(model,X,y,cv= 5))}") # cross validation

print()
# Mean Absolute error
from sklearn.metrics import mean_absolute_error
y_pred = model.predict(X_test)
mean_absolute_error = mean_absolute_error(y_test, y_pred)
print(f'mean_absolute_error : {mean_absolute_error}')

# Mean Squared Values
from sklearn.metrics import mean_squared_error
mean_squared_error = mean_squared_error(y_test, y_pred)
print(f'mean_squared_error : {mean_squared_error}')

Training Score:0.768178011521409
Test Score: 0.9652739672763454

Classifier Cross Validation_Score: [0.50097272 0.70273041 0.74115332 0.61556377 0.68253631]
Classifier Cross Validation_Score_Mean: 0.6481235867027331

mean_absolute_error : 0.1429742775193802
mean_squared_error : 0.04550530505976422
