# Regression Evaluation metrics

1. R2 score (coefficient of determination)
2. MAE (mean absolute error)
3. MSE (mean squared error)

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset through scikit-learn.
housing = fetch_california_housing()

# One frame holding the feature columns plus the target column; the target
# is added under its dataset name and then renamed to the generic 'target'.
df = (
    pd.DataFrame(housing.data, columns=housing.feature_names)
    .assign(**{housing.target_names[0]: housing.target})
    .rename(columns={'MedHouseVal': 'target'})
)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Features are every column except 'target'; 'target' is what we predict.
x = df.drop(columns='target')
y = df['target']

# Seed NumPy's global RNG immediately before the stochastic steps: neither
# call below receives a random_state, so both rely on this global seed.
np.random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Random forest with default hyperparameters, fitted on the 80% training split.
model = RandomForestRegressor().fit(x_train, y_train)

## 1. R2 score

In [5]:
# Score of the fitted regressor on the data it was trained on.
model.score(X=x_train, y=y_train)

0.9736801960414609

In [6]:
# Score of the fitted regressor on the held-out test split.
model.score(X=x_test, y=y_test)

0.8065734772187598

In [11]:
from sklearn.metrics import r2_score

# Predict on the test split, then compute R2 explicitly from the
# true targets and those predictions.
y_pred = model.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8065734772187598

## 2. MAE

- Mean of the absolute (non-negative) differences between predicted and actual values.

- Gives an idea of how far off the model's predictions are, on average.

- Has the same unit as the target variable.

In [14]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the true test targets and the predictions.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

0.32659871732073664

In [22]:
# Rebuild the MAE by hand: one row per test sample with its absolute error.
mae = pd.DataFrame({'actual': y_test, 'predicted': y_pred})
mae['difference'] = (mae['actual'] - mae['predicted']).abs()
mae.head()

Unnamed: 0,actual,predicted,difference
20046,0.477,0.49384,0.01684
3024,0.458,0.75494,0.29694
15663,5.00001,4.928596,0.071414
20484,2.186,2.54316,0.35716
9814,2.78,2.33176,0.44824


In [23]:
# Averaging the per-row absolute errors gives the hand-computed MAE.
mae['difference'].mean()

0.32659871732073803

## 3. MSE

- Mean of the squared differences between actual and predicted values.
- Its unit is the square of the target variable's unit.

In [20]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the true test targets and the predictions.
mean_squared_error(y_true=y_test, y_pred=y_pred)

0.2534678520824551

In [24]:
# Rebuild the MSE by hand: one row per test sample with its squared error.
mse = pd.DataFrame({'actual': y_test, 'predicted': y_pred}).assign(
    **{'squared error': lambda frame: np.square(frame['actual'] - frame['predicted'])}
)
mse.head()

Unnamed: 0,actual,predicted,squared error
20046,0.477,0.49384,0.000284
3024,0.458,0.75494,0.088173
15663,5.00001,4.928596,0.0051
20484,2.186,2.54316,0.127563
9814,2.78,2.33176,0.200919


In [25]:
# Averaging the per-row squared errors gives the hand-computed MSE.
mse.loc[:, 'squared error'].mean()

0.25346785208245565

## Using the `scoring` parameter

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Load the heart-disease table; the path is relative to the notebook's
# working directory.
df = pd.read_csv('data/heart-disease.csv')

# Features are every column except 'target'; 'target' is the label.
x = df.drop('target', axis=1)
y = df.target

# Fix the forest's random_state: without it every cross_val_score call
# below trains differently-seeded forests, so the printed fold scores
# cannot be reproduced on a fresh Restart & Run All.
clf = RandomForestClassifier(random_state=42)

In [36]:
# Cross-validated accuracy over 5 folds.
# scoring=None falls back to the estimator's default scorer, which is
# accuracy for this classifier.
print(cross_val_score(estimator=clf, X=x, y=y, cv=5, scoring=None))

[0.83606557 0.90163934 0.78688525 0.81666667 0.73333333]


In [35]:
# Cross-validated precision over 5 folds.
print(cross_val_score(estimator=clf, X=x, y=y, cv=5, scoring='precision'))

[0.83333333 0.90322581 0.83333333 0.84375    0.73170732]


In [37]:
# Cross-validated recall over 5 folds.
print(cross_val_score(estimator=clf, X=x, y=y, cv=5, scoring='recall'))

[0.87878788 0.87878788 0.81818182 0.81818182 0.81818182]


In [42]:
### Scoring for regression

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor

# Reload the California housing data; `df`, `x` and `y` were overwritten
# by the heart-disease data in the classification section.
housing = fetch_california_housing()

# Feature columns plus the target column, renamed from 'MedHouseVal' to
# the generic 'target'. No intermediate display here: only a cell's last
# expression is rendered, so a mid-cell `df.head()` would do nothing.
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df[housing.target_names[0]] = housing.target
df = df.rename(columns={'MedHouseVal': 'target'})

x = df.drop('target', axis=1)
y = df.target

# Fix the forest's random_state so the cross-validation scores printed in
# the following cells are reproducible on re-run.
model = RandomForestRegressor(random_state=42)

In [44]:
# 3-fold cross-validation with the regressor's default scorer
# (scoring=None uses the estimator's own `score` method).
print(cross_val_score(estimator=model, X=x, y=y, cv=3, scoring=None))

[0.61767391 0.72400935 0.62408617]


In [46]:
# MAE via scoring='neg_mean_absolute_error'.
# A lower MAE means a better model, but scorers follow the convention
# "higher is better", so the error is reported with its sign flipped.
print(cross_val_score(estimator=model, X=x, y=y, cv=3, scoring='neg_mean_absolute_error'))

[-0.53833043 -0.40754851 -0.43595456 -0.46162923 -0.47215927]


In [47]:
# MSE via scoring='neg_mean_squared_error' (negated so that higher is better).
print(cross_val_score(estimator=model, X=x, y=y, cv=3, scoring='neg_mean_squared_error'))

[-0.517683   -0.3298377  -0.54144429]


In [48]:
# RMSE via scoring='neg_root_mean_squared_error' (negated so that higher is better).
print(cross_val_score(estimator=model, X=x, y=y, cv=3, scoring='neg_root_mean_squared_error'))

[-0.71636552 -0.57445457 -0.73011319]
