https://machinelearningmastery.com/model-based-outlier-detection-and-removal-in-python/

In [10]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

In [11]:
import pandas as pd

pd.set_option( 'display.max_columns' , None ) 

In [12]:
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'

df = pd.read_csv( url , header = None )

In [13]:
df.head( 5 )

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [14]:
df.shape

(506, 14)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
0     506 non-null float64
1     506 non-null float64
2     506 non-null float64
3     506 non-null int64
4     506 non-null float64
5     506 non-null float64
6     506 non-null float64
7     506 non-null float64
8     506 non-null int64
9     506 non-null float64
10    506 non-null float64
11    506 non-null float64
12    506 non-null float64
13    506 non-null float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [16]:
data = df.values

X = data[ : , : -1 ]

X.shape

y = data[ : , -1 ]

y.shape

(506, 13)

(506,)

# Baseline Model Performance

In [43]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split( X , y , test_size = 0.33 , random_state = 999 )

In [41]:
from sklearn.linear_model import LinearRegression

In [42]:
model = LinearRegression()

In [66]:
model.fit( X_train , y_train )

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [68]:
yhat_test = model.predict( X_test )

In [69]:
from sklearn.metrics import mean_absolute_error

In [70]:
mae = mean_absolute_error( y_test, yhat_test )

In [72]:
print( f'MAE : {mae:.3f}' )

MAE : 3.782


# Isolation Forest

In [83]:
X_train, X_test, y_train, y_test = train_test_split( X , y , test_size = 0.33 , random_state = 999 )

In [84]:
from sklearn.ensemble import IsolationForest

In [85]:
isof = IsolationForest( contamination = 0.1 )

In [86]:
out = isof.fit_predict( X_train )

In [87]:
mask_out = out != -1

X_train =  X_train[ mask_out , : ]

y_train =  y_train[ mask_out ]

In [88]:
model.fit( X_train , y_train )

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [89]:
yhat_test = model.predict( X_test )

In [90]:
mae = mean_absolute_error( y_test, yhat_test )

In [91]:
print( f'MAE : {mae:.3f}' )

MAE : 3.742


# Local Outlier Factor

In [92]:
X_train, X_test, y_train, y_test = train_test_split( X , y , test_size = 0.33 , random_state = 999 )

In [93]:
from sklearn.neighbors import LocalOutlierFactor

In [94]:
lof = LocalOutlierFactor()

In [95]:
out = lof.fit_predict( X_train )

In [96]:
mask_out = out != -1

X_train =  X_train[ mask_out , : ]

y_train =  y_train[ mask_out ]

In [97]:
model.fit( X_train , y_train )

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [98]:
yhat_test = model.predict( X_test )

In [99]:
mae = mean_absolute_error( y_test, yhat_test )

In [100]:
print( f'MAE : {mae:.3f}' )

MAE : 3.723


# One-Class SVM

In [101]:
X_train, X_test, y_train, y_test = train_test_split( X , y , test_size = 0.33 , random_state = 999 )

In [102]:
from sklearn.svm import OneClassSVM

In [103]:
ocs = OneClassSVM( nu = 0.01 )

In [104]:
out = ocs.fit_predict( X_train )

In [105]:
mask_out = out != -1

X_train =  X_train[ mask_out , : ]

y_train =  y_train[ mask_out ]

In [106]:
model.fit( X_train , y_train )

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [107]:
yhat_test = model.predict( X_test )

In [108]:
mae = mean_absolute_error( y_test, yhat_test )

In [109]:
print( f'MAE : {mae:.3f}' )

MAE : 3.706
