In [143]:
from pandas import read_csv
from sklearn.model_selection import train_test_split

In [144]:
url = "datasets/housing.csv"
df = read_csv(url)

In [204]:
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [208]:
X.columns.to_list()

['CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NOX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT']

In [205]:
print(X.shape, y.shape)

(506, 13) (506,)


In [148]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [149]:
# summarize the shape of the train and test sets
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(339, 13) (167, 13) (339,) (167,)


In [150]:
# evaluate model on the raw dataset
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [151]:
# split into input and output elements
X, y = data[:, :-1], data[:, -1]

In [152]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [153]:
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [154]:
# evaluate the model
yhat = model.predict(X_test)

In [155]:
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

MAE: 3.417


In [156]:
import pickle
with open('models/baseline_model.pk','wb') as file:
    pickle.dump(model, file)

In [157]:
# evaluate model performance with outliers removed using isolation forest
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error

In [158]:
# split into input and output elements
X, y = data[:, :-1], data[:, -1]

In [159]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [160]:
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)

(339, 13) (339,)


In [161]:
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)

In [162]:
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]

In [163]:
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(305, 13) (305,)


In [164]:
# fit the model
model_iso = LinearRegression()
model_iso.fit(X_train, y_train)

LinearRegression()

In [165]:
# evaluate the model
yhat = model_iso.predict(X_test)

In [166]:
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

MAE: 3.207


In [167]:
import pickle
with open('models/isolation_model.pk','wb') as file:
    pickle.dump(iso, file)

In [168]:
import pickle
with open('models/iso_linear_model.pk','wb') as file:
    pickle.dump(model_iso, file)

In [169]:
# evaluate model performance with outliers removed using elliptical envelope
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import mean_absolute_error

In [170]:
# split into input and output elements
X, y = data[:, :-1], data[:, -1]

In [171]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [172]:
print(X_train.shape, y_train.shape)

(339, 13) (339,)


In [173]:
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)

In [174]:
mask = yhat != -1

In [175]:
X_train, y_train = X_train[mask, :], y_train[mask]

In [176]:
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(335, 13) (335,)


In [177]:
model_ee = LinearRegression()
model_ee.fit(X_train, y_train)

LinearRegression()

In [178]:
# evaluate the model
yhat = model_ee.predict(X_test)

In [179]:
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

MAE: 3.388


In [180]:
import pickle
with open('models/Elliptic.pk','wb') as file:
    pickle.dump(ee, file)

In [181]:
import pickle
with open('models/Elliptic_linear_model.pk','wb') as file:
    pickle.dump(model_ee, file)

In [182]:
# evaluate model performance with outliers removed using local outlier factor
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error

In [183]:
# identify outliers in the training dataset
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)

In [184]:
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]

In [185]:
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(302, 13) (302,)


In [186]:
# fit the model
model_lof = LinearRegression()
model_lof.fit(X_train, y_train)

LinearRegression()

In [187]:
# evaluate the model
yhat = model.predict(X_test)

In [188]:
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

MAE: 3.417


In [189]:
import pickle
with open('models/localOutliner.pk','wb') as file:
    pickle.dump(lof, file)

In [190]:
import pickle
with open('models/lof_linear_model.pk','wb') as file:
    pickle.dump(model_lof, file)

In [191]:
# evaluate model performance with outliers removed using one class SVM
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import OneClassSVM
from sklearn.metrics import mean_absolute_error

In [192]:
OneCSVM = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)

In [193]:
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]

In [194]:
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)

(298, 13) (298,)


In [195]:
# fit the model
model_SVM = LinearRegression()

In [196]:
model_SVM.fit(X_train, y_train)

LinearRegression()

In [197]:
# evaluate the model
yhat = model_SVM.predict(X_test)

In [198]:
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)

MAE: 3.207


In [199]:
import pickle
with open('models/OneclassSVM.pk','wb') as file:
    pickle.dump(OneCSVM, file)

In [200]:
import pickle
with open('models/SVM_linear_model.pk','wb') as file:
    pickle.dump(OneCSVM, file)

In [201]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677082,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [202]:
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0
