In [1]:
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'svg')
%config InlineBackend.figure_format = 'retina'
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model, metrics
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

Открываем файл и собираем его в таком виде:\begin{equation*}
X = \begin{bmatrix}
     x^{(1)}_1 & x^{(1)}_2 & ... & x^{(1)}_M \\
     ... \\
     x^{(N)}_1 & x^{(N)}_2 & ... & x^{(N)}_M \\
    \end{bmatrix}, 
    \vec{y} =  \begin{bmatrix} y_1 \\ ... \\ y_N \end{bmatrix}
\end{equation*}
, где $M$ - количество признаков (атрибутов), используемых моделью, а $N$ - количество экземпляров данных.

In [2]:
# Read the raw CSV into a list of rows, each row being a list of string cells.
# The context manager closes the file handle as soon as all rows are read
# (the original left the handle open for the lifetime of the kernel).
with open("california_housing.csv") as file:
    lines = [line.rstrip().split(',') for line in file]

# Container for the assembled dataset; 'data' and 'target' are filled in later cells.
dataset = {}
# Column names of the final feature matrix: eight numeric columns followed by
# five indicator columns for the ocean-proximity categories.
dataset['feature_names'] = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "ocean_proximity_is_1h_ocean",
    "ocean_proximity_is_inland",
    "ocean_proximity_is_island",
    "ocean_proximity_is_near_bay",
    "ocean_proximity_is_near_ocean"]




Убираем названия признаков и отделяем признак близости к океану, обработаем его отдельно

In [3]:
# Drop the header row so that only data records remain.
lines = lines[1:]
# Keep every column except the last one, which is processed separately below.
data = [record[:-1] for record in lines]

Парсим, а отсутствующие данные заменяем на -1

In [4]:
def float_or_null(x):
    """Convert ``x`` to ``float``, falling back to ``-1`` for unparseable values.

    Args:
        x: Any value accepted by ``float()`` (typically a CSV cell string).

    Returns:
        ``float(x)`` when the conversion succeeds, otherwise the sentinel ``-1``
        (used by this notebook to mark missing data).
    """
    try:
        return float(x)
    # Only conversion failures are treated as "missing"; a bare ``except`` would
    # also swallow KeyboardInterrupt / SystemExit and make the cell uninterruptible.
    except (TypeError, ValueError, OverflowError):
        return -1

In [5]:
# Convert every cell of every row to a number; cells that cannot be parsed become -1.
data = [list(map(float_or_null, record)) for record in data]

In [6]:
# Last column of every record: the textual ocean-proximity label.
data_proximity = [record[-1] for record in lines]
# Distinct labels in sorted order; this order defines the indicator columns.
proximity_values = sorted(set(data_proximity))

Приводим признак близости к океану к виду:  \begin{equation*} [*,*,*,*,*] , * \in \{0,1\} \end{equation*}

In [7]:
# One-hot encode each label: 1 in the position of its category, 0 elsewhere.
data_proximity = [
    [1 if entry == category else 0 for category in proximity_values]
    for entry in data_proximity
]

Отделяем медианную стоимость и делаем её целевой

In [8]:
# Position of the target column inside each numeric row.
target_index = 8
# Target values: the column at target_index.
target = [record[target_index] for record in data]
# Feature rows: everything before and after the target column.
clean_data = [record[:target_index] + record[target_index + 1:] for record in data]

In [9]:
# Feature matrix: numeric columns followed by the one-hot proximity columns, row by row.
dataset['data'] = np.array(
    [numeric + one_hot for numeric, one_hot in zip(clean_data, data_proximity)]
)
# Target vector aligned with the rows of dataset['data'].
dataset['target'] = np.array(target)

dataset без учёта географических координат (ng - no geo)

In [10]:
# Variant of the dataset without the first two columns (longitude, latitude).
dataset_ng = {
    'feature_names': np.array(dataset['feature_names'][2:]),
    'data': np.array([row[2:] for row in dataset['data']]),
    # The target is shared with the full dataset.
    'target': dataset['target'],
}

Разделяем на тестовые и тренировочные данные

In [11]:
# Fixed seed so that the split, and every score computed from it, is reproducible
# on Restart & Run All. Both datasets have the same number of rows, so using the
# same seed for both calls selects the same rows for train/test; the geo and
# no-geo models are then compared on identical samples rather than on two
# unrelated random subsets.
SPLIT_SEED = 0
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset['data'], dataset['target'], random_state=SPLIT_SEED)
X_train_ng, X_test_ng, Y_train_ng, Y_test_ng = train_test_split(
    dataset_ng['data'], dataset_ng['target'], random_state=SPLIT_SEED)

Попробуем натренировать LinearRegression, Ridge и Lasso.

In [12]:
# Ordinary least squares on the full feature set (fit() returns the estimator itself).
linreg = linear_model.LinearRegression().fit(X_train, Y_train)
lin_pred = linreg.predict(X_test)

# Same model on the dataset without geographic coordinates.
linreg_ng = linear_model.LinearRegression().fit(X_train_ng, Y_train_ng)
lin_pred_ng = linreg_ng.predict(X_test_ng)

# score() of a regressor is the R^2 coefficient of determination on the given data.
print("Score:", linreg.score(X_test, Y_test))
print("Score_ng:", linreg_ng.score(X_test_ng, Y_test_ng))

Score: 0.6501372341003854
Score_ng: 0.6356162136126167


In [13]:
# Ridge regression with default settings on the full feature set.
ridge = linear_model.Ridge().fit(X_train, Y_train)

# Same model without geographic coordinates. The fit call is kept as the last
# expression so the cell still displays the fitted estimator.
ridge_ng = linear_model.Ridge()
ridge_ng.fit(X_train_ng, Y_train_ng)


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [14]:
# Predictions of the ridge models. The original computed ridge_pred with
# linreg.predict, i.e. it stored the plain linear-regression predictions.
ridge_pred = ridge.predict(X_test)
ridge_pred_ng = ridge_ng.predict(X_test_ng)

# R^2 on the held-out data for both variants.
print("Score:", ridge.score(X_test,Y_test))
print("Score_ng:", ridge_ng.score(X_test_ng,Y_test_ng))

Score: 0.650192618942009
Score_ng: 0.635783126805601


In [15]:
# Lasso regression with a raised iteration cap, on the full feature set.
lasso = linear_model.Lasso(max_iter=10000).fit(X_train, Y_train)

# Same model without geographic coordinates. The fit call is kept as the last
# expression so the cell still displays the fitted estimator.
lasso_ng = linear_model.Lasso(max_iter=10000)
lasso_ng.fit(X_train_ng, Y_train_ng)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=10000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [16]:
# Predictions of both lasso models on their respective held-out sets.
lasso_pred_ng = lasso_ng.predict(X_test_ng)
lasso_pred = lasso.predict(X_test)

# R^2 on the held-out data for both variants.
print("Score:", lasso.score(X_test, Y_test))
print("Score_ng:", lasso_ng.score(X_test_ng, Y_test_ng))

Score: 0.6501489888452501
Score_ng: 0.6356407249745393


Как видим, коэффициент детерминации $R^2$ (значение `score`) ~0.65, что достаточно мало, попробуем RandomForestRegressor.

In [17]:
# Shared hyper-parameters for both forests; random_state pins the forest's own randomness.
forest_params = dict(n_estimators=15, random_state=0, max_features=10)

# Forest trained on the full feature set.
forest = RandomForestRegressor(**forest_params).fit(X_train, Y_train)

# Forest trained without geographic coordinates. The fit call is kept as the
# last expression so the cell still displays the fitted estimator.
forest_ng = RandomForestRegressor(**forest_params)
forest_ng.fit(X_train_ng, Y_train_ng)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=15, n_jobs=None, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [18]:
# R^2 of each forest on its own held-out set.
forest_score = forest.score(X_test, Y_test)
forest_score_ng = forest_ng.score(X_test_ng, Y_test_ng)
print("Score:", forest_score)
print("Score_ng:", forest_score_ng)

Score: 0.8223281299272607
Score_ng: 0.6981847640147326


В итоге $R^2$ у RandomForestRegressor ~0.82 с учётом геолокации и ~0.70 без неё, что всё равно лучше, чем у линейных моделей. Далее посмотрим, какой из признаков наиболее информативен.

In [19]:
# Importance of each feature as reported by the fitted forest.
importances = forest.feature_importances_
# Per-tree importances stacked row-wise; their spread across trees is kept in `std`.
per_tree_importances = np.array([estimator.feature_importances_ for estimator in forest.estimators_])
std = per_tree_importances.std(axis=0)
# Feature indices ordered from most to least important.
indices = np.argsort(importances)[::-1]

In [20]:
print("Feature ranking of model with geo:")

# Walk the features from most to least important, numbering them from 1.
n_features = dataset['data'].shape[1]
for rank, feature_idx in enumerate(indices[:n_features], start=1):
    print(rank, ") ", dataset['feature_names'][feature_idx], " - ", importances[feature_idx], sep='')

Feature ranking of model with geo:
1) median_income - 0.4532512715544687
2) ocean_proximity_is_inland - 0.16143180075933403
3) longitude - 0.10900069789308349
4) latitude - 0.10254529106183334
5) housing_median_age - 0.04603905080246197
6) population - 0.03620958079193048
7) total_rooms - 0.028059442469577048
8) households - 0.024110795555547668
9) total_bedrooms - 0.022256056411603723
10) ocean_proximity_is_1h_ocean - 0.008743082038463442
11) ocean_proximity_is_near_ocean - 0.007042988458202159
12) ocean_proximity_is_near_bay - 0.0010955680066558596
13) ocean_proximity_is_island - 0.0002143741968380106


Получается, что самым информативным оказался медианный доход