#### Задание 1

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
# Load the Boston housing dataset that ships with scikit-learn and list the
# fields available on the returned container.
# NOTE(review): load_boston is reported as deprecated/removed in newer
# scikit-learn releases — confirm against the installed version before re-running.
boston = load_boston()
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [3]:
# Pull the pieces used below out of the dataset container in one step:
# the feature matrix, the column names for it, and the target values.
data, feature_names, target = (
    boston.data,
    boston.feature_names,
    boston.target,
)

In [4]:
# Wrap the raw feature matrix in a DataFrame with named columns so that
# features can be referred to by name in the cells below.
X = pd.DataFrame(data, columns=feature_names)

# Plain-text preview of the first rows.
print(X.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() only appends a stray "None" line.
X.info()

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  
0     15.3  396.90   4.98  
1     17.8  396.90   9.14  
2     17.8  392.83   4.03  
3     18.7  394.63   2.94  
4     18.7  396.90   5.33  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64


In [5]:
# Put the target values into a one-column DataFrame named 'Price'.
y = pd.DataFrame(target, columns=['Price'])

# Plain-text preview of the first rows.
print(y.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() only appends a stray "None" line.
y.info()

   Price
0   24.0
1   21.6
2   34.7
3   33.4
4   36.2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 1 columns):
Price    506 non-null float64
dtypes: float64(1)
memory usage: 4.0 KB
None


In [6]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Baseline model: ordinary linear regression fitted on the training split.
# The fit call is left as the last expression so the estimator is displayed.
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [7]:
# Predict prices for the held-out test split with the fitted linear regression.
y_predict = lr.predict(X_test)

In [8]:
# Line up the actual prices with the linear-regression predictions (the row
# labels come from the test split) and add the signed error: predicted - actual.
compare_ds = pd.DataFrame(
    {
        'y_test': y_test['Price'],
        'y_pred': y_predict.flatten(),
    },
    columns=['y_test', 'y_pred'],
)
compare_ds['error'] = compare_ds['y_pred'] - compare_ds['y_test']

In [9]:
# Preview the first rows of the actual-vs-predicted comparison table.
compare_ds.head()

Unnamed: 0,y_test,y_pred,error
173,23.6,28.655072,5.055072
274,32.4,36.50209,4.10209
491,13.6,15.418775,1.818775
72,22.8,25.412343,2.612343
452,16.1,18.845041,2.745041


In [10]:
# R^2 of the linear regression on the held-out split; kept in `r2` so it can be
# compared with the random forest score later on.
r2 = r2_score(
    y_true=compare_ds['y_test'],
    y_pred=compare_ds['y_pred'],
)
r2

0.7109203586326303

#### Задание 2

In [11]:
from sklearn.ensemble import RandomForestRegressor

In [12]:
# Random forest regressor: 1000 trees, each limited to depth 12, with a fixed
# seed so that repeated runs build the same forest.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=12,
    random_state=42,
)

In [13]:
# Fit the forest on the training split, handing over the target as a flat
# array taken from the single 'Price' column rather than a one-column frame.
model.fit(X_train, y_train['Price'].values)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [14]:
# Predict prices for the held-out test split with the fitted random forest.
# Note: this rebinds y_predict, replacing the linear-regression predictions
# that were stored under the same name earlier.
y_predict = model.predict(X_test)

In [15]:
# Rebuild the comparison table for the random forest: actual price, predicted
# price, and the signed error (predicted - actual), then preview the top rows.
compare_ds = pd.DataFrame(
    {'y_test': y_test['Price'], 'y_pred': y_predict.flatten()},
    columns=['y_test', 'y_pred'],
).assign(error=lambda frame: frame['y_pred'] - frame['y_test'])
compare_ds.head()

Unnamed: 0,y_test,y_pred,error
173,23.6,22.840025,-0.759975
274,32.4,31.153814,-1.246186
491,13.6,16.225629,2.625629
72,22.8,23.821036,1.021036
452,16.1,17.160048,1.060048


In [16]:
# R^2 of the random forest on the same held-out split.
r2_forest = r2_score(
    y_true=compare_ds['y_test'],
    y_pred=compare_ds['y_pred'],
)
r2_forest

0.8758073546581215

In [17]:
# Show both scores side by side: linear regression first, random forest second.
r2, r2_forest

(0.7109203586326303, 0.8758073546581215)

#### Задание 3

In [18]:
# Importance scores of the fitted random forest, one value per input feature
# (presumably in the same order as feature_names, which is how they are
# labelled in the cells below — confirm).
model.feature_importances_

array([0.03041796, 0.00153287, 0.00705227, 0.00117158, 0.01463026,
       0.40245231, 0.01427184, 0.06457307, 0.00515197, 0.01203962,
       0.01819018, 0.01256251, 0.41595357])

In [19]:
# Sanity check: add up the importance scores to see that they total about 1.
model.feature_importances_.sum()

0.9999999999999991

In [20]:
# Label every importance score with its feature name: a one-column table
# ('feature_value') indexed by feature_names.
imp_dict = dict(feature_value=model.feature_importances_)
imp_df = pd.DataFrame(
    data=imp_dict,
    index=feature_names,
    columns=['feature_value'],
)

In [21]:
# Rank the features from most to least important.
# The variable name (sic) is left unchanged because a later cell refers to it.
importantce_rank = imp_df.sort_values(by='feature_value', ascending=False)
importantce_rank

Unnamed: 0,feature_value
LSTAT,0.415954
RM,0.402452
DIS,0.064573
CRIM,0.030418
PTRATIO,0.01819
NOX,0.01463
AGE,0.014272
B,0.012563
TAX,0.01204
INDUS,0.007052


In [22]:
# Keep only the two top-ranked features.
most_important = importantce_rank.head(2)
most_important

Unnamed: 0,feature_value
LSTAT,0.415954
RM,0.402452


In [23]:
# Print the description bundled with the dataset to look up what each
# feature abbreviation stands for.
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

#### Двумя наиболее весомыми факторами являются:
    LSTAT - percent of the lower status of the population (доля населения с низким 
        социальным статусом, в процентах)
    RM - average number of rooms per dwelling (среднее кол-во комнат в жилище)