### Импорт
В этом проекте мы будем использовать стандартные библиотеки науки о данных и машинного обучения.

In [36]:
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 60)

# Matplotlib for visualization
import matplotlib.pyplot as plt
%matplotlib inline

# Set default font size
plt.rcParams['font.size'] = 24

from IPython.core.pylabtools import figsize

# Seaborn for visualization
import seaborn as sns
sns.set(font_scale = 2)

# Imputing missing values and scaling values
from sklearn.preprocessing import MinMaxScaler

from sklearn.impute import SimpleImputer


# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

### Считывание данных
Сначала давайте прочитаем отформатированные данные из предыдущей записной книжки.

In [37]:
train_features = pd.read_csv('data/features_train.csv')
test_features = pd.read_csv('data/features_test.csv')
target = pd.read_csv('data/target.csv')


# Display sizes of data
print('Training Feature Size: ', train_features.shape)
print('Testing Feature Size:  ', test_features.shape)
print('Training target Size:  ', target.shape)


Training Feature Size:  (1460, 234)
Testing Feature Size:   (1459, 234)
Training target Size:   (1460, 1)


In [38]:
train_features = train_features.drop(['SalePrice'], axis = 1)
test_features = test_features.drop(['SalePrice'], axis = 1)

In [39]:
test_features.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,BsmtHalfBath,BedroomAbvGr,KitchenAbvGr,Fireplaces,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,MSZoning_C (all),...,GarageFinish_RFn,GarageQual_Ex,GarageQual_Fa,GarageQual_Gd,GarageQual_Po,GarageCond_Fa,GarageCond_Gd,GarageCond_Po,PavedDrive_N,PavedDrive_P,PoolQC_Ex,PoolQC_Fa,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,MiscFeature_Othr,MiscFeature_Shed,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family
0,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,270.0,882.0,896,0,0,0.0,2,1,0,730.0,140,0,0,0,120,0,0,6,2010,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,406.0,1329.0,1329,0,0,0.0,3,1,0,312.0,393,36,0,0,0,0,12500,6,2010,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,137.0,928.0,928,701,0,0.0,3,1,1,482.0,212,34,0,0,0,0,0,3,2010,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,324.0,926.0,926,678,0,0.0,3,1,1,470.0,360,36,0,0,0,0,0,6,2010,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,1017.0,1280.0,1280,0,0,0.0,2,1,0,506.0,0,82,0,0,144,0,0,1,2010,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
target.tail()

Unnamed: 0,SalePrice
1455,175000
1456,210000
1457,266500
1458,142125
1459,147500


In [41]:
dataset_features =  pd.concat(objs=[train_features, test_features], axis=0).reset_index(drop=True)

In [42]:
# функция поиска недостающих данных
def find_missing_data(data):
    Total = data.isnull().sum().sort_values(ascending = False)
    Percentage = (data.isnull().sum()/data.isnull().count()).sort_values(ascending = False)
    
    return pd.concat([Total, Percentage] , axis = 1 , keys = ['Total' , 'Percent'])

In [43]:
find_missing_data(dataset_features).head(10)

Unnamed: 0,Total,Percent
LotFrontage,486,0.166495
MasVnrArea,23,0.007879
BsmtHalfBath,2,0.000685
BsmtUnfSF,1,0.000343
GarageArea,1,0.000343
BsmtFinSF1,1,0.000343
BsmtFinSF2,1,0.000343
TotalBsmtSF,1,0.000343
BsmtExposure_Mn,0,0.0
BsmtFinType1_GLQ,0,0.0


## Замена Пропущенных Значений

In [44]:
# Create an imputer object with a median filling strategy
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Train on the training features
imputer.fit(train_features)

# Transform both training data and testing data
X = imputer.transform(train_features)
X_test = imputer.transform(test_features)

In [45]:
print('Missing values in training features: ', np.sum(np.isnan(X)))
print('Missing values in testing features:  ', np.sum(np.isnan(X_test)))

Missing values in training features:  0
Missing values in testing features:   0


##  нормализация данных

In [46]:
# Create the scaler object with a range of 0-1
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on the training data
scaler.fit(X)

# Transform both the training and testing data
X = scaler.transform(X)
X_test0 = scaler.transform(X_test)

In [47]:
from sklearn.model_selection import train_test_split

# Split into 70% training and 30% testing set
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size = 0.3, random_state = 42)

In [48]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1022, 233)
(438, 233)
(1022, 1)
(438, 1)


In [49]:
# Convert y to one-dimensional array (vector) Преобразовать y в одномерный массив (вектор)
y_train = np.array(y_train).reshape((-1, ))
y_test = np.array(y_test).reshape((-1, ))


In [50]:
# Function to calculate mean absolute error
def mae(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))

In [51]:
baseline_guess = np.median(y_train)

print('The baseline guess is a score of %0.2f' % baseline_guess)
print("Baseline Performance on the test set: MAE = %0.4f" % mae(y_test, baseline_guess))
print('min', y_train.min(),'max',y_train.max())


The baseline guess is a score of 165000.00
Baseline Performance on the test set: MAE = 57047.0046
min 34900 max 745000


##  Models to Evaluate

In [52]:
from hyperopt import hp
import numpy as np
from sklearn.metrics import mean_squared_error


# XGB parameters
xgb_reg_params = {
    'learning_rate':    hp.choice('learning_rate',    np.arange(0.01, 0.2, 0.01)),
    'max_depth':        hp.choice('max_depth',        np.arange(1, 30, 1, dtype=int)),
    'min_child_weight': hp.choice('min_child_weight', np.arange(1, 15, 1, dtype=int)),
    'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.1, 0.8, 0.1)),
    'subsample':        hp.uniform('subsample', 0.2, 1),
    'n_estimators':     hp.choice('n_estimators', np.arange(100, 3000, 50))
    }

xgb_fit_params = {
    'eval_metric': 'rmse',
    'early_stopping_rounds': 10,
    'verbose': False
    }

xgb_para = dict()
xgb_para['reg_params'] = xgb_reg_params
xgb_para['fit_params'] = xgb_fit_params
xgb_para['loss_func' ] = lambda y, pred: np.sqrt(mean_squared_error(y, pred))


import xgboost as xgb

from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials


class HPOpt(object):

    def __init__(self, x_train, x_test, y_train, y_test):
        self.x_train = x_train
        self.x_test  = x_test
        self.y_train = y_train
        self.y_test  = y_test

    def process(self, fn_name, space, trials, algo, max_evals):
        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as e:
            return {'status': STATUS_FAIL,
                    'exception': str(e)}
        return result, trials

    def xgb_reg(self, para):
        reg = xgb.XGBRegressor(**para['reg_params'])
        return self.train_reg(reg, para)

  
    def train_reg(self, reg, para):
        reg.fit(self.x_train, self.y_train,
                eval_set=[(self.x_train, self.y_train), (self.x_test, self.y_test)],
                **para['fit_params'])
        pred = reg.predict(self.x_test)
        loss = para['loss_func'](self.y_test, pred)
        return {'loss': loss, 'status': STATUS_OK}

    
    def write_submission(preds, output):
        sample = pd.read_csv('../data/sampleSubmission.csv')
        preds = pd.DataFrame(
            preds, index=sample.id.values, columns=sample.columns[1:])
        preds.to_csv(output, index_label='id')    


obj = HPOpt(X_train, X_test, y_train, y_test)

xgb_opt = obj.process(fn_name='xgb_reg', space=xgb_para, trials=Trials(), algo=tpe.suggest, max_evals=10)    

100%|█████████████████████████████████████████████████████████████| 10/10 [00:16<00:00,  1.60s/trial, best loss: 25384.027335475977]


In [53]:
print('xgb_opt:')
print(xgb_opt)

xgb_opt:
({'colsample_bytree': 6, 'learning_rate': 3, 'max_depth': 26, 'min_child_weight': 3, 'n_estimators': 28, 'subsample': 0.6334765503816718}, <hyperopt.base.Trials object at 0x7f725372d1c0>)


In [54]:
xgbr = xgb.XGBRegressor(cosample_bytree =2 , 
         learning_rate = 0, 
         max_depth =7,
         min_child_weight = 3, 
         n_estimators = 38,
         subsample =0.7719775358598138)

xgbr.fit(X_train, y_train)

klj=xgbr.predict(X_test0)
klj

Parameters: { "cosample_bytree" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




array([0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5], dtype=float32)

In [56]:
y_train

array([174000, 145000, 215200, ..., 115000, 189950, 174000])

In [55]:
xgb_model_run(X_train, X_test, y_train, y_test, X_test0, xgb_opt)

NameError: name 'xgb_model_run' is not defined

In [None]:
ID = pd.read_csv('data/test.csv')
ID = ID.Id
ID

In [None]:
obj.write_submission()