In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the raw train/test tables and turn them into scaled numeric matrices.
# NOTE(review): DATA_DIR is an absolute, machine-specific location; point it at your own copy.
DATA_DIR = '/Users/konstantin/MachineLearning/task4/data'
train_data = pd.read_csv(f'{DATA_DIR}/train.csv').drop(columns=["Id"])
test_data = pd.read_csv(f'{DATA_DIR}/test.csv').drop(columns=["Id"])

# Drop every feature that is missing in more than 70% of the training rows
# (the same columns are removed from the test table to keep both aligned).
null_share = train_data.isnull().mean()
sparse_features = null_share[null_share > 0.7].index
train_data = train_data.drop(columns=sparse_features)
test_data = test_data.drop(columns=sparse_features)

# Integer-encode the categorical (object) features. A single encoder per feature is
# fitted on the values of BOTH tables so that a given category maps to the same code
# in train and test; two independently fitted encoders would assign inconsistent codes
# whenever the sets of observed categories differ.
for feature in train_data.columns.drop('SalePrice'):
    if train_data[feature].dtype == 'object':
        encoder = LabelEncoder().fit(pd.concat([train_data[feature], test_data[feature]]))
        train_data[feature] = encoder.transform(train_data[feature])
        test_data[feature] = encoder.transform(test_data[feature])

# split: 80/20 hold-out; the target is modelled on the log1p scale.
X_train, X_test, y_train, y_test = train_test_split(
    train_data.drop(columns='SalePrice').values,
    np.log1p(train_data['SalePrice'].values),
    test_size=0.2,
    random_state=98987)

# Learn the fill values on the training split only and reuse them for the hold-out
# split, so no hold-out statistics are used and both matrices keep the same columns.
imputer = SimpleImputer(strategy='most_frequent')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Standardise with the training split's mean/variance.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Random Forest Regressor

In [3]:
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter grid searched exhaustively (3 * 4 * 3 * 3 * 4 = 432 combinations).
parameters = {
    'criterion': ['squared_error', 'absolute_error', 'poisson'],
    'n_estimators': [10, 50, 75, 100],
    'max_features': [1.0, 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 9],
    'max_depth': [10, 50, 100, 150], }
# random_state pins the forest's random sampling so the selected parameters are
# reproducible across runs (same seed as the train/test split); n_jobs=-1 runs the
# independent cross-validation fits on all available CPU cores.
rfr_model = GridSearchCV(RandomForestRegressor(random_state=98987), parameters, n_jobs=-1)
rfr_model.fit(X_train, y_train)
print("Best parameters for RFR is: {}".format(rfr_model.best_params_))

Best parameters for RFR is: {'criterion': 'squared_error', 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 75}


In [4]:
# Fit a fresh forest with the best hyper-parameters found by the grid search.
# random_state makes the reported errors reproducible (same seed as the train/test split).
model = RandomForestRegressor(random_state=98987, **rfr_model.best_params_)
model.fit(X_train, y_train)
# Score the estimator fitted above (previously the GridSearchCV object was scored instead,
# leaving `model` unused). Targets are log1p(SalePrice), so the errors are in log space.
print('Abs error on train data: %1.2f' % metrics.mean_absolute_error(y_train, model.predict(X_train)))
print('Abs error on test data: %1.2f' % metrics.mean_absolute_error(y_test, model.predict(X_test)))

Abs error on train data: 0.04
Abs error on test data 0.11:


# XGBoost Regressor

In [5]:
import xgboost as xgb

# Candidate values for each XGBoost hyper-parameter; GridSearchCV tries every
# combination (3 * 5 * 5 * 4 = 300) and keeps the best-scoring one.
parameters = dict(
    learning_rate=[0.1, 0.2, 0.3],
    max_depth=[2, 3, 4, 5, 6],
    min_child_weight=[1, 2, 3, 4, 5],
    n_estimators=[100, 300, 600, 1000],
)
xgb_model = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=parameters)
xgb_model.fit(X_train, y_train)
print("Best parameters for XGB is: {}".format(xgb_model.best_params_))

Best parameters for XGB is: {'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 300}


In [6]:
# `xgb_model` is rebound below from the GridSearchCV object to the final regressor.
# Stash the tuned parameters in their own variable first so this cell can be re-run:
# on a second run `xgb_model` is already an XGBRegressor, which has no `best_params_`.
if isinstance(xgb_model, GridSearchCV):
    xgb_best_params = xgb_model.best_params_
xgb_model = xgb.XGBRegressor(**xgb_best_params)
xgb_model.fit(X_train, y_train)
# Targets are log1p(SalePrice), so the errors below are in log space.
print('Abs error on train data: %1.2f' % metrics.mean_absolute_error(y_train, xgb_model.predict(X_train)))
print('Abs error on test data: %1.2f' % metrics.mean_absolute_error(y_test, xgb_model.predict(X_test)))

Abs error on train data: 0.04
Abs error on test data 0.09:
