# In [286]:
import numpy as np
import pandas as pd


# Load the training and test tables from the CSV files in the working directory.
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))


def drop_outlier(df, outliers):
    """Drop the rows whose values exceed per-column upper limits.

    Args:
        df: DataFrame to filter; it is not modified in place.
        outliers: Mapping of column name to the largest value allowed in
            that column.

    Returns:
        The DataFrame without the index labels of any row that was strictly
        above a limit.  Values equal to the limit are kept, and with an
        empty mapping ``df`` itself is returned.
    """
    trimmed = df
    for column, upper_limit in outliers.items():
        # Limits are applied one after another, so each pass only inspects
        # the rows that survived the previous columns.
        above_limit = trimmed[column] > upper_limit
        doomed_labels = trimmed.loc[above_limit].index
        trimmed = trimmed.drop(doomed_labels)
    return trimmed


def process_features(train, test):
    """Clean, impute, log-scale and one-hot encode the train and test features.

    Both frames are stacked (train rows first) so that imputation values and
    dummy columns are shared between them.

    Args:
        train: Training feature frame.  Its missing-value ratios decide
            which columns are kept.
        test: Test feature frame with the same columns as ``train``.

    Returns:
        DataFrame with ``len(train) + len(test)`` rows in stacked order and
        with the original index labels.  Numeric columns hold
        ``log(x + 1)`` and the non-numeric columns are expanded by
        ``pd.get_dummies``.

    Raises:
        KeyError: If an expected column (for example ``Utilities`` or
            ``MSSubClass``) is missing.
    """
    features = pd.concat([train, test])

    # Keep only the columns that are less than 20% missing in the training
    # data (on the expected data set this removes Alley, FireplaceQu, PoolQC,
    # Fence and MiscFeature); Utilities is dropped explicitly.
    # ``.copy()`` makes the result an independent frame so the assignments
    # below never write to a view of the concatenated data.
    features = features.loc[:, train.isnull().mean() < 0.2].copy()
    features = features.drop(columns=['Utilities'])

    # MSSubClass is a numeric code; as a string it is one-hot encoded at the
    # end instead of being log-scaled.
    features['MSSubClass'] = features['MSSubClass'].apply(str)

    fill_in_mode = [
        'Exterior1st', 'Exterior2nd', 'Electrical', 'KitchenQual', 'Functional',
        'SaleType',
    ]
    fill_in_none = [
        'MSZoning', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
        'BsmtFinType1', 'BsmtFinType2', 'GarageType', 'GarageFinish', 'GarageQual',
        'GarageCond',
    ]
    fill_in_zero = [
        'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
        'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea',
    ]
    fill_in_mean = [
        'LotFrontage',
    ]

    # Columns are reassigned rather than filled with ``inplace=True`` on a
    # selected column: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active.
    for column in fill_in_mode:
        # ``mode()`` returns a Series; passing it to ``fillna`` directly
        # would align on index labels and leave most gaps unfilled, so the
        # most frequent value is extracted as a scalar.
        modes = features[column].mode()
        if not modes.empty:  # an all-NaN column has no mode to fill with
            features[column] = features[column].fillna(modes.iloc[0])

    for column in fill_in_none:
        features[column] = features[column].fillna('None')

    for column in fill_in_zero:
        features[column] = features[column].fillna(0)

    for column in fill_in_mean:
        features[column] = features[column].fillna(features[column].mean())

    # Log-scale the numeric columns only; text columns are left untouched
    # for the one-hot encoding below.
    for column in features.select_dtypes(include='number').columns:
        features[column] = np.log(features[column] + 1)

    return pd.get_dummies(features)


def process_dataframes(train, test):
    """Turn the raw train and test frames into model inputs.

    Outlier rows are removed from the training data only.  Both frames
    (without ``Id`` and, for train, ``SalePrice``) then go through
    ``process_features`` together and the result is split back by position.

    Args:
        train: Raw training DataFrame containing ``Id`` and ``SalePrice``.
        test: Raw test DataFrame containing ``Id``.

    Returns:
        Tuple ``(train_features, train_target, test_features)``, where
        ``train_target`` is the ``SalePrice`` column of the training rows
        that survived outlier removal.
    """
    # Rows strictly above any of these values are treated as outliers.
    upper_limits = {
        'LotFrontage': 300,
        'LotArea': 100000,
        'BsmtFinSF1': 5000,
        'TotalBsmtSF': 6000,
        '1stFlrSF': 4000,
        'GrLivArea': 4000,
    }
    kept_train = drop_outlier(train, upper_limits)
    n_train = kept_train.shape[0]

    combined = process_features(
        kept_train.drop(columns=['Id', 'SalePrice']),
        test.drop(columns=['Id']),
    )

    # process_features stacks the training rows first, so a positional cut
    # at n_train separates the two parts again.
    return (
        combined.iloc[:n_train, :],
        kept_train['SalePrice'],
        combined.iloc[n_train:, :],
    )


# Build the model inputs: processed features and SalePrice target for the
# training rows, plus the matching processed features for the test rows.
train_features, train_target, test_features = process_dataframes(train, test)


from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBRegressor


# Hold-out split of the processed training data.
# NOTE(review): test_size=0.8 leaves only 20% of the rows for training, and
# none of x_train/x_test/y_train/y_test is used by the ensemble below, which
# is fitted on the full training set -- confirm the split is as intended.
x_train, x_test, y_train, y_test = train_test_split(
    train_features,
    train_target,
    test_size=0.8,
    random_state=73)

models = [
    Lasso(alpha=0.001, max_iter=1000),
    XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=600, subsample=0.6, colsample_bytree=1),
    RandomForestRegressor(n_estimators=800, max_features='sqrt', oob_score=True),
    GradientBoostingRegressor(learning_rate=0.03, n_estimators=800, max_features='sqrt'),
]

# Every model is fitted on the log of the sale price, so the predictions
# are in log space as well.  One row per model, one column per test row.
log_target = np.log(train_target)
predicted_targets = np.array([
    model.fit(train_features, log_target).predict(test_features)
    for model in models
])

# Equal-weight ensemble: average over the model axis.
avg_predicted_target = predicted_targets.mean(axis=0)

output = pd.DataFrame({
    'Id': test['Id'],
    # Undo the log transform applied to the target before fitting.
    'SalePrice': np.exp(avg_predicted_target),
})

output.to_csv('submission.csv', index=False)