In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and echo the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the labelled training data and preview its first rows.
train_set = pd.read_csv(
    "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
)
train_set.head()

In [None]:
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Separate the target from the predictors, then hold out 20% of the rows for validation.
y = train_set['SalePrice']
X = train_set.drop(columns=['SalePrice'])

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,  # fixed seed so the split is reproducible
)

In [None]:
# Display the summary statistics that describe() computes for the training target.
y_train.describe()

In [None]:
# Show the training-feature dimensions, a blank separator, then the per-column summary.
print(f"{X_train.shape} \n")
X_train.info()

In [None]:
def add_area_features(frame):
    """Return a copy of ``frame`` with the two engineered area columns appended.

    Added columns:
        TotalArea  -- ``LotFrontage`` + ``LotArea``
        totalFlrSF -- ``1stFlrSF`` + ``2ndFlrSF``

    The same helper is applied to the training and the test data so both always
    receive identical feature engineering. The input frame is not modified.
    """
    # NOTE(review): a missing value in either operand makes the sum NaN; such rows are
    # later filled by the numeric imputer of the pipeline -- confirm this is intended.
    return frame.assign(
        TotalArea=frame['LotFrontage'] + frame['LotArea'],
        totalFlrSF=frame['1stFlrSF'] + frame['2ndFlrSF'],
    )


# Rebuild features/target from the raw frame so this cell is idempotent on re-run.
X = add_area_features(train_set.drop(['SalePrice'], axis=1))
y = train_set['SalePrice']

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Categorical columns: object dtype AND fewer than 10 distinct values in the training split.
categorical_cols = [colname for colname in X_train.columns
                    if X_train[colname].nunique() < 10 and X_train[colname].dtype == "object"]

# Numerical columns: int64 or float64. 'Id' is excluded because it is the row identifier
# (it is what the submission file is keyed on), not a property of the house.
numerical_cols = [cname for cname in X_train.columns
                  if X_train[cname].dtype in ['int64', 'float64'] and cname != 'Id']

numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                        ('scaling', StandardScaler())])

# Missing categorical values are replaced by the literal category "None" *inside* the
# pipeline, so the training, validation and test rows are all treated identically.
# (Previously only the training and test frames were filled by hand; the validation
# split was not, so its missing values were encoded differently.)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('ohe', OneHotEncoder(handle_unknown='infrequent_if_exist')),
])

preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_cols),
                                               ('cat', categorical_transformer, categorical_cols)])

lgbm_reg = LGBMRegressor(num_leaves=10, max_depth=10, learning_rate=0.1)

# The model is fitted on log prices; predictions are therefore on the log scale.
pipe = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", lgbm_reg)])
pipe.fit(X_train, np.log(y_train))

# RMSE between log prices and log-scale predictions on the held-out validation split.
y_pred = pipe.predict(X_val)
score = np.sqrt(mean_squared_error(np.log(y_val), y_pred))
print(f'RMSLE on validation set: {score}')

# Competition test set: same feature engineering, preprocessing happens in the pipeline.
teste = add_area_features(
    pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
)

p = pipe.predict(teste)

# np.exp undoes the log transform applied to the target at fit time.
# NOTE(review): the identifier column is read as 'Id' but written as lower-case 'id' --
# confirm the competition accepts this header.
d = {'id': teste['Id'], 'SalePrice': np.exp(p)}

predictions = pd.DataFrame(d)
# The earlier version of this cell (with 'Id' as a feature and hand-filled categoricals)
# scored 0.13178 on the leaderboard; re-submit to obtain the score of this version.
predictions.to_csv("/kaggle/working/submission.csv", index=False)