In [4]:
# Download the competition files with the Kaggle CLI (needs API credentials:
# a kaggle.json file or the environment-variable method).
# NOTE(review): the recorded output shows this command failing because kaggle.json
# was not found, and the cells below read the CSVs from ../input instead —
# confirm whether this cell is still needed.
!kaggle competitions download -c house-prices-advanced-regression-techniques

Traceback (most recent call last):
  File "/opt/conda/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/opt/conda/lib/python3.7/site-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/opt/conda/lib/python3.7/site-packages/kaggle/api/kaggle_api_extended.py", line 166, in authenticate
    self.config_file, self.config_dir))
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


In [5]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import os

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

print("Done")

Done


# Constants 👨‍🎓

In [6]:
# Directory holding the competition files, and the CSVs read from it.
PATH = "../input/house-prices-advanced-regression-techniques"
TRAIN_PATH, TEST_PATH, SUBMISSION_PATH = (
    os.path.join(PATH, csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Names of the dataset columns referenced throughout the notebook.
SALE_PRICE = "SalePrice"        # prediction target
LOT_AREA = "LotArea"
OVERALL_QUAL = "OverallQual"
GRLIVAREA = "GrLivArea"
GARAGE_YR_BLT = "GarageYrBlt"
MAS_VNR_AREA = "MasVnrArea"
TOTAL_BSMT_SF = "TotalBsmtSF"
GARAGE_CARS = "GarageCars"
GARAGE_AREA = "GarageArea"

# Functions 🧮

**🟦EN** Helper function used later to summarize a dataset: the number of records and features and, for each feature, its data type, number of unique values and number of missing values (NaN).

**🟥ES** Función auxiliar que usamos más adelante para resumir un conjunto de datos: el número de registros y de características y, para cada característica, su tipo de dato, su número de valores únicos y su número de valores faltantes (NaN).

In [7]:
def data_description(df):
    """Print a summary of ``df``.

    Prints the number of records and features, followed by a table with one
    row per column giving its name, dtype, number of unique values and number
    of missing (NaN) values.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to describe.

    Returns
    -------
    None
        The summary is only printed.
    """
    print("Data description")
    print(f"Total number of records {df.shape[0]}")
    print(f'number of features {df.shape[1]}\n\n')

    # One row per feature; .values drops the column-name index so every
    # entry lines up positionally with the list of names.
    variable_desc = pd.DataFrame({
        "Name": list(df.columns),
        "Type": df.dtypes.values,
        "Unique levels": df.nunique().values,
        "Missing": df.isna().sum().values,
    })

    # Show every row of the summary, but only for this print: option_context
    # restores the previous display.max_rows afterwards instead of changing
    # it for the rest of the session.
    with pd.option_context('display.max_rows', None):
        print(variable_desc)

In [None]:
def show_regplot(df, col1, col2, title = ""):
    """Plot ``col2`` against ``col1`` with seaborn's regplot.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    col1 : str
        Column drawn on the x axis.
    col2 : str
        Column drawn on the y axis.
    title : str, optional
        Title shown above the plot.
    """
    _, ax = plt.subplots(figsize=(10, 5))
    ax.set_title(title)
    # The fitted line is drawn in green.
    sns.regplot(data=df, x=col1, y=col2, line_kws={"color": "green"}, ax=ax)

In [None]:
def show_missing_values(df):
    """Summarize the missing values of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, sorted by decreasing number of missing
        values, with two columns: ``"Nan values"`` (count of missing entries)
        and ``"%"`` (share of the rows that are missing, from 0 to 100).
    """
    is_missing = df.isnull()
    nan_values = is_missing.sum().sort_values(ascending=False)
    # The mean of the boolean mask is missing / total rows. Dividing by
    # df.count() (the non-missing rows) would overstate the share and give
    # inf for a column that is entirely NaN.
    percent = (is_missing.mean() * 100).reindex(nan_values.index)
    missing = pd.concat([nan_values, percent], keys=["Nan values", "%"], axis=1)
    return missing

# Data processing

In [8]:
# Read the training set, the test set and the sample submission.
train_data, test_data, submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_PATH, TEST_PATH, SUBMISSION_PATH)
)

# Keep the test Ids aside: they are needed to build the submission file,
# and the Id column is dropped from test_data further down.
test_ID = test_data['Id']

In [None]:
train_data.head()

In [None]:
data_description(train_data)

In [None]:
test_data.head()

In [None]:
data_description(test_data)

In [None]:
# Distribution of the target variable in the training set.
fig, ax = plt.subplots(figsize = (10, 5))
sns.histplot(data = train_data, x = SALE_PRICE, ax = ax)

In [None]:
show_regplot(train_data, SALE_PRICE, LOT_AREA, "Relation LotArea with SalePrice")

In [None]:
# Pairwise correlations between the numeric columns only. Selecting them
# explicitly keeps this working on pandas >= 2.0, where DataFrame.corr()
# raises on text columns instead of silently skipping them.
correlation = train_data.select_dtypes(include="number").corr()

# Heatmap of the k columns most correlated with the sale price
# (SalePrice itself is one of them).
k = 10
cols = correlation.nlargest(k, SALE_PRICE)[SALE_PRICE].index
# Slice the matrix computed above rather than recomputing it with np.corrcoef,
# which would return NaN for any selected column containing missing values.
cm = correlation.loc[cols, cols].values
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, fmt='.2f', annot_kws={'size': 8}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()


In [None]:
# Title fixed: the plot shows OverallQual, not LotArea.
show_regplot(train_data, SALE_PRICE, OVERALL_QUAL, "Relation OverallQual with SalePrice")

In [None]:
# Title fixed: the plot shows GrLivArea, not LotArea.
show_regplot(train_data, SALE_PRICE, GRLIVAREA, "Relation GrLivArea with SalePrice")

In [None]:
# Names of the columns whose correlation with SalePrice is above 0.4
# (SalePrice itself included).
is_correlated = correlation[SALE_PRICE] > 0.4
index_corr = correlation.loc[is_correlated].index
print(index_corr)

In [None]:
# Keep only the selected columns. .copy() makes each result an independent
# frame, so the fillna assignments made on them later do not trigger
# pandas' SettingWithCopyWarning.
train_data = train_data[index_corr].copy()
# SalePrice is the target, so it is dropped before selecting the test columns.
index_corr = index_corr.drop(SALE_PRICE)
test_data = test_data[index_corr].copy()
print("Done")

In [None]:
train_missing = show_missing_values(train_data)

In [None]:
train_missing.head(15)

In [None]:
test_missing = show_missing_values(test_data)
test_missing.head(15)

In [None]:
print(train_data[GARAGE_YR_BLT].dtype)
print(train_data[GARAGE_YR_BLT][:10])

In [None]:
# Fill the missing values of both frames with the *training* mean of each
# column. Using the training statistic for the test set as well means the test
# rows get exactly the same preprocessing the models are fitted with, instead
# of a value derived from the test data itself.
for col in (GARAGE_YR_BLT, MAS_VNR_AREA):
    train_mean = train_data[col].mean()
    train_data[col] = train_data[col].fillna(train_mean)
    test_data[col] = test_data[col].fillna(train_mean)

In [None]:
# These columns are filled only in the test set. The fill value is the
# *training* mean of the column, so the test rows are preprocessed with
# statistics learned from the training data rather than from the test data.
for col in (TOTAL_BSMT_SF, GARAGE_CARS, GARAGE_AREA):
    test_data[col] = test_data[col].fillna(train_data[col].mean())

# Sanity check: missing values remaining per column.
print(test_data.isna().sum())

# Train 🧠

In [None]:
print(train_data.columns)
print(train_data.shape)

In [None]:
# Separate the training frame into the feature matrix and the target vector.
X = train_data.drop(columns = SALE_PRICE)
Y = train_data[SALE_PRICE].values

print("Done")

In [None]:
# Hold out 20% of the rows for validation. The fixed random_state makes the
# split, and therefore the validation scores computed from it, reproducible
# from one run to the next.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2, random_state = 0)

print("Done")

In [None]:
# Hyper-parameters for GradientBoostingRegressor.
# 'loss' is intentionally left out: the estimator's default is the squared-error
# loss in every scikit-learn version, whereas the explicit name 'ls' was
# deprecated in 1.0 and removed in 1.2 (replaced by 'squared_error').
param = {'n_estimators': 150, 'max_depth': 5, 'min_samples_split': 2,
          'learning_rate': 0.05}

In [None]:
# XGBoost regressor. 'reg:squarederror' is the current name of the squared-error
# objective (the former name 'reg:linear' is deprecated), and reg_alpha is the
# scikit-learn-API name of the L1 penalty that XGBoost also accepts as `alpha`.
xg_reg = XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, reg_alpha = 10, n_estimators = 10)

# scikit-learn gradient boosting, configured from the `param` dictionary.
gb_reg = GradientBoostingRegressor(**param)

# Random forest with a fixed seed.
rfr = RandomForestRegressor(n_estimators = 100, random_state = 0)

# LightGBM regressor; the bagging and feature-sampling seeds are fixed.
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.1, n_estimators=500,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

In [None]:
# (display name, estimator) pairs, in the order in which they are fitted and scored.
# Despite the variable name, all of them are regressors.
classifiers = [
    ('Gradient Boosting Regression', gb_reg),
    ('XGBRegressor', xg_reg),
    ('RandomForestRegressor', rfr),
    ('LightGBM', model_lgb),
]

In [None]:
# Fit every model on the training split and report its mean squared error
# on the validation split.
for model_name, model in classifiers:
    model.fit(X_train, Y_train)
    val_mse = mean_squared_error(Y_val, model.predict(X_val))
    print(f'{model_name:s} MSE : {val_mse:.3f}')

# Submission 📩

In [None]:
# The models are fitted on the raw SalePrice values (no log transform is applied
# to the target), so their predictions are already prices. Passing them through
# np.expm1 would exponentiate values in the hundreds of thousands and overflow
# to inf, so the predictions are used as they are.
# test_data is passed as a DataFrame, like the frames used for fitting, so the
# column names seen at prediction time match the ones seen during training.
lgb_pred = model_lgb.predict(test_data)

glb_pred = gb_reg.predict(test_data)

model_xgb_pred = xg_reg.predict(test_data)

In [None]:
# Blend the three sets of predictions with fixed weights:
# 70% LightGBM, 10% gradient boosting, 20% XGBoost.
weighted_predictions = 0.70 * lgb_pred + 0.10 * glb_pred + 0.20 * model_xgb_pred
ensemble = pd.DataFrame({'SalePrice': weighted_predictions})
ensemble.head()

In [None]:
# Put the test Ids side by side with the blended predictions to form the
# submission table.
test_ID = pd.DataFrame(test_ID, columns = ['Id'])
submission_parts = [test_ID, ensemble]
result = pd.concat(submission_parts, axis = "columns")
result.head()

In [None]:
result.to_csv('submission.csv',index=False)