In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training split and preview its first rows.
train_csv_path = "/kaggle/input/home-data-for-ml-course/train.csv"
train_data = pd.read_csv(train_csv_path)
train_data.head()

In [None]:
# Read the test split and preview its first rows.
test_csv_path = "/kaggle/input/home-data-for-ml-course/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data.head()

## Data Exploration

In [None]:
from matplotlib import pyplot as plt

# Names of the int64/float64 columns; reused by the boxplot cell below.
num_feats = train_data.select_dtypes(include=['int64', 'float64']).columns

# One histogram per numeric column, all drawn on a single figure.
numeric_train = train_data[num_feats]
numeric_train.hist(bins=30, figsize=(15, 10))
plt.tight_layout()
plt.show()

In [None]:
# Visualizing outliers with Box Whiskers Plot
import seaborn as sns

# One horizontal boxplot per numeric feature, stacked vertically, to eyeball outliers.
# `list(...)` yields the column names whether `num_feats` is a column Index or a
# DataFrame, so the panel count is always the number of features.
numeric_cols = list(num_feats)

fig, axes = plt.subplots(
    nrows=len(numeric_cols),
    ncols=1,
    figsize=(15, len(numeric_cols) * 2.5),
    squeeze=False,  # always a 2-D array of axes, even with a single feature
)

for ax, col in zip(axes[:, 0], numeric_cols):
    sns.boxplot(x=train_data[col], ax=ax)
    ax.set_title(f'Boxplot of {col}', fontsize=12)

# Lay the figure out once, after every panel exists, rather than once per panel.
fig.tight_layout()
plt.show()

## Data Cleaning and Preparation

In [None]:
# Object-typed (categorical) columns of the training data, and how many
# distinct values each one holds, smallest first.
cat_feats = train_data.select_dtypes(include='object').columns
cat_cardinality = train_data[cat_feats].nunique()
cat_cardinality.sort_values()

In [None]:
# Numeric training columns that contain missing values, with their NaN counts.
# Dedicated names are used so the `num_feats` column Index built for the
# plotting cells is not replaced by a DataFrame (which would break re-running them).
train_numeric = train_data.select_dtypes(include='number')
train_numeric_missing = train_numeric.isna().sum()  # computed once, reused for the filter
train_numeric_missing[train_numeric_missing > 0]

In [None]:
# Repair the (MasVnrType, MasVnrArea) pair in the training data.
# All masks are taken from the untouched columns first, so each one describes
# the original missing-value pattern rather than a partially filled frame.
vnr_type_missing = train_data['MasVnrType'].isna()
vnr_area_missing = train_data['MasVnrArea'].isna()

both_nan = vnr_type_missing & vnr_area_missing
num_only_nan = ~vnr_type_missing & vnr_area_missing
cat_only_nan = vnr_type_missing & ~vnr_area_missing

# Mean area over the rows whose veneer type is recorded.
mean_value = train_data.loc[~vnr_type_missing, 'MasVnrArea'].mean()

# Neither value recorded -> zero area.
train_data.loc[both_nan, 'MasVnrArea'] = 0

# Type recorded but area missing -> mean area.
train_data.loc[num_only_nan, 'MasVnrArea'] = mean_value

# Area recorded but type missing -> explicit placeholder category. This must
# target `cat_only_nan`; writing to `num_only_nan` would overwrite valid types.
# A string label keeps the column homogeneous.
# NOTE(review): assumes a missing type with a recorded area means "no veneer
# type" -- confirm against the data description, and apply the same label to
# the test data before one-hot encoding so the dummy columns line up.
train_data.loc[cat_only_nan, 'MasVnrType'] = 'None'

In [None]:
# Boolean flag: True wherever the veneer area differs from zero.
train_data['HasVnr'] = train_data['MasVnrArea'].ne(0)

In [None]:
# Show the training rows flagged as having no veneer.
train_data.loc[~train_data['HasVnr']]

In [None]:
# Fill missing GarageYrBlt values in the training data, guided by GarageType.
garage_type_known = train_data['GarageType'].notna()
garage_year_missing = train_data['GarageYrBlt'].isna()

# Mean build year over the rows whose garage type is recorded.
mean_value = train_data.loc[garage_type_known, 'GarageYrBlt'].mean()

both_nan = ~garage_type_known & garage_year_missing
num_only_nan = garage_type_known & garage_year_missing

# Neither value recorded -> 0; type recorded but year missing -> mean year.
train_data.loc[both_nan, 'GarageYrBlt'] = 0
train_data.loc[num_only_nan, 'GarageYrBlt'] = mean_value

In [None]:
# Boolean flag: True wherever the (imputed) garage build year differs from zero.
train_data['HasGarage'] = train_data['GarageYrBlt'].ne(0)

In [None]:
# Missing lot frontage is replaced by zero.
train_data['LotFrontage'] = train_data['LotFrontage'].fillna(0)

In [None]:
# Categorical columns with at most 10 distinct values are one-hot encoded.
train_cat_cardinality = train_data[cat_feats].nunique()
one_hot_enc_feats = train_cat_cardinality[train_cat_cardinality <= 10].index.tolist()
print(train_data[one_hot_enc_feats].nunique().sort_values())

In [None]:
# Categorical columns with more than 10 distinct values are target encoded.
train_cat_cardinality = train_data[cat_feats].nunique()
target_enc_feats = train_cat_cardinality[train_cat_cardinality > 10].index.tolist()
print(train_data[target_enc_feats].nunique().sort_values())

In [None]:
# Expand the low-cardinality categorical columns into indicator columns.
train_data_enc = pd.get_dummies(data=train_data, columns=one_hot_enc_feats)

In [None]:
# Split the encoded training frame into target and features.
# Only `SalePrice` is removed, so `Id` remains in the feature matrix; the
# submission cell reads `Id` back from the aligned test matrix.
y_train = train_data_enc['SalePrice']
X_train = train_data_enc.drop(columns=['SalePrice'])

In [None]:
from category_encoders import TargetEncoder

# Fit the target encoder on the high-cardinality columns and write the encoded
# values back into the training features. The fitted encoder is reused on the test set.
target_encoder = TargetEncoder()
encoded_train_cols = target_encoder.fit_transform(X_train[target_enc_feats], y_train)
X_train[target_enc_feats] = encoded_train_cols

In [None]:
# Inspect one target-encoded column.
X_train.loc[:, 'Exterior1st']

In [None]:
# Feature columns that still contain missing values, with their NaN counts.
x_train_missing = X_train.isna().sum()
x_train_missing[x_train_missing > 0]

In [None]:
# Store the two engineered flags as integers.
for flag_col in ('HasGarage', 'HasVnr'):
    X_train[flag_col] = X_train[flag_col].astype(int)

In [None]:
# Sanity check: list any training feature that is neither numeric nor boolean.
non_numeric_train = X_train.select_dtypes(exclude=['number', 'bool'])
non_numeric_train.columns

In [None]:
import xgboost as xgb

# Gradient-boosted regressor trained on the full encoded training set.
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

In [None]:
import matplotlib.pyplot as plt
# Plot the fitted model's feature importances.
importance_ax = xgb.plot_importance(model)
plt.show()

In [None]:
# Object-typed (categorical) columns of the test data, and how many distinct
# values each one holds, smallest first.
cat_feats = test_data.select_dtypes(include='object').columns
test_cat_cardinality = test_data[cat_feats].nunique()
test_cat_cardinality.sort_values()

In [None]:
# Numeric TEST columns that contain missing values, with their NaN counts.
# This cell belongs to the test-set section, so it inspects `test_data`
# (it previously re-inspected `train_data`). Dedicated names avoid replacing
# the `num_feats` column Index used by the plotting cells.
test_numeric = test_data.select_dtypes(include='number')
test_numeric_missing = test_numeric.isna().sum()
test_numeric_missing[test_numeric_missing > 0]

In [None]:
# Cardinality, in the test set, of the columns chosen for one-hot encoding.
# `one_hot_enc_feats` is deliberately NOT rebuilt from test cardinalities: the
# encoding plan was decided on the training data, and a column that crosses the
# 10-value threshold in the test set would otherwise be encoded differently
# from how the model saw it during training.
print(test_data[one_hot_enc_feats].nunique().sort_values())

In [None]:
# Cardinality, in the test set, of the columns chosen for target encoding.
# `target_enc_feats` is deliberately NOT rebuilt from test cardinalities: the
# already-fitted `target_encoder` must be given exactly the columns it was
# fitted on, which are the ones selected from the training data.
print(test_data[target_enc_feats].nunique().sort_values())

In [None]:
# Clean a copy of the test data the way the training data was cleaned, then
# one-hot encode it. Without this step the engineered flags do not exist in the
# test frame and end up zero-filled for every row during column alignment.
test_clean = test_data.copy()

# Missing-value patterns, taken before anything is filled.
vnr_type_missing = test_clean['MasVnrType'].isna()
vnr_area_missing = test_clean['MasVnrArea'].isna()
garage_type_missing = test_clean['GarageType'].isna()
garage_year_missing = test_clean['GarageYrBlt'].isna()

# Neither member of the pair recorded -> numeric value 0.
test_clean.loc[vnr_type_missing & vnr_area_missing, 'MasVnrArea'] = 0
test_clean.loc[garage_type_missing & garage_year_missing, 'GarageYrBlt'] = 0
test_clean['LotFrontage'] = test_clean['LotFrontage'].fillna(0)

# Area recorded but veneer type missing -> explicit placeholder category.
# NOTE(review): keep this label identical to the one used when cleaning
# `train_data`, otherwise the dummy column will not line up with training.
test_clean.loc[vnr_type_missing & ~vnr_area_missing, 'MasVnrType'] = 'None'

# Rows whose category is known but whose numeric value is missing stay NaN:
# the training-set mean used for them is not kept in a dedicated variable.
# XGBoost accepts NaN as "missing". TODO: persist the training means and reuse them.

# Engineered flags, stored as integers: 1 unless the value is exactly 0
# (a remaining NaN counts as "present", matching the mean-imputed training rows).
test_clean['HasVnr'] = test_clean['MasVnrArea'].ne(0).astype(int)
test_clean['HasGarage'] = test_clean['GarageYrBlt'].ne(0).astype(int)

test_data_enc = pd.get_dummies(test_clean, columns=one_hot_enc_feats)

In [None]:
# Work on a copy: the next cell overwrites columns of `X_test` in place, and
# with a plain alias that would also rewrite `test_data_enc`, so re-running
# these cells would target-encode already encoded values.
X_test = test_data_enc.copy()

In [None]:
# Apply the encoder fitted on the training data to the test features.
encoded_test_cols = target_encoder.transform(X_test[target_enc_feats])
X_test[target_enc_feats] = encoded_test_cols

In [None]:
# Sanity check: list any test feature that is neither numeric nor boolean.
non_numeric_test = X_test.select_dtypes(exclude=['number', 'bool'])
non_numeric_test.columns

In [None]:
# Align the test features with the columns the model was trained on, then predict.
# Training columns absent from the test frame are reported rather than filled
# silently: zero is only a sensible default for one-hot indicator columns.
missing_in_test = X_train.columns.difference(X_test.columns)
print(
    f"Zero-filling {len(missing_in_test)} training columns absent from the test set: "
    f"{list(missing_in_test)}"
)

# A single reindex adds the missing columns (filled with 0), drops test-only
# columns and restores the training column order, without inserting columns
# one at a time into the frame.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
y_pred = model.predict(X_test)

In [None]:
# Pair each test Id with its predicted price and write the submission file.
submission = X_test[["Id"]].assign(SalePrice=y_pred)
submission.to_csv("submission.csv", index=False)