In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML

plt.style.use("ggplot")

# Import the Data

In [None]:
# Load the competition's training and test CSV files from the Kaggle input directory.
train, test = (
    pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv"),
    pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv"),
)

In [None]:
# List every column label of the training frame.
train.columns

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summarise each column's dtype and non-null count.
train.info()

# These columns have many missing values.

In [None]:
# Names of the six columns with the most missing values, most-missing first.
missing_per_column = train.isna().sum()
missing_per_column.sort_values(ascending=False).head(6).index

In [None]:
# Remove the columns that are mostly missing from the training frame.
mostly_missing = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage']
train = train.drop(columns=mostly_missing)

# Continuous Variables.

In [None]:
# Treat every column with more than 20 distinct values as continuous.
# The rule looks at cardinality only, so it does not check the column dtype.
has_many_values = train.nunique() > 20
continuos_variable = train.loc[:, has_many_values]

In [None]:
# Summary statistics, transposed so that each selected column is one row.
continuos_variable.describe().T

# Categorical Data

In [None]:
# Treat every column with at most 15 distinct values as categorical.
has_few_values = train.nunique() <= 15
Categorical = train.loc[:, has_few_values]

In [None]:
# Summary statistics, transposed so that each selected column is one row.
Categorical.describe().T

# Exploratory Data Analysis

In [None]:
# Heatmap of the 20 features most correlated with SalePrice.
# numeric_only=True is required: the frame still holds string columns, and
# DataFrame.corr() raises on them in pandas >= 2.0 instead of dropping them.
corrmat = train.corr(numeric_only=True).nlargest(20, "SalePrice")
# Transpose and keep the same 20 features on the other axis -> square matrix.
corrmat = corrmat.T.sort_values("SalePrice", ascending=False)[:20]
# Hide the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(corrmat))
plt.figure(figsize=(10, 10))
sns.heatmap(
    data=corrmat,
    mask=mask,
    annot=True,
    square=True,
    cmap="Blues",
    vmax=.8,
    annot_kws={"fontsize": 8},
)
plt.show()

In [None]:
# Print the value distribution of every column with at most 10 distinct values.
categoricals = train.columns[train.nunique() <= 10]

separator = "=" * 20
for column in categoricals:
    # One print call per feature: header, blank line, counts, blank line, rule.
    print(f"Feature name: {column}\n", train[column].value_counts(), "", separator, sep="\n")

In [None]:
# Box plot of every numeric column with more than 20 distinct values.
# Selecting by dtype (instead of dropping "Neighborhood" by name) keeps any
# high-cardinality text column out of the plots.
numeric_cols = train.select_dtypes(include="number").columns
cols = numeric_cols[train[numeric_cols].nunique() > 20]

# Size the grid from the number of columns so it can neither overflow
# (IndexError with a fixed 6x4 grid) nor be indexed incorrectly when small.
n_grid_cols = 4
n_grid_rows = max(1, int(np.ceil(len(cols) / n_grid_cols)))
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 20), squeeze=False)

for ax, name in zip(axes.flat, cols):
    sns.boxplot(data=train[name], ax=ax)
    ax.set_title(name)

# Hide the panels that received no feature.
for ax in axes.flat[len(cols):]:
    ax.set_visible(False)

In [None]:
# Mean sale price for every (YearBuilt, OverallQual) pair, as a flat frame.
yb = train.groupby(["YearBuilt", "OverallQual"], as_index=False)["SalePrice"].mean()

plt.figure(figsize=(30, 30))
sns.scatterplot(
    data=yb,
    x="YearBuilt",
    y="SalePrice",
    hue="OverallQual",
    size="OverallQual",
    palette="Blues",
)
plt.xticks(rotation=90)

plt.show()

In [None]:
# Number of houses at each OverallQual level.
fig, ax = plt.subplots(figsize=(20, 6))
sns.countplot(data=train, x="OverallQual", color="skyblue", ax=ax)
ax.set_xlabel("OverallQual")
ax.set_ylabel("Count")

plt.show()

In [None]:
# Distribution of sale prices with a kernel density estimate overlaid.
plt.figure(figsize=(15, 10))
sns.histplot(train["SalePrice"], kde=True)

In [None]:
# Share of houses in each sale-price band.
plt.figure(figsize=(15, 10))
bins = [0, 100000, 200000, 300000, train['SalePrice'].max()]
labels = ['0-100k', '100k-200k', '200k-300k', '300k+']
train['price_range'] = pd.cut(train['SalePrice'], bins=bins, labels=labels)

# The wedges are ordered by size, so the wedge labels must come from the sorted
# counts' own index. Passing `labels` (bin order) names the wrong wedges.
range_counts = train['price_range'].value_counts().sort_values()

plt.pie(
    range_counts,
    autopct='%.1f%%',
    labels=range_counts.index.tolist(),
    labeldistance=1.15,
    wedgeprops={'linewidth': 3, 'edgecolor': 'white'},
)

plt.show()

In [None]:
# Keep only houses sold for less than 350k. The explicit copy makes `train` an
# independent frame, so assigning new columns to it in later cells does not
# trigger SettingWithCopyWarning.
train = train[train["SalePrice"] < 350000].copy()
train.shape

In [None]:
# Share of houses in each sale-price band, recomputed on the current `train`.
plt.figure(figsize=(15, 10))
bins = [0, 100000, 200000, 300000, train['SalePrice'].max()]
labels = ['0-100k', '100k-200k', '200k-300k', '300k+']
train['price_range'] = pd.cut(train['SalePrice'], bins=bins, labels=labels)

# The wedges are ordered by size, so the wedge labels must come from the sorted
# counts' own index. Passing `labels` (bin order) names the wrong wedges.
range_counts = train['price_range'].value_counts().sort_values()

plt.pie(
    range_counts,
    autopct='%.1f%%',
    labels=range_counts.index.tolist(),
    labeldistance=1.15,
    wedgeprops={'linewidth': 3, 'edgecolor': 'white'},
    textprops={'color': 'black'},
)

plt.show()


# Mean SalePrice by Features

In [None]:
def plot_mean_saleprice(column_name):
    """Draw a bar chart of the mean SalePrice for each value of `column_name`.

    Reads the notebook-level `train` frame. The bar of the group with the
    highest mean price is painted red, all other bars are blue. The figure is
    shown immediately; nothing is returned.
    """
    mean_price = train.groupby(column_name)[["SalePrice"]].mean()

    plt.figure(figsize=(20, 8))
    ax = sns.barplot(data=mean_price, x=mean_price.index, y="SalePrice", color='blue')

    # Assumes seaborn draws one bar per group in index order, so the position
    # of the top group in the index is also the position of its bar patch.
    top_group = mean_price['SalePrice'].idxmax()
    top_position = mean_price.index.get_loc(top_group)
    ax.patches[top_position].set_facecolor('red')

    plt.xlabel(column_name)
    plt.ylabel('SalePrice')
    plt.title(f'Mean SalePrice by {column_name}')

    # Only the YearBuilt axis gets vertical tick labels.
    if column_name == "YearBuilt":
        plt.xticks(rotation=90)

    plt.show()


In [None]:
import matplotlib.pyplot as plt

ap = ["OverallQual", "YearBuilt", "GarageCars", "MoSold"]

# plot_mean_saleprice opens and shows its own figure on every call, so no
# figure is created, laid out or shown here. The extra plt.figure() /
# plt.tight_layout() / plt.show() calls only rendered empty canvases.
for column in ap:
    plot_mean_saleprice(column)


# Mean SalePrice by Time Series Data

In [None]:
import numpy as np

# Mean sale price per remodel year, and the year where that mean peaks.
pt = train.groupby("YearRemodAdd")[["SalePrice"]].mean()
max_year = pt["SalePrice"].idxmax()
max_price = pt.loc[max_year, "SalePrice"]

# Compute the trendline: a degree-1 least-squares fit through the yearly means.
coefficients = np.polyfit(pt.index, pt["SalePrice"], 1)
trendline = np.poly1d(coefficients)(pt.index)

plt.figure(figsize=(10, 5))
plt.plot(pt.index, pt["SalePrice"])
plt.scatter(max_year, max_price, color="blue", label="Highest Price")
plt.plot(pt.index, trendline, color="blue", label="Trendline")
plt.legend()



In [None]:
import statsmodels.api as sm

def plot_with_autocorrelation(column):
    """Plot the mean SalePrice per value of `column` with its ACF and PACF.

    Reads the notebook-level `train` frame and uses the notebook-level `sm`
    (statsmodels) and `plt` names. Three stacked panels are drawn: the mean
    price series with its highest point marked, the autocorrelation plot, and
    the partial autocorrelation plot. The figure is shown immediately; nothing
    is returned.

    The series is ordered by the distinct values of `column`; the lags in the
    two correlograms are therefore steps between consecutive distinct values.
    """
    # Mean sale price for each distinct value of the column.
    pt = train.groupby(column).agg({"SalePrice": "mean"})
    max_value_index = pt["SalePrice"].idxmax()
    max_price = pt.loc[max_value_index, "SalePrice"]

    # The ACF/PACF values are computed by the plotting helpers below; the
    # separate acf()/pacf() calls whose results were never used are removed.

    # Three stacked panels: series, autocorrelation, partial autocorrelation.
    fig, axes = plt.subplots(3, figsize=(15, 8))

    axes[0].plot(pt.index, pt["SalePrice"])
    axes[0].scatter(max_value_index, max_price, color="blue", label="Highest Price")
    axes[0].legend()
    axes[0].set_xlabel(column)
    axes[0].set_ylabel("Sale Price")
    axes[0].set_title("Average Sale Price by " + column)

    sm.graphics.tsa.plot_acf(pt["SalePrice"], ax=axes[1])
    axes[1].set_xlabel("Lag")
    axes[1].set_ylabel("Autocorrelation")
    axes[1].set_title("Autocorrelation")

    sm.graphics.tsa.plot_pacf(pt["SalePrice"], ax=axes[2])
    axes[2].set_xlabel("Lag")
    axes[2].set_ylabel("Partial Autocorrelation")
    axes[2].set_title("Partial Autocorrelation")

    plt.tight_layout()
    plt.show()


In [None]:
# Mean sale price by construction year, with ACF and PACF panels.
plot_with_autocorrelation("YearBuilt")

In [None]:
# Mean sale price by remodel year, with ACF and PACF panels.
plot_with_autocorrelation("YearRemodAdd")

In [None]:
# Number of sales recorded in each sale year.
train["YrSold"].value_counts()

In [None]:
# Number of sales recorded in each sale month.
train["MoSold"].value_counts()

In [None]:
# Show the two columns that together give the sale date (year and month).
train[["YrSold", "MoSold"]]

In [None]:
# Mean sale price per (year, month), with a first-of-month date for each group.
SalesXMonth = train.groupby(["YrSold", "MoSold"])[["SalePrice"]].mean()
first_of_month = [f'{year_month[0]}-{year_month[1]}-01' for year_month in SalesXMonth.index]
SalesXMonth['DateSale'] = pd.to_datetime(pd.Index(first_of_month))
# The (YrSold, MoSold) index is no longer needed once DateSale exists.
SalesXMonth = SalesXMonth.reset_index(drop=True)


In [None]:
# Locate the month with the highest average sale price.
max_index = SalesXMonth['SalePrice'].idxmax()
max_date = SalesXMonth.loc[max_index, 'DateSale']
max_price = SalesXMonth.loc[max_index, 'SalePrice']

fig, axes = plt.subplots(3, 1, figsize=(15, 24))
price_ax, acf_ax, pacf_ax = axes

# Panel 1: the monthly average price series with its peak marked.
price_ax.plot(SalesXMonth['DateSale'], SalesXMonth['SalePrice'])
price_ax.scatter(max_date, max_price, color='blue', label='Highest Price')
price_ax.set_xlabel('Sale Date')
price_ax.set_ylabel('Average Sale Price')
price_ax.set_title('Average Sale Price by Sale Date')
price_ax.tick_params(axis='x', rotation=45)
price_ax.legend()

# Panels 2 and 3: autocorrelation and partial autocorrelation up to lag 12.
correlograms = (
    (acf_ax, sm.graphics.tsa.plot_acf,
     'Autocorrelation', 'Autocorrelation of Average Sale Price'),
    (pacf_ax, sm.graphics.tsa.plot_pacf,
     'Partial Autocorrelation', 'Partial Autocorrelation of Average Sale Price'),
)
for panel, draw, y_label, title in correlograms:
    draw(SalesXMonth['SalePrice'], lags=12, ax=panel)
    panel.set_xlabel('Lag')
    panel.set_ylabel(y_label)
    panel.set_title(title)
    panel.tick_params(axis='x', rotation=45)

# Adjust the subplot layout and show the figure.
plt.tight_layout()
plt.show()



# Feature Engineering

In [None]:
# Garage features: car capacity times area, and garage area relative to living area.
train["GarageVolume"] = train["GarageCars"].mul(train["GarageArea"])
train["GarageAreaRatio"] = train["GarageArea"].div(train["GrLivArea"])

In [None]:
# Distribution of the ExterQual grades before they are encoded.
train["ExterQual"].value_counts()


In [None]:
# Replace the ExterQual grades with ordinal integers (Ex=4, Gd=3, TA=2, Fa=1), in place.
train.replace({"ExterQual": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1}}, inplace=True)


In [None]:
# Group by overall house quality (OverallQual) and compute the average
# exterior-material quality (ExterQual) of each group.
quality_data = train.groupby("OverallQual")["ExterQual"].mean()

# Draw the bar chart.
fig, ax = plt.subplots()
ax.bar(quality_data.index, quality_data.values)
ax.set_xlabel("OverallQual")
ax.set_ylabel("Average ExterQual")
ax.set_title("Relationship between OverallQual and ExterQual")
plt.show()


In [None]:
# OverallQual is divided by its maximum; ExterQual is min-max scaled.
overall_quality = train["OverallQual"]
train["OverallQual"] = overall_quality / overall_quality.max()

exterior_quality = train["ExterQual"]
exterior_low, exterior_high = exterior_quality.min(), exterior_quality.max()
train["ExterQual"] = (exterior_quality - exterior_low) / (exterior_high - exterior_low)

# Combined quality score: the plain average of the two rescaled columns.
train["TotalQuall"] = (train["OverallQual"] + train["ExterQual"]) / 2


In [None]:
# Replace the KitchenQual grades with ordinal integers (Ex=4 ... Fa=1), in place,
# then plot the mean sale price per encoded grade.
train.replace({"KitchenQual": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1}}, inplace=True)

plot_mean_saleprice("KitchenQual")

In [None]:
# Replace the HeatingQC grades with ordinal integers (Ex=4 ... Po=0), in place,
# then plot the mean sale price per encoded grade.
train.replace({"HeatingQC": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po" : 0}}, inplace=True)

plot_mean_saleprice("HeatingQC")

In [None]:
# Replace the GarageQual grades with ordinal integers (Ex=4 ... Po=0), in place,
# then plot the mean sale price per encoded grade.
train.replace({"GarageQual": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po" : 0}}, inplace=True)

plot_mean_saleprice("GarageQual")

In [None]:
# Replace the CentralAir flag with 1 (Y) / 0 (N), in place, then plot the mean
# sale price for each of the two values.
train.replace({"CentralAir": {"Y" : 1, "N" : 0}}, inplace=True)

plot_mean_saleprice("CentralAir")

In [None]:
# Heatmap of the 20 features most correlated with SalePrice, now including the
# engineered and ordinal-encoded columns.
# numeric_only=True is required: the frame still holds string columns and the
# categorical `price_range`, and DataFrame.corr() raises on them in pandas >= 2.0.
corrmat = train.corr(numeric_only=True).nlargest(20, "SalePrice")
# Transpose and keep the same 20 features on the other axis -> square matrix.
corrmat = corrmat.T.sort_values("SalePrice", ascending=False)[:20]
# Hide the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(corrmat))
plt.figure(figsize=(10, 10))
sns.heatmap(
    data=corrmat,
    mask=mask,
    annot=True,
    square=True,
    cmap="YlGnBu",
    annot_kws={"fontsize": 7},
)
plt.show()

# Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class Dataset:
    """Turn a raw House Prices frame into model-ready features.

    A training instance (``is_train=True``) fits every data-dependent statistic
    (quality scaling bounds, category codes, imputation medians and the feature
    scaler) on its own frame and remembers them.

    A non-training instance reuses the statistics of ``fitted_from`` or, when
    that is not given, of the most recently preprocessed training instance.
    Both frames therefore share one feature space: the same category gets the
    same code, and values are scaled with the same mean and variance the model
    was trained on. If no fitted training instance exists, the instance falls
    back to fitting on its own frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from the competition CSV.
    is_train : bool, default True
        Whether ``df`` holds the ``SalePrice`` target and should be fitted on.
    fitted_from : Dataset, optional
        Training instance whose fitted statistics a non-training instance
        should reuse.
    """

    # Columns removed before modelling (mostly-missing columns and the row id).
    DROPPED_COLUMNS = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage', "Id"]

    # Ordinal encodings of the graded columns.
    ORDINAL_MAPS = {
        "ExterQual": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1},
        "HeatingQC": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
        "GarageQual": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1, "Po": 0},
        "KitchenQual": {"Ex": 4, "Gd": 3, "TA": 2, "Fa": 1},
        "CentralAir": {"Y": 1, "N": 0},
    }

    # Most recently preprocessed training instance; default source of fitted
    # statistics for non-training instances.
    _last_train = None

    def __init__(self, df, is_train=True, fitted_from=None):
        self.df = df
        self.is_train = is_train
        self.fitted_from = fitted_from
        self.fitted_ = None           # statistics fitted by preprocess_data()
        self.scaler_ = None           # StandardScaler fitted by get_train_test_data()
        self.feature_columns_ = None  # feature order seen by scaler_

    def _fit_source(self):
        """Return the training instance whose statistics to reuse, or None to fit on ``self.df``."""
        if self.is_train:
            return None
        source = self.fitted_from if self.fitted_from is not None else Dataset._last_train
        if source is None or source.fitted_ is None:
            return None
        return source

    def preprocess_data(self):
        """Return a cleaned, fully numeric copy of ``self.df``.

        Steps: outlier filtering (training only), column removal, garage
        features, ordinal encoding, quality scaling, category encoding and
        median imputation. ``self.df`` itself is left untouched.

        Categories that are missing or were not seen when fitting are encoded
        as -1. Grades missing from ``ORDINAL_MAPS`` become NaN and are imputed.
        """
        processed_df = self.df.copy()

        if self.is_train:
            # Outlier filters apply to training rows only; every row of a
            # non-training frame must be kept so it can be predicted.
            keep = (
                (processed_df["SalePrice"] < 350000)
                & (processed_df["GarageArea"] <= 1000)
                & (processed_df["TotalBsmtSF"] <= 3000)
                & (processed_df["GrLivArea"] <= 3000)
                & (processed_df["LotArea"] <= 50000)
            )
            processed_df = processed_df[keep].copy()

        processed_df = processed_df.drop(self.DROPPED_COLUMNS, axis=1)
        processed_df["GarageVolume"] = processed_df["GarageCars"] * processed_df["GarageArea"]
        processed_df["GarageAreaRatio"] = processed_df["GarageArea"] / processed_df["GrLivArea"]

        for col, mapping in self.ORDINAL_MAPS.items():
            encoded = processed_df[col].map(lambda value, mapping=mapping: mapping.get(value, value))
            # Coerce to a numeric dtype regardless of how pandas stored the column.
            processed_df[col] = pd.to_numeric(encoded, errors="coerce")

        source = self._fit_source()
        fitting = source is None
        stats = {} if fitting else source.fitted_

        # Quality scaling uses the bounds of the fitted (training) frame.
        if fitting:
            stats["overall_qual_max"] = processed_df["OverallQual"].max()
            stats["exter_qual_min"] = processed_df["ExterQual"].min()
            stats["exter_qual_max"] = processed_df["ExterQual"].max()
        exter_span = stats["exter_qual_max"] - stats["exter_qual_min"]
        processed_df["OverallQual"] = processed_df["OverallQual"] / stats["overall_qual_max"]
        processed_df["ExterQual"] = (processed_df["ExterQual"] - stats["exter_qual_min"]) / exter_span
        processed_df["TotalQuall"] = (processed_df["OverallQual"] + processed_df["ExterQual"]) / 2

        # Category encoding: one fixed category list per column, so a category
        # receives the same integer code in every frame.
        categorical_cols = processed_df.select_dtypes(exclude=["number"]).columns
        if fitting:
            stats["categories"] = {
                col: pd.Categorical(processed_df[col]).categories for col in categorical_cols
            }
        for col in categorical_cols:
            # A column that was not categorical at fit time has no stored list;
            # its categories are then inferred from this frame.
            categories = stats["categories"].get(col)
            codes = pd.Categorical(processed_df[col], categories=categories).codes
            processed_df[col] = codes.astype("int64")

        # Median imputation with the medians of the fitted (training) frame.
        numeric_cols = processed_df.select_dtypes(include=["number"]).columns
        if fitting:
            stats["medians"] = processed_df[numeric_cols].median()
        processed_df[numeric_cols] = (
            processed_df[numeric_cols].fillna(stats["medians"]).astype("float64")
        )

        if fitting:
            self.fitted_ = stats
            if self.is_train:
                Dataset._last_train = self

        return processed_df

    def get_train_test_data(self):
        """Return scaled feature arrays ready for modelling.

        For a training instance: ``(X_train, X_test, y_train, y_test)`` from an
        80/20 split with ``random_state=42``; the scaler is fitted on
        ``X_train`` only and stored for reuse.

        For a non-training instance: the scaled feature matrix ``X``, scaled
        with the stored training scaler when one is available and with a
        scaler fitted on this frame otherwise.
        """
        processed_df = self.preprocess_data()

        if self.is_train:
            X = processed_df.drop('SalePrice', axis=1)
            y = processed_df['SalePrice']
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            self.scaler_ = scaler
            self.feature_columns_ = list(X.columns)

            return X_train, X_test, y_train, y_test

        source = self._fit_source()
        if source is None or source.scaler_ is None:
            # No fitted training scaler available: scale on this frame alone.
            return StandardScaler().fit_transform(processed_df)

        # Select the training feature order explicitly; a schema mismatch then
        # fails loudly with a KeyError instead of silently misaligning columns.
        return source.scaler_.transform(processed_df[source.feature_columns_])


In [None]:
# Re-read the raw training data and turn it into scaled train/validation arrays.
df_train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
dataset_train = Dataset(df_train, is_train=True)
X_train, X_test, y_train, y_test = dataset_train.get_train_test_data()

# Report the shape of each returned part.
for caption, part in (
    ("Dimensiones de X_train:", X_train),
    ("Dimensiones de X_test:", X_test),
    ("Dimensiones de y_train:", y_train),
    ("Dimensiones de y_test:", y_test),
):
    print(caption, part.shape)


# XGBoost Regressor

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Train the XGB model

def xgb_train(X_train, X_test, y_train=None, y_test=None):
    """Fit an XGBoost regressor with early stopping and print the validation RMSE.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and validation feature matrices.
    y_train, y_test : array-like, optional
        Targets matching ``X_train`` and ``X_test``. When omitted, the
        notebook-level ``y_train`` / ``y_test`` variables are used, so existing
        two-argument calls keep working.

    Returns
    -------
    XGBRegressor
        The fitted model.

    Notes
    -----
    The validation set is also the early-stopping evaluation set, so the
    printed RMSE is measured on data that influenced where training stopped.
    """
    if y_train is None:
        y_train = globals()["y_train"]
    if y_test is None:
        y_test = globals()["y_test"]

    model_params = {
        'objective': 'reg:squarederror',
        # XGBoost names this option `booster`; `boosting_type` is the LightGBM
        # spelling and is not an XGBoost parameter.
        'booster': 'gbtree',
        'learning_rate': 0.1,
        'n_estimators': 500,
        'max_depth': 3,
        # Untuned candidates kept for reference:
        #     'min_child_weight': 1,
        #     'subsample': 0.8,
        #     'colsample_bytree': 0.8,
        'verbosity': 0,
        'early_stopping_rounds': 100,
    }
    xgb_model = XGBRegressor(**model_params)
    xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0)

    # Predict on the validation set
    y_pred = xgb_model.predict(X_test)

    # Evaluate the model using root mean squared error
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print("Root Mean Squared Error: ", rmse)
    return xgb_model

# Fit the model on the training split and report RMSE on the validation split.
xgb_model = xgb_train(X_train,X_test)
    

In [None]:
# Load the raw competition test set and the sample submission file.
df_test, submission = (
    pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv"),
    pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv"),
)

In [None]:
# Build the scaled feature matrix for the competition test set
# (with is_train=False only the features are returned).
dataset_test = Dataset(df_test, is_train=False)
# NOTE(review): this rebinds X_test, which until now held the validation
# features returned together with y_test. After this cell X_test and y_test no
# longer describe the same rows, so re-running the training cell would fail.
X_test = dataset_test.get_train_test_data()

print("Dimensiones de X_test:", X_test.shape)


In [None]:
# Predict with the fitted XGBoost model on the competition test features
# (X_test was rebound to the test-set matrix in the previous cell).
baseline_test_pred = xgb_model.predict(X_test)