In [1]:
import pandas as pd

# Read both competition tables from the Kaggle input mount, then preview
# the first rows of the training table.
train, test = map(
    pd.read_csv,
    (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    ),
)

train.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
# ============================
# 1. IMPORT LIBRARIES
# ============================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor


# ============================
# 2. LOAD DATA
# ============================
# Both CSVs are read fresh so this cell does not depend on earlier cells.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/house-prices-advanced-regression-techniques/train.csv",
        "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
    )
)

# Keep the test Id column aside before any preprocessing touches `test`.
test_ID = test["Id"]

# ============================
# 3. SEPARATE TARGET VARIABLE
# ============================
# `y` holds the regression target; `train` keeps only the feature columns.
y = train["SalePrice"]
train = train.drop(columns=["SalePrice"])


# ============================
# 4. HANDLE MISSING VALUES
# (fill values learned on train, applied to train & test)
# ============================
# Medians and modes are computed from the training frame only and reused for
# the test frame, so both frames are imputed with the same constants. Filling
# the test frame from its own statistics would preprocess it differently from
# the data the model is fitted on.
# NOTE(review): in some columns a missing value may mean "feature absent"
# rather than "unknown" -- confirm against the data description before
# relying on mode-filling for those.

# Numeric columns
train_num = train.select_dtypes(include=['int64', 'float64']).columns
test_num = test.select_dtypes(include=['int64', 'float64']).columns

# Per-column medians of the training data (pandas skips NaN by default).
num_fill = train[train_num].median()

for col in train_num:
    train[col] = train[col].fillna(num_fill[col])

for col in test_num:
    fill_value = num_fill.get(col)
    # Fall back to the test column's own median only when train offers no
    # usable statistic (column not numeric in train, or entirely NaN there).
    if fill_value is None or pd.isna(fill_value):
        fill_value = test[col].median()
    test[col] = test[col].fillna(fill_value)

# Categorical columns
train_cat = train.select_dtypes(include=['object']).columns
test_cat = test.select_dtypes(include=['object']).columns

# Most frequent training value per categorical column.
cat_fill = {}
for col in train_cat:
    modes = train[col].mode()
    if modes.empty:
        # An all-NaN column has no mode; leave it untouched instead of raising.
        continue
    cat_fill[col] = modes.iloc[0]
    train[col] = train[col].fillna(cat_fill[col])

for col in test_cat:
    if col in cat_fill:
        fill_value = cat_fill[col]
    else:
        # No training statistic for this column: use the test column's own
        # mode if it has one, otherwise leave the column as it is.
        modes = test[col].mode()
        if modes.empty:
            continue
        fill_value = modes.iloc[0]
    test[col] = test[col].fillna(fill_value)


# ============================
# 5. ONE HOT ENCODING (Same for train & test)
# ============================
# Object-dtype columns are one-hot encoded; numeric columns pass through
# unchanged. handle_unknown='ignore' keeps predict from raising on
# categories that were not present when the encoder was fitted.
cat_cols = train.select_dtypes(include=['object']).columns

# "Id" is a row identifier rather than a property of the house, so it is
# left out of the model inputs. Columns not listed in any transformer are
# dropped by ColumnTransformer (its default is remainder='drop').
num_cols = train.select_dtypes(include=['int64', 'float64']).columns.drop(
    'Id', errors='ignore'
)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols)
    ]
)

# ============================
# 6. SPLIT TRAIN FOR VALIDATION
# ============================
# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(train, y, random_state=42, test_size=0.2)

# ============================
# 7. BUILD PIPELINE WITH XGBOOST
# ============================
# Two stages: the column preprocessor, then a gradient-boosted tree regressor.
model = Pipeline(
    [
        ("preprocess", preprocessor),
        (
            "xgb",
            XGBRegressor(
                random_state=42,
                n_estimators=700,
                max_depth=4,
                learning_rate=0.05,
                colsample_bytree=0.8,
                subsample=0.8,
            ),
        ),
    ]
)

# ============================
# 8. TRAIN MODEL
# ============================
model.fit(X_train, y_train)

# Score the fitted pipeline on the hold-out split so the validation data is
# actually used: report root-mean-squared error in the target's own units.
val_predictions = model.predict(X_val)
val_rmse = float(np.sqrt(np.mean((np.asarray(y_val) - val_predictions) ** 2)))
print(f"Validation RMSE: {val_rmse:,.2f}")

# ============================
# 9. PREDICT ON TEST DATA
# ============================
# The pipeline applies its own preprocessing to the raw test frame.
final_predictions = model.predict(test)

# ============================
# 10. SAVE SUBMISSION
# ============================
# Two-column table: the saved test Ids next to their predicted prices.
submission = test_ID.to_frame(name="Id").assign(SalePrice=final_predictions)

# Written without the index so the file has exactly the two columns above.
submission.to_csv("submission.csv", index=False)

print("✔ submission.csv created successfully!")


✔ submission.csv created successfully!
