In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Load the train and test datasets from their Google Drive share links.
# The file id is the second-to-last "/"-separated piece of a share link; it is
# appended to the direct-download endpoint so pandas can read the CSV by URL.
url = "https://drive.google.com/file/d/1I4i1QbDDPm0F1kKpPUWeZDzRT83JqpfI/view?usp=sharing"
url_pieces = url.split('/')
path = 'https://drive.google.com/uc?export=download&id=' + url_pieces[-2]
df = pd.read_csv(path)
train_df = df

url = "https://drive.google.com/file/d/1AVS69qeJQmAjXO7a73DzQwyKBzMkcCHW/view?usp=sharing"
url_pieces = url.split('/')
path = 'https://drive.google.com/uc?export=download&id=' + url_pieces[-2]
df = pd.read_csv(path)  # note: `df` now refers to the test frame
test_df = df


In [None]:
# Preview the training frame as loaded; at this point it still contains the
# SalePrice target column (last column in the rendered table).
train_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [None]:
# Separate the target variable
y_train = train_df['SalePrice'].values

# Build the feature frame WITHOUT mutating train_df: an in-place drop would
# make this cell fail on a second run (the 'SalePrice' lookup above would raise
# KeyError) and would silently change the frame previewed earlier.
train_features = train_df.drop(columns='SalePrice')

# Combine the train and test data for preprocessing (train rows first, so the
# first len(train_df) rows of combined_df are the training rows).
combined_df = pd.concat([train_features, test_df])

# Define a function for feature engineering
def engineer_features(df):
    """Return a new DataFrame equal to ``df`` plus three derived columns.

    The input frame is not modified; the derived columns are appended to a
    copy (or overwrite same-named columns in that copy).

    Added columns:
        TotalSF: ``TotalBsmtSF + 1stFlrSF + 2ndFlrSF``.
        TotalBathrooms: plain sum of ``FullBath``, ``HalfBath``,
            ``BsmtFullBath`` and ``BsmtHalfBath`` (a half bath counts as 1).
        Age: ``YrSold - YearBuilt``.

    Parameters:
        df: DataFrame containing all of the source columns named above.

    Returns:
        A new DataFrame with the derived columns added.

    Raises:
        KeyError: if any of the source columns is missing from ``df``.
    """
    # DataFrame.assign works on a copy, so the caller's frame stays untouched
    # and re-running a notebook cell that calls this function is idempotent.
    return df.assign(
        TotalSF=df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF'],
        TotalBathrooms=(
            df['FullBath'] + df['HalfBath'] + df['BsmtFullBath'] + df['BsmtHalfBath']
        ),
        Age=df['YrSold'] - df['YearBuilt'],
    )

combined_df = engineer_features(combined_df)

# Remember the pre-encoding labels: a column that keeps its label through
# get_dummies was left as-is by the encoder (i.e. it is not an indicator column).
labels_before_encoding = set(combined_df.columns)

# One-hot encode the categorical data
combined_df = pd.get_dummies(combined_df)

# Positions of the non-indicator columns, derived from the data instead of a
# hard-coded slice, so the scaler keeps targeting the right columns if the
# feature set changes.
# NOTE(review): row-identifier columns such as 'Id' / 'Unnamed: 0' (visible in
# the train_df preview) are still part of the feature matrix - consider
# dropping them before modelling.
numeric_positions = [
    position
    for position, label in enumerate(combined_df.columns)
    if label in labels_before_encoding
]

# Split the data back into train and test sets. Missing values are kept as NaN
# here on purpose; they are imputed inside the pipelines below.
feature_matrix = combined_df.to_numpy(dtype=float)
X_train = feature_matrix[:len(train_df)]
X_test = feature_matrix[len(train_df):]

# Impute missing values as the first pipeline step, so the column means are
# learned only from the rows a model is fitted on (the training folds during
# cross-validation) and never from validation or test rows.
imputer = SimpleImputer(strategy='mean')

# Define the preprocessing steps: scale the non-indicator columns, pass the
# one-hot indicator columns through unchanged.
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_positions),
    remainder='passthrough'
)

# Define the models
models = [
    make_pipeline(imputer, preprocessor, Ridge(alpha=10)),
    make_pipeline(imputer, preprocessor, RandomForestRegressor(n_estimators=100, max_depth=20, random_state=0))
]

# Evaluate models using cross-validation
cv_rmse = []
for model in models:
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    # 'neg_mean_squared_error' yields negated MSE per fold; flip the sign once,
    # take the root per fold, then average. The result is a positive RMSE.
    rmse = np.sqrt(-scores).mean()
    cv_rmse.append(rmse)
    print(f'{model.steps[-1][0]} RMSE: {rmse:.2f}')

# Choose the best model: the one with the lowest cross-validated RMSE
best_model = models[int(np.argmin(cv_rmse))]

# Fit the best model and make predictions
best_model.fit(X_train, y_train)
predictions = best_model.predict(X_test)

# Save the predictions
output_df = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': predictions})
output_df.to_csv('submission.csv', index=False)

ridge RMSE: -31164.72
randomforestregressor RMSE: -29787.20
