In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

In [13]:
# Kaggle training data
path = "kaggle_train.csv"
train = pd.read_csv(path)

In [14]:
# Kaggle test/validation data
path = "kaggle_test.csv"
val_data_full = pd.read_csv(path)

In [15]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Remove the Id column from both datasets

In [16]:
train = train.drop("Id", axis=1)
val_data = val_data_full.drop("Id", axis=1)

In [17]:
X = train.drop("SalePrice", axis=1)
y = train["SalePrice"]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Add two pipelines, categorical and numerical

In [19]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 618 to 684
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1168 non-null   int64  
 1   MSZoning       1168 non-null   object 
 2   LotFrontage    956 non-null    float64
 3   LotArea        1168 non-null   int64  
 4   Street         1168 non-null   object 
 5   Alley          71 non-null     object 
 6   LotShape       1168 non-null   object 
 7   LandContour    1168 non-null   object 
 8   Utilities      1168 non-null   object 
 9   LotConfig      1168 non-null   object 
 10  LandSlope      1168 non-null   object 
 11  Neighborhood   1168 non-null   object 
 12  Condition1     1168 non-null   object 
 13  Condition2     1168 non-null   object 
 14  BldgType       1168 non-null   object 
 15  HouseStyle     1168 non-null   object 
 16  OverallQual    1168 non-null   int64  
 17  OverallCond    1168 non-null   int64  
 18  YearBui

In [20]:
X_num = X.select_dtypes(include="number").columns
X_cat = X.select_dtypes(exclude="number").columns

In [21]:
X_num

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [22]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [23]:
# define the two pipelines
num_pipe = make_pipeline(SimpleImputer(strategy="mean"))
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
    )

In [24]:
# bring both pipelines together
preprocessor = ColumnTransformer(transformers=[("num", num_pipe, X_num), ("cat", cat_pipe, X_cat)])

In [25]:
preprocessor

In [26]:
model = RandomForestRegressor()

In [27]:
pipeline = make_pipeline(preprocessor, model)

In [28]:
pipeline

In [29]:
pipeline.fit(X_train, y_train)

In [30]:
prediction_train = pipeline.predict(X_train)

In [31]:
prediction_train

array([323525.97, 107193.3 , 151872.  , ..., 122836.86, 216781.29,
       225653.16])

In [32]:
prediction_test = pipeline.predict(X_test)



In [33]:
prediction_test

array([219059.55, 148472.67, 106466.42, 221371.28,  91612.62, 109343.56,
       259796.59, 123516.5 , 470727.23, 156558.84, 203929.46, 146241.83,
       223470.15, 111819.76, 124883.13, 147230.75, 231024.28, 119221.5 ,
       144511.93, 193929.49, 132003.91, 141952.93, 105901.4 , 165438.21,
       177340.  , 223997.59, 171088.64,  85820.87, 346966.06, 116028.  ,
       136305.29, 195551.76, 141015.7 , 292829.97, 328492.03, 184272.14,
       294384.44, 125078.64, 238251.29, 303189.19, 209351.39, 125868.14,
       181993.76, 292118.82, 335176.12, 144065.38, 125270.53, 125638.43,
       169665.  , 105821.88, 379796.79, 143542.1 , 168593.01,  91526.06,
       249218.81, 113662.21, 138777.92, 241153.75, 140353.08, 104191.74,
       144369.6 , 133352.5 , 144713.5 , 151897.  , 195368.78, 155369.  ,
       129099.16, 210129.38, 125889.57, 188960.58, 180213.59, 120272.15,
        87023.65, 226442.9 ,  96329.34, 260448.33, 131479.  , 107940.37,
       277787.1 , 154176.34, 137542.46, 130289.33, 

In [34]:
performance_train = r2_score(y_true = y_train, y_pred=prediction_train)

In [35]:
performance_train

0.981096396701848

In [36]:
performance = r2_score(y_test, prediction_test)

In [37]:
performance

0.8280608219573444

Make a prediction with the validation data from kaggle

In [38]:
prediction_val = pipeline.predict(val_data)



In [39]:
prediction_val

array([124848.66, 152454.  , 181541.4 , ..., 150647.61, 111401.13,
       226716.87])

Add the predictions as a new column to the full validations df

In [40]:
val_data_full["SalePrice"] = prediction_val

In [41]:
submission = val_data_full[["Id", "SalePrice"]]

In [42]:
submission

Unnamed: 0,Id,SalePrice
0,1461,124848.66
1,1462,152454.00
2,1463,181541.40
3,1464,183202.90
4,1465,197859.00
...,...,...
1454,2915,87940.00
1455,2916,86217.58
1456,2917,150647.61
1457,2918,111401.13


In [43]:
submission.to_csv("submission.csv", index=False)

In [104]:
# only needed for csv downloads from google colab
#from google.colab import files
#files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>