In [5]:
import pandas as pd

# Load the training and test tables, using the 'Id' column as the row index
train_raw = pd.read_csv('melbourne_train.csv', index_col='Id')
X_test_full = pd.read_csv('melbourne_test.csv', index_col='Id')

# Rows without a target value cannot be used for supervised training
train_labeled = train_raw.dropna(axis=0, subset=['SalePrice'])

# Separate the target from the predictors without mutating the loaded frame,
# so re-reading this cell never leaves a half-modified DataFrame behind
y = train_labeled['SalePrice']
X_full = train_labeled.drop(columns=['SalePrice'])


In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [14]:
def _is_low_cardinality_text(series):
    """Return True for an object-dtype column with fewer than 10 distinct values."""
    return series.nunique() < 10 and series.dtype == 'object'


def _is_numeric(series):
    """Return True for a column stored as int64 or float64."""
    return series.dtype in ['int64', 'float64']


# Text columns with few distinct values
categorical_cols = [
    name for name in X_train_full.columns
    if _is_low_cardinality_text(X_train_full[name])
]

# Integer and floating-point columns
numerical_cols = [
    name for name in X_train_full.columns
    if _is_numeric(X_train_full[name])
]


In [19]:
# Selected feature names: categorical columns first, then numerical ones
my_cols = [*categorical_cols, *numerical_cols]
len(my_cols)

76

In [24]:
# Restrict every split to the selected columns; .copy() gives independent frames
X_train, X_valid, X_test = (
    frame[my_cols].copy()
    for frame in (X_train_full, X_valid_full, X_test_full)
)

In [33]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Preprocessing for numerical data: replace missing values with a constant.
# fill_value=0 is what SimpleImputer already uses for numeric input when
# strategy='constant' and no fill_value is given; it is spelled out here so
# the imputed value is visible to the reader.
numerical_transformer = SimpleImputer(strategy='constant', fill_value=0)

# Backward-compatible alias for the original, misspelled name
numerical_transfomer = numerical_transformer

# Preprocessing for categorical data: fill gaps with the most frequent value,
# then one-hot encode. handle_unknown='ignore' keeps transform() from failing
# on categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle both preprocessing branches, each applied to its own list of columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [34]:
# define model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=100,  # number of trees in the forest
    random_state=0,    # fixed seed so repeated runs build the same forest
)

In [35]:
from sklearn.metrics import mean_absolute_error

# Chain the preprocessing and the regressor into a single estimator
pipeline_steps = [('preprocessor', preprocessor), ('model', model)]
my_pipeline = Pipeline(steps=pipeline_steps)

# Fit the whole chain on the training split
my_pipeline.fit(X_train, y_train)

# Run the validation split through the fitted chain to get predictions
preds = my_pipeline.predict(X_valid)

# Report the mean absolute error on the validation split
score = mean_absolute_error(y_true=y_valid, y_pred=preds)
print('MAE', score)

MAE 17861.780102739725


In [80]:
# Per-column check: True where the column's dtype is int64
X_full.dtypes == 'int64'

False

In [44]:
# Distinct values found in the GarageYrBlt column
X_full['GarageYrBlt'].unique()

array([2003., 1976., 2001., 1998., 2000., 1993., 2004., 1973., 1931.,
       1939., 1965., 2005., 1962., 2006., 1960., 1991., 1970., 1967.,
       1958., 1930., 2002., 1968., 2007., 2008., 1957., 1920., 1966.,
       1959., 1995., 1954., 1953.,   nan, 1983., 1977., 1997., 1985.,
       1963., 1981., 1964., 1999., 1935., 1990., 1945., 1987., 1989.,
       1915., 1956., 1948., 1974., 2009., 1950., 1961., 1921., 1900.,
       1979., 1951., 1969., 1936., 1975., 1971., 1923., 1984., 1926.,
       1955., 1986., 1988., 1916., 1932., 1972., 1918., 1980., 1924.,
       1996., 1940., 1949., 1994., 1910., 1978., 1982., 1992., 1925.,
       1941., 2010., 1927., 1947., 1937., 1942., 1938., 1952., 1928.,
       1922., 1934., 1906., 1914., 1946., 1908., 1929., 1933.])

In [78]:
import numpy as np
# Show the rows whose GarageYrBlt equals 1914
X_full.loc[X_full['GarageYrBlt'] == 1914]

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
736,75,RM,60.0,10800,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,10,2006,WD,Normal
1236,70,RL,96.0,13132,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,7,2006,WD,Normal
