In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

In [2]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

SEED = 42

In [3]:
df_train.columns

Index(['address', 'sold_price', 'summary', 'type', 'year_built', 'heating',
       'cooling', 'parking', 'bedrooms', 'bathrooms', 'full_bathrooms',
       'total_interior_livable_area', 'total_spaces', 'garage_spaces',
       'region', 'elementary_school', 'elementary_school_score',
       'elementary_school_distance', 'middle_school', 'middle_school_score',
       'middle_school_distance', 'high_school', 'high_school_score',
       'high_school_distance', 'flooring', 'heating_features',
       'cooling_features', 'appliances_included', 'laundry_features',
       'parking_features', 'tax_assessed_value', 'annual_tax_amount',
       'listed_on', 'listed_price', 'last_sold_on', 'last_sold_price', 'city',
       'zip', 'state', 'id'],
      dtype='object')

In [4]:
numeric_cols = [
    'bathrooms', 'full_bathrooms', 'total_interior_livable_area', 'total_spaces', 'garage_spaces', 
    'elementary_school_score', 'elementary_school_distance', 'middle_school_score', 'middle_school_distance', 
    'high_school_score', 'high_school_distance', 'tax_assessed_value', 'listed_price', 
    'last_sold_price', 'year_built', 'annual_tax_amount'
]
cat_cols = [
    'flooring', 'high_school', 'parking', 'zip', 'cooling', 'region', 'bedrooms', 'cooling_features', 'city', 'heating_features', 'middle_school', 'type', 'laundry_features', 'state', 'heating', 'elementary_school', 'appliances_included', 'parking_features'
]

target_cols = ['sold_price']
id_cols = ['id']
text_cols = list(set(df_train.columns) - set(numeric_cols) - set(cat_cols) - set(target_cols) - set(id_cols))

# Sample

Start with numeric columns only:

In [5]:
train_num_df = df_train[numeric_cols].fillna(df_train[numeric_cols].mean(axis=0))
test_num_df = df_test[numeric_cols].fillna(df_test[numeric_cols].mean(axis=0))

X_train_num, Y_train = train_num_df.values, df_train[target_cols].values
X_test_num = test_num_df.values

Scale to succeed:

In [6]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train_num)
X_test = sc.fit_transform(X_test_num)

One-hot encode cities:

In [7]:
cities_train = df_train[["city"]].astype(str)
cities_test = df_test[["city"]].astype(str)
all_cities = pd.concat([cities_train, cities_test])

ohe_cities = pd.get_dummies(all_cities[["city"]])
train_ohe_cities = ohe_cities[:len(cities_train)]
test_ohe_cities = ohe_cities[len(cities_train):]

X_train_city = np.hstack([X_train, train_ohe_cities.values])
X_test_city = np.hstack([X_test, test_ohe_cities.values])

Let's pick a linear model:

In [8]:
lr_model = Lasso(alpha=1e-5, max_iter=1e4, random_state=SEED)
print(-np.mean(cross_val_score(lr_model, X_train_city, Y_train, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)))

0.2526204147091029


In [9]:
lr_model.fit(X_train_city, Y_train)

Y_train_pred = lr_model.predict(X_train_city)
np.sqrt(mean_squared_error(Y_train, Y_train_pred))

0.23905671831363895

In [10]:
Y_pred = lr_model.predict(X_test_city)

df_submit = df_test[["id"]].copy()
df_submit.loc[:, "sold_price"] = Y_pred
df_submit.to_csv("sample_submission.csv", index=False)

# Medium Baseline

In [11]:
X_train = df_train.drop(target_cols + id_cols + text_cols, axis=1)
X_train[numeric_cols] = X_train[numeric_cols].fillna(X_train[numeric_cols].median(axis=0))
X_train[cat_cols] = X_train[cat_cols].fillna('')

Y_train = df_train[target_cols]

X_test = df_test.drop(id_cols + text_cols, axis=1)
X_test[numeric_cols] = X_test[numeric_cols].fillna(X_test[numeric_cols].median(axis=0))
X_test[cat_cols] = X_test[cat_cols].fillna('')

categorical_features_indices = np.where(X_train.dtypes != float)[0]

In [15]:
cbr = CatBoostRegressor(random_seed=SEED)
cbr.fit(X_train, Y_train, cat_features=categorical_features_indices)

Learning rate set to 0.072727
0:	learn: 0.7469509	total: 130ms	remaining: 2m 10s
1:	learn: 0.7028942	total: 412ms	remaining: 3m 25s
2:	learn: 0.6609596	total: 724ms	remaining: 4m
3:	learn: 0.6223374	total: 911ms	remaining: 3m 46s
4:	learn: 0.5863194	total: 1.12s	remaining: 3m 42s
5:	learn: 0.5539612	total: 1.34s	remaining: 3m 41s
6:	learn: 0.5234996	total: 1.62s	remaining: 3m 50s
7:	learn: 0.4953340	total: 1.86s	remaining: 3m 50s
8:	learn: 0.4694933	total: 2.08s	remaining: 3m 49s
9:	learn: 0.4467373	total: 2.29s	remaining: 3m 46s
10:	learn: 0.4247114	total: 2.48s	remaining: 3m 43s
11:	learn: 0.4051879	total: 2.65s	remaining: 3m 38s
12:	learn: 0.3864454	total: 2.81s	remaining: 3m 33s
13:	learn: 0.3699470	total: 3.01s	remaining: 3m 32s
14:	learn: 0.3548487	total: 3.17s	remaining: 3m 28s
15:	learn: 0.3411682	total: 3.34s	remaining: 3m 25s
16:	learn: 0.3293353	total: 3.51s	remaining: 3m 22s
17:	learn: 0.3178082	total: 3.71s	remaining: 3m 22s
18:	learn: 0.3077495	total: 3.86s	remaining: 3m 

<catboost.core.CatBoostRegressor at 0x18845bedaf0>

In [16]:
Y_pred = cbr.predict(X_test)
df_submit = df_test[["id"]].copy()
df_submit.loc[:, "sold_price"] = Y_pred
df_submit.to_csv("medium_baseline_submission.csv", index=False)