- [House Prices - Advanced Regression Techniques - Kaggle page](https://www.kaggle.com/c/house-prices-advanced-regression-techniques)

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Never truncate the column list when a wide DataFrame is displayed.
pd.options.display.max_columns = None

# Folder holding the competition files, and the training CSV inside it.
DATA_DIR = Path('../data/house-prices')
filepath = DATA_DIR.joinpath('train.csv')

# Name of the column the model is trained to predict.
target_column = 'SalePrice'

# Dataset

- Download the dataset from Kaggle and store it in the data folder

In [None]:
!kaggle competitions download -c house-prices-advanced-regression-techniques
!mkdir ../data/house-prices
!unzip -o house-prices-advanced-regression-techniques.zip -d ../data/house-prices
!rm house-prices-advanced-regression-techniques.zip

- Read dataset csv file

In [None]:
# Load the training CSV and use its `Id` column as the row index.
df_master = pd.read_csv(filepath).set_index('Id')
df_master.head(n=5)

- Read dataset description

In [None]:
# Read the column description file stored in DATA_DIR and print it in full.
description_path = DATA_DIR / 'data_description.txt'
desc = description_path.read_text()
print(desc)

# Preprocessing

In [None]:
# Work on a deep copy so the frame loaded from disk (`df_master`) stays untouched.
df = df_master.copy(deep=True)

## Continuous data

In [None]:
# Preview the numeric columns only.
df.select_dtypes(include=np.number)

In [None]:
# Replace missing values with 0 in every numeric column, in one vectorized step.
numeric_columns = df.select_dtypes(include='number').columns
df[numeric_columns] = df[numeric_columns].fillna(0)

- Check if some numerical data are really categorical data **TODO**

In [None]:
# TODO

## Categorical data

### Get columns with categorical features

- Get columns with object type

In [None]:
# Preview the columns stored with the `object` dtype.
df.select_dtypes(include=['object'])

In [None]:
# Group the column names by the dtype pandas assigned to each column.
columns_as_series = df.columns.to_series()
column_type_dict = columns_as_series.groupby(df.dtypes).groups
print(column_type_dict.keys())
column_type_dict

- Check if object type columns are really categorical variables

In [None]:
# For each object-typed column, print its number of distinct values, the
# distinct values themselves, and how often each value occurs.
df_objects = df.select_dtypes(include='object')
for column_name, column_values in df_objects.items():
    distinct_values = column_values.unique()
    print('\n', column_name, '='*10, len(distinct_values))
    print(distinct_values)
    print(column_values.value_counts().to_dict())

In [None]:
# Names of all object-typed columns, treated as categorical from here on.
categorical_columns = df.select_dtypes(include='object').columns.tolist()
# Show how many there are, plus the first ten names as a sanity check.
len(categorical_columns), categorical_columns[:10]

### Impute missing values

- Fill the missing values (NaN) of the categorical columns

In [None]:
# Give missing categorical values an explicit label of their own.
# The previous `fillna(np.nan)` replaced NaN with NaN and changed nothing, so
# missing entries were only encoded implicitly later on (an all-zero row of
# dummies). With a dedicated label they become a visible category.
MISSING_CATEGORY = 'Missing'
df[categorical_columns] = df[categorical_columns].fillna(MISSING_CATEGORY)

### Dummify columns

In [None]:
# One-hot encode the categorical columns; all other columns pass through unchanged.
df_dummified = pd.get_dummies(data=df, columns=categorical_columns)
df_dummified.head(n=5)

# Feature engineering

## Split dataset in train and test sets

In [None]:
# Frame used for modelling. This is a plain alias of `df_dummified`, not a copy,
# so both names refer to the same object.
final_df = df_dummified

In [None]:
# Features are every column except the target; the target is kept as its own Series.
X = final_df.drop(columns=[target_column])
y = final_df[target_column]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffled
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
X_train.shape, y_test.shape

# Modeling

In [None]:
from sklearn.compose import TransformedTargetRegressor
# Not used below: LogisticRegression is a classifier and does not fit a
# continuous target such as the sale price.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# SVR is sensitive to scale. On unscaled features the kernel is dominated by the
# columns with the largest magnitudes, and with raw prices as target an
# epsilon of 0.2 is negligible, so the fit degenerates.
#  - StandardScaler standardizes every feature, using statistics from the
#    training data only.
#  - The target is trained in log1p space and predictions are mapped back with
#    expm1. This matches the RMSLE metric used for evaluation and keeps every
#    prediction above -1, the domain of log1p.
# `C` and `epsilon` keep their previous values but now apply in log-price units.
model = TransformedTargetRegressor(
    regressor=make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2)),
    func=np.log1p,
    inverse_func=np.expm1,
)
model.fit(X_train, y_train)

# Evaluation

In [None]:
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.
        precision: Number of decimal places kept in the result.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimals.
    """
    squared_log_error = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(squared_log_error), precision)

In [None]:
# Predict on the held-out rows and score the predictions with RMSLE.
y_pred = model.predict(X_test)
compute_rmsle(y_test=y_test, y_pred=y_pred)

# Inference

# Submission
- [Kaggle API usage](https://www.kaggle.com/docs/api)

In [None]:
# Submit a CSV to the competition with the message "my_submission".
# NOTE(review): the file passed with -f is `sample_submission.csv`, presumably the
# sample file from the competition archive, not predictions produced by `model`.
# The Inference section above is still empty, so no prediction file is written.
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f ../data/house-prices/sample_submission.csv -m my_submission

In [None]:
# List the submissions made to this competition through the Kaggle CLI.
!kaggle competitions submissions -c house-prices-advanced-regression-techniques