In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data exploration

## Stratified sampling?

In [4]:
# Load the competition tables. For train/test the "Id" column becomes the row index.
train = pd.read_csv(
    "../input/house-prices-advanced-regression-techniques/train.csv",
    index_col='Id',
)
test = pd.read_csv(
    "../input/house-prices-advanced-regression-techniques/test.csv",
    index_col='Id',
)

# The sample submission keeps its default integer index.
sample_submission = pd.read_csv(
    "../input/house-prices-advanced-regression-techniques/sample_submission.csv"
)

# Keep the column documentation as a list of raw text lines (newlines included).
with open('../input/house-prices-advanced-regression-techniques/data_description.txt') as f:
    data_description = f.readlines()

ValueError: Index Id invalid

In [9]:
# Persist working copies of the inputs in the current directory.
# `train` and `test` were loaded with index_col='Id', so the Id lives in the
# index: write the index out, otherwise the Id column is dropped from the
# saved files.
train.to_csv('train.csv')
test.to_csv('test.csv')

# sample_submission was read without index_col; its index is not data.
sample_submission.to_csv('sample_submission.csv', index=False)

# Lines returned by readlines() already end with their own newline, so they are
# written back unchanged (appending "\n" would double every line break).
# The context manager closes the file even if a write fails.
with open("data_description.txt", "w") as textfile:
    textfile.writelines(data_description)

In [3]:
train.head()

In [4]:
train.info()

In [5]:
test.info()

In [6]:
train.describe()

In [7]:
test.describe()

In [8]:
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name without extension; also echoed in the progress message.
    tight_layout : bool
        When true, plt.tight_layout() is called before saving.
    fig_extension : str
        File extension, also passed to savefig as the output format.
    resolution : int
        Passed to savefig as the dpi.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
train.hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()

In [10]:
# Sale price per neighbourhood. The scatter call sets no series label, so there
# is nothing for a legend to display; plt.legend() is therefore not called
# (it would only emit a "no labelled artists" warning).
train.plot(kind="scatter", figsize=(30,20), x="Neighborhood", y="SalePrice", alpha=0.4)
save_fig("housing_prices_scatterplot")

In [11]:
# Pairwise correlations between the numeric columns only. The frame also holds
# string-valued columns (e.g. "Neighborhood"); DataFrame.corr() raises on those
# in pandas >= 2.0, so they are filtered out explicitly first.
corr_matrix = train.select_dtypes(include=[np.number]).corr()

In [12]:
corr_matrix["SalePrice"].sort_values(ascending=False)

In [13]:
# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas
from pandas.plotting import scatter_matrix

attributes = ["SalePrice", "OverallQual", "GrLivArea",
              "GarageCars", "GarageArea", "TotalBsmtSF",
              "1stFlrSF", "FullBath","TotRmsAbvGrd", "YearBuilt"]
scatter_matrix(train[attributes], figsize=(20, 10))
save_fig("scatter_matrix_plot")

# Dataset split

In [14]:
housing = train.copy()

In [15]:
housing_tr = housing.copy()


In [16]:
housing_tr.head()

In [17]:
housing_labels = housing_tr["SalePrice"].copy()
housing_tr = housing_tr.drop("SalePrice", axis=1) # drop labels for training set


## NA check

In [18]:
sample_incomplete_rows = housing_tr[housing_tr.isnull().any(axis=1)]
sample_incomplete_rows.info()

In [19]:
housing_num = housing_tr.select_dtypes(include=[np.number])

In [20]:
housing_cat = housing_tr.select_dtypes(include=['object'])

In [21]:
housing_num.head()

In [22]:
housing_cat.head()

In [23]:
housing_cat.columns.values

In [24]:
list(housing_cat)

# Transformation Pipelines

In [25]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")

In [26]:
imputer.fit(housing_num)

In [27]:
imputer.statistics_

In [28]:
housing_num.median().values

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill missing values with the column median, then standardise.
num_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [30]:
from sklearn.preprocessing import OneHotEncoder
# Categorical branch: fill missing entries with a constant placeholder, then
# one-hot encode. handle_unknown='ignore' keeps transform() from failing on
# categories that were not present during fit.
cat_steps = [
    ('imputer', SimpleImputer(strategy="constant")),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
cat_pipeline = Pipeline(steps=cat_steps)
housing_cat_tr = cat_pipeline.fit_transform(housing_cat)

In [31]:
housing_cat_tr

In [32]:
# housing_tr.info()

In [33]:

from sklearn.compose import ColumnTransformer

# Column names handled by each branch.
num_attribs = housing_num.columns.tolist()
cat_attribs = housing_cat.columns.tolist()

# Route numeric columns through num_pipeline and categorical ones through cat_pipeline.
column_branches = [
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_branches)

housing_prepared = full_pipeline.fit_transform(housing_tr)

In [34]:
pd.DataFrame(housing_prepared).info()

In [35]:
housing_prepared

# Select and Train a Model

## Linear regression baseline

In [36]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [37]:
# let's try the full preprocessing pipeline on a few training instances
some_data = housing_tr.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

In [38]:
print("Labels:", list(some_labels))

In [39]:
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [40]:
from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(housing_labels, housing_predictions)
lin_mae

## DecisionTreeRegressor

In [41]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [42]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [43]:
len(housing_predictions)

## GradientBoostingRegressor

In [44]:
from sklearn.ensemble import GradientBoostingRegressor

reg = GradientBoostingRegressor(n_estimators=200, learning_rate=0.01, max_depth=5, random_state=42,
        ).fit(housing_prepared, housing_labels)

In [45]:
housing_predictions = reg.predict(housing_prepared)
gradient_mse = mean_squared_error(housing_labels, housing_predictions)
gradient_rmse = np.sqrt(gradient_mse)
gradient_rmse

In [46]:
from xgboost import XGBRegressor
xgb = XGBRegressor(objective = "reg:squarederror", max_depth=8, n_estimators=360, seed=0, booster = "dart", rate_drop = 0.1,
         skip_drop = 0.5).fit(housing_prepared, housing_labels)

In [47]:
housing_predictions = xgb.predict(housing_prepared)
xgb_mse = mean_squared_error(housing_labels, housing_predictions)
xgb_rmse = np.sqrt(xgb_mse)
xgb_rmse


# Fine-Tune Your Model

## Grid search

In [48]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor



In [49]:
param_grid = [
    # try 20 (4×5) combinations of n_estimators × max_features
    {'n_estimators': [30, 100, 150, 200], 'max_features': [4, 5, 6, 8, 9]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds: (20+6)*5=130 rounds of training during the search
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

In [50]:
grid_search.best_params_

In [51]:
grid_search.best_estimator_

Let's look at the score of each hyperparameter combination tested during the grid search:

In [52]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [53]:
pd.DataFrame(grid_search.cv_results_)

## Randomized Search

In [54]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from scipy.stats import uniform



In [55]:
# # reg = GradientBoostingRegressor(n_estimators=200, learning_rate=0.01, max_depth=5, random_state=42,
# param_distribs = {
#         'n_estimators': randint(low=50, high=200),
#         'max_depth': randint(low=1, high=12),
#     }

# gradient_reg = GradientBoostingRegressor(random_state=42)
# # train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
# grid_gradient_search = RandomizedSearchCV(gradient_reg, param_distributions=param_distribs,
#                                 n_iter=200, cv=5, scoring='neg_mean_squared_error', random_state=42)
# grid_gradient_search.fit(housing_prepared, housing_labels)

In [56]:
# cvres = grid_gradient_search.cv_results_
# for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
#     print(np.sqrt(-mean_score), params)

In [57]:
param_distribs = {
        'n_estimators': randint(low=90, high=150),
        'max_features': randint(low=4, high=12),
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=100, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

In [58]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

## Analyze the Best Models and Their Errors

In [59]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [60]:
# extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
#cat_encoder = cat_pipeline.named_steps["cat_encoder"] # old solution
# cat_encoder = full_pipeline.named_transformers_["cat"]
# cat_one_hot_attribs = list(cat_encoder.categories_[0])
# attributes = num_attribs + cat_one_hot_attribs
# sorted(zip(feature_importances, attributes), reverse=True)

In [61]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [62]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

# Cross-Validation

In [63]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [64]:
def display_scores(scores):
    """Print the scores, then their mean and their standard deviation.

    `scores` must provide .mean() and .std() methods (e.g. a NumPy array).
    """
    print("Scores:", scores)
    # Each statistic is computed right before it is printed, one line per statistic.
    for label, stat_name in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, stat_name)())

display_scores(tree_rmse_scores)

In [65]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [66]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

In [67]:
scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()

In [68]:
from sklearn.svm import SVR

svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

# Fine-tune

## Evaluate Your System on the Test Set

# Final model

In [69]:
housing_test = test.copy()
# housing_test = test.dropna(thresh=1458, axis=1)
# housing_test = housing_test.drop("Electrical", axis=1)
housing_test.head()

In [70]:
housing_test.info()

In [71]:
len(housing_test)

In [72]:
final_model = xgb #rnd_search.best_estimator_

X_test = housing_test.copy()
# y_test = test["SalePrice"].copy()

# X_test_prepared = full_pipeline.transform(X_test) #old


In [73]:
# housing_labels

## A full pipeline with both preparation and prediction

In [74]:
# Chain the preprocessing and the selected regressor so that raw feature
# frames can be passed in directly.
prediction_steps = [
    ("preparation", full_pipeline),
    ("final_model", final_model),
]
full_pipeline_with_predictor = Pipeline(steps=prediction_steps)

# Fit both stages on the raw training features and labels ...
full_pipeline_with_predictor.fit(housing_tr, housing_labels)
# ... then predict for the raw test features.
final_predictions = full_pipeline_with_predictor.predict(X_test)

In [75]:
# final_predictions = final_model.predict(X_test_prepared) #old

final_predictions

In [76]:
# final_mse = mean_squared_error(y_test, final_predictions)
# final_rmse = np.sqrt(final_mse)

In [77]:
# predictions = tree_reg.predict(t)

In [78]:
# final_predictions=np.mean(np.column_stack(predictions), axis=1)

## Model persistence using joblib

In [79]:
my_model = full_pipeline_with_predictor

In [80]:
import joblib
joblib.dump(my_model, "my_model.pkl") # serialise the fitted pipeline to the working directory
# Reload it straight away to check that the saved file round-trips.
my_model_loaded = joblib.load("my_model.pkl") # NOTE: only load pickles you created yourself

In [81]:
from scipy.stats import geom, expon
geom_distrib=geom(0.5).rvs(10000, random_state=42)
expon_distrib=expon(scale=1).rvs(10000, random_state=42)
plt.hist(geom_distrib, bins=50)
plt.show()
plt.hist(expon_distrib, bins=50)
plt.show()

## Adding a transformer in the preparation pipeline to select only the most important attributes.

In [82]:
from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    """Return the ascending indices of the k largest values in `arr`.

    Parameters
    ----------
    arr : array-like
        One-dimensional sequence of scores (e.g. feature importances).
    k : int
        Number of entries to select. Values <= 0 select nothing; values
        >= len(arr) select every index.

    Returns
    -------
    numpy.ndarray
        Sorted integer indices of the selected entries.
    """
    values = np.asarray(arr)
    if k <= 0:
        # The slice [-0:] would return *every* index, so short-circuit here.
        return np.array([], dtype=np.intp)
    if k >= values.size:
        # argpartition rejects a kth position outside the array bounds.
        return np.arange(values.size, dtype=np.intp)
    # argpartition places the k largest values in the last k slots (unordered);
    # sort the resulting indices so the output order is deterministic.
    return np.sort(np.argpartition(values, -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns with the k highest importances.

    Parameters
    ----------
    feature_importances : array-like
        Precomputed importance score for each input column.
    k : int
        Number of columns to keep.
    """

    def __init__(self, feature_importances, k):
        self.k = k
        self.feature_importances = feature_importances

    def fit(self, X, y=None):
        """Store the indices of the k most important columns; X and y are not read."""
        selected = indices_of_top_k(self.feature_importances, self.k)
        self.feature_indices_ = selected
        return self

    def transform(self, X):
        """Return X restricted to the columns chosen in fit()."""
        keep = self.feature_indices_
        return X[:, keep]

Note: this feature selector assumes that you have already computed the feature importances somehow (for example, using a `RandomForestRegressor`). You may be tempted to compute them directly in the `TopFeatureSelector`'s `fit()` method; however, this would likely slow down grid/randomized search, since the feature importances would have to be recomputed for every hyperparameter combination (unless you implement some sort of cache).

Let's define the number of top features we want to keep:

In [83]:
k = 5

In [84]:
# feature_importances

Now let's look for the indices of the top k features:

In [85]:
top_k_feature_indices = indices_of_top_k(feature_importances, k)
top_k_feature_indices

In [86]:
# np.array(attributes)[top_k_feature_indices]

Let's double check that these are indeed the top k features:

In [87]:
attributes

In [88]:
# Pair every importance with the name of the prepared column it belongs to.
# `attributes` is the 10-name list built for the scatter matrix, so it does not
# describe the columns of housing_prepared; the names are rebuilt here instead.
# ColumnTransformer outputs the "num" columns first, then one one-hot column
# per category of each categorical attribute, in the order given.
cat_encoder = full_pipeline.named_transformers_["cat"].named_steps["encoder"]
cat_one_hot_attribs = [
    "{}_{}".format(attrib, category)
    for attrib, categories in zip(cat_attribs, cat_encoder.categories_)
    for category in categories
]
prepared_attribs = num_attribs + cat_one_hot_attribs
# Guard against silently truncated output: zip() stops at the shorter input.
assert len(prepared_attribs) == len(feature_importances), "feature names do not line up with importances"
sorted(zip(feature_importances, prepared_attribs), reverse=True)[:k]

# Submission

In [89]:
# Build the submission frame: Ids are read from the raw test file and paired
# with the model's predictions.
t = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
submission_ids = t['Id'].apply(int)
Df = pd.DataFrame({'Id': submission_ids, 'SalePrice': (final_predictions)})

In [90]:
Df.to_csv('submission.csv', index=False)

In [91]:
Df