In [86]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data exploration

## Stratified sampling?

In [87]:
# Load both competition splits the same way, using the Id column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        "../input/house-prices-advanced-regression-techniques/train.csv",
        "../input/house-prices-advanced-regression-techniques/test.csv",
    )
)

In [88]:
# Preview the first rows of the training data to see which columns are present.
train.head()

In [89]:
# Column dtypes and non-null counts for the training data.
train.info()

In [90]:
# Column dtypes and non-null counts for the test data.
test.info()

In [91]:
# Summary statistics for the training data.
train.describe()

In [92]:
# Summary statistics for the test data.
test.describe()

In [93]:
# Where to save the figures: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the directory (and any missing parents); do nothing if it already exists.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure as an image file under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name of the image, without extension.
    tight_layout : bool, default True
        If true, call ``plt.tight_layout()`` before saving.
    fig_extension : str, default "png"
        File extension; also passed to ``savefig`` as the output format.
    resolution : int, default 300
        Dots per inch passed to ``savefig``.
    """
    # Imported locally so the helper does not depend on a later cell having
    # already bound the global name `plt`.
    import matplotlib.pyplot as plt

    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [94]:
%matplotlib inline
import matplotlib.pyplot as plt
# Draw the histograms of the training columns with 50 bins each, save the figure, then show it.
train.hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()

In [95]:
# Sale price per neighborhood; alpha < 1 makes dense clusters of points visible.
# No legend is drawn: the plot has a single, unlabelled series, so calling
# plt.legend() would only emit a "no artists with labels" warning.
train.plot(kind="scatter", figsize=(30,20), x="Neighborhood", y="SalePrice", alpha=0.4)
save_fig("housing_prices_scatterplot")

In [96]:
# Pairwise correlations between the numeric columns.
# The frame also holds string columns; since pandas 2.0 DataFrame.corr() no
# longer drops those silently and raises instead, so select the numeric
# columns explicitly (this also works on older pandas versions).
corr_matrix = train.select_dtypes(include=[np.number]).corr()

In [97]:
# Correlation of every column with SalePrice, strongest positive first.
corr_matrix["SalePrice"].sort_values(ascending=False)

In [98]:
# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas
from pandas.plotting import scatter_matrix

# Columns to plot pairwise against each other, SalePrice included.
# NOTE(review): presumably the columns ranked highest in the correlation listing — confirm.
attributes = ["SalePrice", "OverallQual", "GrLivArea",
              "GarageCars", "GarageArea", "TotalBsmtSF",
              "1stFlrSF", "FullBath","TotRmsAbvGrd", "YearBuilt"]
scatter_matrix(train[attributes], figsize=(20, 10))
save_fig("scatter_matrix_plot")

In [99]:
# Work on a copy so that later changes do not touch `train`.
housing = train.copy()

In [100]:
# Second copy; the following cells strip the label column from this one.
housing_tr = housing.copy()


In [101]:
# housing_tr = housing.dropna(thresh=1459, axis=1)
# housing_tr = housing_tr.drop("Electrical", axis=1)

In [102]:
# Preview the first rows of the working copy.
housing_tr.head()

In [103]:
# Separate the target from the predictors.
# The labels are read from `housing` (which still holds SalePrice) for the rows
# present in housing_tr, and the drop tolerates an already-removed column, so
# re-running this cell gives the same result instead of raising KeyError.
housing_labels = housing.loc[housing_tr.index, "SalePrice"].copy()
housing_tr = housing_tr.drop(columns="SalePrice", errors="ignore") # drop labels for training set


In [104]:
# Keep only the rows that have at least one missing value, then summarise them.
has_missing = housing_tr.isnull().any(axis=1)
sample_incomplete_rows = housing_tr[has_missing]
sample_incomplete_rows.info()

In [105]:
# Numeric feature columns only.
housing_num = housing_tr.select_dtypes(include=[np.number])

In [106]:
# Object-dtype columns only; these are treated as the categorical features.
housing_cat = housing_tr.select_dtypes(include=['object'])

In [107]:
# Preview the numeric features.
housing_num.head()

In [108]:
# Preview the categorical features.
housing_cat.head()

In [109]:
# Names of the categorical columns, as a NumPy array.
housing_cat.columns.values

In [110]:
# Names of the categorical columns, as a plain Python list.
list(housing_cat)

# Transformation Pipelines

In [111]:
from sklearn.impute import SimpleImputer
# Imputer that replaces missing values with the median of each column.
imputer = SimpleImputer(strategy="median")

In [112]:
# Learn the per-column medians from the numeric features.
imputer.fit(housing_num)

In [113]:
# Fill values learned by the imputer, one per numeric column.
imputer.statistics_

In [114]:
# Cross-check: medians computed directly by pandas, to compare with imputer.statistics_.
housing_num.median().values

In [115]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: fill missing values with the column median, then
# standardise each column.
num_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [116]:
from sklearn.preprocessing import OneHotEncoder
# Categorical preprocessing: fill missing values with a constant placeholder,
# then one-hot encode. Categories not seen during fit are ignored at transform
# time instead of raising an error.
cat_steps = [
    ('imputer', SimpleImputer(strategy="constant")),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    # ('std_scaler', StandardScaler()),
]
cat_pipeline = Pipeline(steps=cat_steps)
housing_cat_tr = cat_pipeline.fit_transform(housing_cat)

In [117]:
# Inspect the one-hot encoded categorical features.
housing_cat_tr

In [118]:
# housing_tr.info()

In [119]:

from sklearn.compose import ColumnTransformer

# Route each column to the pipeline matching its type and fit everything on
# the training features in one pass.
num_attribs = housing_num.columns.tolist()
cat_attribs = housing_cat.columns.tolist()

column_steps = [
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
    # ("cat2", OneHotEncoder(handle_unknown='ignore'), cat_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_steps)

housing_prepared = full_pipeline.fit_transform(housing_tr)

In [120]:
# Summarise the prepared feature matrix.
# ColumnTransformer may return a SciPy sparse matrix (see its
# `sparse_threshold` parameter). pd.DataFrame does not expand a sparse matrix
# into one column per feature, so go through the sparse accessor in that case.
from scipy import sparse

if sparse.issparse(housing_prepared):
    housing_prepared_df = pd.DataFrame.sparse.from_spmatrix(housing_prepared)
else:
    housing_prepared_df = pd.DataFrame(housing_prepared)
housing_prepared_df.info()

In [121]:
# Inspect the output of the full preprocessing pipeline.
housing_prepared

In [122]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression fitted on the prepared training features.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [123]:
# let's try the full preprocessing pipeline on a few training instances
# Sanity check: push the first five training rows through the fitted
# preprocessing pipeline and look at the linear model's predictions for them.
some_data = housing_tr.head(5)
some_labels = housing_labels.head(5)
some_data_prepared = full_pipeline.transform(some_data)

sample_predictions = lin_reg.predict(some_data_prepared)
print("Predictions:", sample_predictions)

In [124]:
# True sale prices of the same rows, for comparison with the predictions.
print("Labels:", list(some_labels))

In [125]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model, measured on the same data it was fitted on
# (a training error, not a held-out estimate).
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [126]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the linear model's training-set predictions.
lin_mae = mean_absolute_error(housing_labels, housing_predictions)
lin_mae

In [127]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree fitted on the prepared training features; the fixed seed makes the fit repeatable.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [128]:
# RMSE of the tree on the same data it was fitted on (training error).
# Note: this rebinds housing_predictions, replacing the linear model's predictions.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [129]:
# Number of predictions produced (one per row of housing_prepared).
len(housing_predictions)

In [130]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Randomised hyperparameter search for a random forest: 10 sampled settings,
# each scored with 5-fold cross-validation on negative MSE.
forest_reg = RandomForestRegressor(random_state=42)

param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

rnd_search = RandomizedSearchCV(
    estimator=forest_reg,
    param_distributions=param_distribs,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rnd_search.fit(housing_prepared, housing_labels)

In [131]:
# Print the cross-validated RMSE next to each sampled parameter setting.
# Scores are negative MSE, hence the sign flip before the square root.
cvres = rnd_search.cv_results_
for idx, params in enumerate(cvres["params"]):
    print(np.sqrt(-cvres["mean_test_score"][idx]), params)

In [132]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over two small random-forest grids.
# 12 (3x4) combinations with the default bootstrap setting...
default_bootstrap_grid = {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}
# ...plus 6 (2x3) combinations with bootstrap switched off.
no_bootstrap_grid = {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
param_grid = [default_bootstrap_grid, no_bootstrap_grid]

forest_reg = RandomForestRegressor(random_state=42)
# With 5 folds that is (12+6)*5=90 rounds of training.
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(housing_prepared, housing_labels)

In [133]:
# Feature importances of the best forest found by the grid search
# (one value per column of housing_prepared).
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [134]:
# Re-check the test data's columns and non-null counts before preparing it.
test.info()

In [135]:
# Work on a copy of the test data so `test` itself stays unmodified.
housing_test = test.copy()
# housing_test = test.dropna(thresh=1458, axis=1)
# housing_test = housing_test.drop("Electrical", axis=1)
housing_test.head()

In [136]:
# Column dtypes and non-null counts of the test copy.
housing_test.info()

In [137]:
# Number of test rows, i.e. how many predictions are needed.
len(housing_test)

In [138]:
# Use the best estimator from the grid search as the final model.
final_model = grid_search.best_estimator_

X_test = housing_test.copy()
# NOTE(review): presumably test.csv carries no SalePrice column, hence no y_test — confirm.
# y_test = test["SalePrice"].copy()

# transform (not fit_transform): reuse the preprocessing already fitted on the training data.
X_test_prepared = full_pipeline.transform(X_test)


In [139]:
# Predicted sale prices for the prepared test rows.
final_predictions = final_model.predict(X_test_prepared)

final_predictions

In [140]:
# final_mse = mean_squared_error(y_test, final_predictions)
# final_rmse = np.sqrt(final_mse)

In [141]:
# final_rmse

In [142]:
# t=pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [143]:
# predictions = tree_reg.predict(t)

In [144]:


# final_predictions=np.mean(np.column_stack(predictions), axis=1)



In [145]:
# Build the submission frame: one row per test house with its Id and predicted
# SalePrice. The Ids come from the index of X_test (the test data was loaded
# with index_col='Id'), i.e. from the same frame the predictions were computed
# from, so there is no need to read test.csv a second time.
Df = pd.DataFrame({'Id': X_test.index, 'SalePrice': final_predictions})

In [146]:
# Write the submission file; index=False keeps the row index out of the CSV.
Df.to_csv('submission.csv', index=False)

In [147]:
# Display the submission frame as a final visual check.
Df