In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from warnings import filterwarnings
# Install a blanket 'ignore' filter: no category or message pattern is given,
# so every warning raised from this point on is suppressed.
# NOTE(review): this also hides library deprecation warnings (e.g. from pandas
# or scikit-learn) — consider narrowing the filter to specific categories.
filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from statistics import mode

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression

In [3]:
# Load the train and test CSV files, using the 'Id' column as the row index.
train_data, test_data = (
    pd.read_csv(csv_name, index_col='Id') for csv_name in ('train.csv', 'test.csv')
)

In [4]:
"""
# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Utility functions from Tutorial
def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # All discrete features should now have integer dtypes
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores


def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

"""

'\n# Set Matplotlib defaults\nplt.style.use("seaborn-whitegrid")\nplt.rc("figure", autolayout=True)\nplt.rc(\n    "axes",\n    labelweight="bold",\n    labelsize="large",\n    titleweight="bold",\n    titlesize=14,\n    titlepad=10,\n)\n\n# Utility functions from Tutorial\ndef make_mi_scores(X, y):\n    X = X.copy()\n    for colname in X.select_dtypes(["object", "category"]):\n        X[colname], _ = X[colname].factorize()\n    # All discrete features should now have integer dtypes\n    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]\n    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)\n    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)\n    mi_scores = mi_scores.sort_values(ascending=False)\n    return mi_scores\n\n\ndef plot_mi_scores(scores):\n    scores = scores.sort_values(ascending=True)\n    width = np.arange(len(scores))\n    ticks = list(scores.index)\n    plt.barh(width, scores)\n    p

In [5]:
# Number of missing values in each column of training data
#missing_val_count_by_column = (train_data.isnull().sum())
#print(missing_val_count_by_column[missing_val_count_by_column > 0])

In [6]:
#missing_val_count_by_column = (test_data.isnull().sum())
#print(missing_val_count_by_column[missing_val_count_by_column > 0])

In [7]:
# Columns removed from both frames.
drop_columns = ['Alley', 'Utilities']

# Columns whose names start with a digit are given alphabetic names.
COLUMN_RENAMES = {'1stFlrSF': 'FirstFlrSF', '2ndFlrSF': 'SecFlrSF', '3SsnPorch': 'ThreeSsnPorch'}

# Missing values replaced by a fixed category label.
CONSTANT_FILLS = {'SaleType': "Oth", 'Functional': "Typ"}

# Missing values replaced by a statistic of the same column in the same frame.
MODE_FILL_COLUMNS = ['MSZoning', 'Electrical', 'KitchenQual']
MEAN_FILL_COLUMNS = ['LotFrontage']

# Categorical columns whose missing values become the literal string "None".
NONE_FILL_COLUMNS = ['MasVnrType', 'MiscFeature']

# Categorical columns whose missing values become the literal string "NA".
NA_FILL_COLUMNS = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                   'BsmtFinType2', 'GarageType', 'GarageFinish',
                   'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'FireplaceQu']

# Numeric columns whose missing values become 0.
ZERO_FILL_COLUMNS = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                     'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars',
                     'GarageArea', 'TotalBsmtSF']


def clean_housing_frame(frame):
    """Return a cleaned copy of ``frame``; the frame passed in is left untouched.

    Steps, in order:
      1. Rename the columns listed in ``COLUMN_RENAMES``.
      2. Drop the columns in ``drop_columns`` (absent columns are skipped, so
         the cell can be re-run on an already-cleaned frame).
      3. Recode the ``MSZoning`` value "C (all)" to "C".
      4. Fill missing values column by column: fixed labels, the column's own
         mode or mean (computed on this frame, after step 3), "None", "NA" or 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw or previously cleaned housing data. Must contain ``MSZoning`` and
        the columns in ``MODE_FILL_COLUMNS`` and ``MEAN_FILL_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the steps above applied.

    Raises
    ------
    KeyError
        If a column needed for the recode or for a mode/mean statistic is missing.
    IndexError or KeyError
        If a mode-filled column has no non-missing value (its mode is empty).
    """
    cleaned = frame.rename(columns=COLUMN_RENAMES).drop(columns=drop_columns, errors='ignore')
    cleaned.loc[cleaned['MSZoning'] == "C (all)", 'MSZoning'] = "C"

    # Build one column -> replacement map and fill on the frame itself.
    # Calling Series.fillna(inplace=True) on `frame[col]` is a chained
    # assignment: it is deprecated in pandas 2.x and does not update the frame
    # when Copy-on-Write is enabled.
    fill_values = dict(CONSTANT_FILLS)
    for column in MODE_FILL_COLUMNS:
        fill_values[column] = cleaned[column].mode()[0]
    for column in MEAN_FILL_COLUMNS:
        fill_values[column] = cleaned[column].mean()
    fill_values.update(dict.fromkeys(NONE_FILL_COLUMNS, "None"))
    fill_values.update(dict.fromkeys(NA_FILL_COLUMNS, "NA"))
    fill_values.update(dict.fromkeys(ZERO_FILL_COLUMNS, 0))

    return cleaned.fillna(value=fill_values)


# Each frame is cleaned independently, so mode/mean imputation uses that
# frame's own statistics.
train_data = clean_housing_frame(train_data)
test_data = clean_housing_frame(test_data)

In [8]:
#features = ["YearBuilt", "MoSold", "ScreenPorch"]
#sns.relplot(
#    x="value", y="SalePrice", col="variable", data=df.melt(id_vars="SalePrice", value_vars=features), facet_kws=dict(sharex=False),
#);

#sns.catplot(x="BldgType", y="SalePrice", data=df, kind="boxen");

# YOUR CODE HERE: 
#feature = "GrLivArea"

#sns.lmplot(
#    x=feature, y="SalePrice", hue="BldgType", col="BldgType",
#    data=df, scatter_kws={"edgecolor": 'w'}, col_wrap=3, height=5,
#);

In [9]:
#X = train_data.copy()
#y = X.pop('SalePrice')

#mi_scores = make_mi_scores(X, y)

In [10]:
#print(mi_scores.head(20))
#print(mi_scores.tail(20))  # uncomment to see bottom 20

#plt.figure(dpi=100, figsize=(8, 5))
#plot_mi_scores(mi_scores.head(20))
#plt.figure(dpi=100, figsize=(8, 5))
#plot_mi_scores(mi_scores.tail(20))  # uncomment to see bottom 20

In [11]:

#train_data.loc[train_data['MSZoning'] == "C"]
#test_data.loc[test_data['SaleType'].isnull()]
#test_data['MSZoning'].fillna(test_data['MSZoning'].mode()[0], inplace=True)
#train_data.loc[train_data.index == 2251]
#test_data.loc[test_data.index == 1848]

In [12]:
#train_data.info()
#test_data.info()
#train_missing_val_count_by_column = (train_data.isnull().sum())
#test_missing_val_count_by_column = (test_data.isnull().sum())
#print(train_missing_val_count_by_column[train_missing_val_count_by_column > 0])
#print(test_missing_val_count_by_column[test_missing_val_count_by_column > 0])
#print(train_data.shape)
#print(test_data.shape)
#train_data.describe()
#test_data.info()
# Number of missing values in each column of training data

##missing_val_count_by_column = (train_data.isnull().sum())
##print(missing_val_count_by_column[missing_val_count_by_column > 0])

In [13]:
#print(train_data.shape)
#print(test_data.shape)
#train_data.info()

#missing_val_count_by_column = (test_data.isnull().sum())
#print(missing_val_count_by_column[missing_val_count_by_column > 0])

In [14]:
# Separate the target from the predictors: `y` receives the SalePrice column,
# which is removed from train_data in place.
y = train_data.pop('SalePrice')

# "Cardinality" means the number of unique values in a column.
# Categorical features: object-typed columns with fewer than 10 unique values
# (a convenient but arbitrary cut-off).
column_dtypes = train_data.dtypes
categorical_cols = [
    cname for cname, col_dtype in column_dtypes.items()
    if col_dtype == "object" and train_data[cname].nunique() < 10
]

# Numerical features: every int64 or float64 column.
numerical_cols = [
    cname for cname, col_dtype in column_dtypes.items()
    if col_dtype in ('int64', 'float64')
]

# Keep only the selected columns, categorical first, in both frames.
my_cols = categorical_cols + numerical_cols
X_train = train_data.loc[:, my_cols].copy()
X_test = test_data.loc[:, my_cols].copy()

In [15]:
# Shape of the test features (num_rows, num_columns)
print(X_test.shape)

# Number of missing values in each column of the test data (only columns that have any)
missing_val_count_by_column = X_test.isna().sum()
print(missing_val_count_by_column.loc[missing_val_count_by_column > 0])

# Summary statistics of the test features
X_test.describe()

(1459, 74)
Series([], dtype: int64)


Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,ThreeSsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,...,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,57.378341,68.580357,9819.161069,6.078821,5.553804,1971.357779,1983.662783,99.673749,438.902673,52.583276,...,472.444825,93.174777,48.313914,24.243317,1.79438,17.064428,1.744345,58.167923,6.104181,2007.769705
std,42.74688,20.561228,4955.517327,1.436812,1.11374,30.390071,21.130467,177.001792,455.257119,176.698671,...,217.326902,127.744882,68.883364,67.227765,20.207842,56.609763,30.491646,630.806978,2.722432,1.30174
min,20.0,21.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,60.0,7391.0,5.0,5.0,1953.0,1963.0,0.0,0.0,0.0,...,317.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,50.0,68.580357,9399.0,6.0,5.0,1973.0,1992.0,0.0,350.0,0.0,...,480.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,78.0,11517.5,7.0,6.0,2001.0,2004.0,162.0,752.0,0.0,...,576.0,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,200.0,56600.0,10.0,9.0,2010.0,2010.0,1290.0,4010.0,1526.0,...,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0


In [16]:
# Shape of the training features (num_rows, num_columns)
print(X_train.shape)

# Number of missing values in each column of the training data (only columns that have any)
missing_val_count_by_column = X_train.isna().sum()
print(missing_val_count_by_column.loc[missing_val_count_by_column > 0])

(1460, 74)
Series([], dtype: int64)


In [17]:
X_test.head(10)

Unnamed: 0_level_0,MSZoning,Street,LotShape,LandContour,LotConfig,LandSlope,Condition1,Condition2,BldgType,HouseStyle,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,ThreeSsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,RH,Pave,Reg,Lvl,Inside,Gtl,Feedr,Norm,1Fam,1Story,...,730.0,140,0,0,0,120,0,0,6,2010
1462,RL,Pave,IR1,Lvl,Corner,Gtl,Norm,Norm,1Fam,1Story,...,312.0,393,36,0,0,0,0,12500,6,2010
1463,RL,Pave,IR1,Lvl,Inside,Gtl,Norm,Norm,1Fam,2Story,...,482.0,212,34,0,0,0,0,0,3,2010
1464,RL,Pave,IR1,Lvl,Inside,Gtl,Norm,Norm,1Fam,2Story,...,470.0,360,36,0,0,0,0,0,6,2010
1465,RL,Pave,IR1,HLS,Inside,Gtl,Norm,Norm,TwnhsE,1Story,...,506.0,0,82,0,0,144,0,0,1,2010
1466,RL,Pave,IR1,Lvl,Corner,Gtl,Norm,Norm,1Fam,2Story,...,440.0,157,84,0,0,0,0,0,4,2010
1467,RL,Pave,IR1,Lvl,Inside,Gtl,Norm,Norm,1Fam,1Story,...,420.0,483,21,0,0,0,0,500,3,2010
1468,RL,Pave,IR1,Lvl,Inside,Gtl,Norm,Norm,1Fam,2Story,...,393.0,0,75,0,0,0,0,0,5,2010
1469,RL,Pave,Reg,Lvl,Inside,Gtl,Norm,Norm,1Fam,1Story,...,506.0,192,0,0,0,0,0,0,2,2010
1470,RL,Pave,Reg,Lvl,Corner,Gtl,Norm,Norm,1Fam,1Story,...,525.0,240,0,0,0,0,0,0,4,2010


In [18]:
"""

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define model
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Preprocessing of training data, fit model 
clf.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = clf.predict(X_valid)

print('MAE:', mean_absolute_error(y_valid, preds))

"""

"\n\n# Preprocessing for numerical data\nnumerical_transformer = SimpleImputer(strategy='constant')\n\n# Preprocessing for categorical data\ncategorical_transformer = Pipeline(steps=[\n    ('imputer', SimpleImputer(strategy='most_frequent')),\n    ('onehot', OneHotEncoder(handle_unknown='ignore'))\n])\n\n# Bundle preprocessing for numerical and categorical data\npreprocessor = ColumnTransformer(\n    transformers=[\n        ('num', numerical_transformer, numerical_cols),\n        ('cat', categorical_transformer, categorical_cols)\n    ])\n\n# Define model\nmodel = RandomForestRegressor(n_estimators=100, random_state=0)\n\n# Bundle preprocessing and modeling code in a pipeline\nclf = Pipeline(steps=[('preprocessor', preprocessor),\n                      ('model', model)\n                     ])\n\n# Preprocessing of training data, fit model \nclf.fit(X_train, y_train)\n\n# Preprocessing of validation data, get predictions\npreds = clf.predict(X_valid)\n\nprint('MAE:', mean_absolute_erro

In [19]:
# Numerical columns: replace missing entries with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Categorical columns: replace missing entries with the most frequent value,
# then one-hot encode with handle_unknown='ignore'.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [20]:
# make sure to comment this out, before submitting 
"""
import optuna

def objective(trial):
       
    xgb_params_2 = dict(
        random_state=trial.suggest_int("random_state", 0, 2), 
        num_parallel_tree=trial.suggest_int("num_parallel_tree", 1, 3), 
        max_depth=trial.suggest_int("max_depth", 2, 10),
        learning_rate=trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 1000, 8000),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.2, 1.0),
        subsample=trial.suggest_float("subsample", 0.2, 1.0),
        reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
        reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
        
    )
    xgb_2 = XGBRegressor(**xgb_params_2) #XGBRegressor
    
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', xgb_2)
                             ])
    
    #my_pipeline.fit(X_train, y)
    #preds = my_pipeline.predict(X_test)
    #score = mean_absolute_error(y, preds) 
    #print('MAE:', score)
    #return score
   # return score_dataset(X_train, y_train, xgb)
    
    scores = -1 * cross_val_score(my_pipeline, X_train, y,
                              cv=5,
                              scoring='neg_mean_absolute_error')
    
    return scores.mean()


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)
#xgb_params_2 = study.best_params
"""

'\nimport optuna\n\ndef objective(trial):\n       \n    xgb_params_2 = dict(\n        random_state=trial.suggest_int("random_state", 0, 2), \n        num_parallel_tree=trial.suggest_int("num_parallel_tree", 1, 3), \n        max_depth=trial.suggest_int("max_depth", 2, 10),\n        learning_rate=trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),\n        n_estimators=trial.suggest_int("n_estimators", 1000, 8000),\n        min_child_weight=trial.suggest_int("min_child_weight", 1, 10),\n        colsample_bytree=trial.suggest_float("colsample_bytree", 0.2, 1.0),\n        subsample=trial.suggest_float("subsample", 0.2, 1.0),\n        reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),\n        reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),\n        \n    )\n    xgb_2 = XGBRegressor(**xgb_params_2) #XGBRegressor\n    \n    my_pipeline = Pipeline(steps=[(\'preprocessor\', preprocessor),\n                              (\'model\', xgb_2)\n              

In [21]:
#trial = study.best_trial
#print('Accuracy: {}'.format(trial.value))
#print("Best hyperparameters: {}".format(trial.params))

In [22]:
# Best tuning result recorded so far: trial 9 finished with value
# 14907.946436215754; its parameters are the ones used below.
# Another recorded result (presumably from a different run — its parameter set
# differs) printed "Accuracy: 13972.736930115581" with:
# {'max_depth': 3, 'learning_rate': 0.047637327146321726, 'n_estimators': 2708,
#  'min_child_weight': 3, 'colsample_bytree': 0.2090033901858265,
#  'subsample': 0.8332562804302324, 'reg_alpha': 1.30364086901903,
#  'reg_lambda': 0.9728728504211933}
xgb_params = {
    'random_state': 0,
    # maximum depth of each tree (suggested range: 2 to 10)
    'max_depth': 5,
    # effect of each tree (suggested range: 0.0001 to 0.1)
    'learning_rate': 0.0035938340285897575,
    # number of trees, i.e. boosting rounds (suggested range: 1000 to 8000)
    'n_estimators': 7278,
    # minimum number of houses in a leaf (suggested range: 1 to 10)
    'min_child_weight': 1,
    # fraction of features (columns) per tree (suggested range: 0.2 to 1.0)
    'colsample_bytree': 0.3033265735709497,
    # fraction of instances (rows) per tree (suggested range: 0.2 to 1.0)
    'subsample': 0.348801301690591,
    # L1 regularization, like LASSO (suggested range: 0.0 to 10.0)
    'reg_alpha': 0.0006595500664829579,
    # L2 regularization, like Ridge (suggested range: 0.0 to 10.0)
    'reg_lambda': 6.451236513351137,
    # set > 1 for boosted random forests
    'num_parallel_tree': 2,
}

my_model = XGBRegressor(**xgb_params)

In [23]:
# Chain the preprocessing step and the model into a single pipeline.
pipeline_steps = [('preprocessor', preprocessor), ('model', my_model)]
my_pipeline = Pipeline(steps=pipeline_steps)

In [24]:
# The 'neg_mean_absolute_error' scorer yields *negative* MAE per fold,
# so flip the sign to report positive MAE values.
neg_mae_per_fold = cross_val_score(
    my_pipeline, X_train, y, cv=5, scoring='neg_mean_absolute_error'
)
scores = -1 * neg_mae_per_fold

print("MAE scores:\n", scores)


MAE scores:
 [13928.22167969 16245.17309236 15262.0822988  13162.25743793
 15988.33901434]


In [25]:
print("Average MAE score:", scores.mean())

#15374 last submit
#14930 best from test cv=15


#15392 clean all with drop alley and utilities
#15379 clean numeric only

#15297 clean numeric with drop alley and utilities

Average MAE score: 14917.21470462329


In [26]:
# Fit the pipeline (preprocessing + model) on the full training data, then
# run the test data through the fitted pipeline to get predictions.
preds_test = my_pipeline.fit(X_train, y).predict(X_test)

# Preprocessing of training data, fit model 
#my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
#preds = my_pipeline.predict(X_valid)

# Evaluate the model
#score = mean_absolute_error(y, preds_test)
#print('MAE:', score)

In [27]:
# Write the test predictions to submission.csv: one row per test Id,
# without the DataFrame's own index.
submission_columns = {'Id': X_test.index, 'SalePrice': preds_test}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

In [28]:
#GridSearchCV 
#ElasticNet()
#KFold
"""
kf = KFold(n_splits=12, shuffle=True, random_state=42)
# Using Cross_validation_Score to get Root mean square error

def cv_rmse(model):
    
    return np.sqrt(-cross_val_score(model, train, price, scoring="neg_mean_squared_error", cv=kf)).mean()
    
# Evaluating model with default parameters and check the rmse value
elastic_net = ElasticNet()
print("RMSE with default parameters :",cv_rmse(elastic_net))
RMSE with default parameters : 0.17519749831464346
# Lets tune Parameters

alpha = [0.001,0.005,0.01,0.05,0.1,0.5,1,5,10,50,100]
l1_ratio = [0.001,0.005,0.01,0.05,0.1,0.5,1,5,10,50,100]

params = {"alpha":alpha,"l1_ratio":l1_ratio}
# Apply GridSearchCV to get optimal Parameters

grid_cv = GridSearchCV(estimator=elastic_net,
                       param_grid=params,
                       cv=kf,
                       refit=True,
                       verbose=1)
                       
# Fitting the gridsearchcv model and get the best parameters

grid_cv.fit(train,price)
Fitting 12 folds for each of 121 candidates, totalling 1452 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1452 out of 1452 | elapsed:  6.2min finished
GridSearchCV(cv=KFold(n_splits=12, random_state=42, shuffle=True),
             estimator=ElasticNet(),
             param_grid={'alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10,
                                   50, 100],
                         'l1_ratio': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5,
                                      10, 50, 100]},
             verbose=1)
grid_cv.best_score_, grid_cv.best_params_
(0.9253241732844129, {'alpha': 0.001, 'l1_ratio': 0.5})
# Redefining model with above parameter values

elastic_net = ElasticNet(alpha=0.001, l1_ratio=0.5).fit(train,price)
cv_rmse(elastic_net)
0.10783460477814694
"""

'\nkf = KFold(n_splits=12, shuffle=True, random_state=42)\n# Using Cross_validation_Score to get Root mean square error\n\ndef cv_rmse(model):\n    \n    return np.sqrt(-cross_val_score(model, train, price, scoring="neg_mean_squared_error", cv=kf)).mean()\n    \n# Evaluating model with default parameters and check the rmse value\nelastic_net = ElasticNet()\nprint("RMSE with default parameters :",cv_rmse(elastic_net))\nRMSE with default parameters : 0.17519749831464346\n# Lets tune Parameters\n\nalpha = [0.001,0.005,0.01,0.05,0.1,0.5,1,5,10,50,100]\nl1_ratio = [0.001,0.005,0.01,0.05,0.1,0.5,1,5,10,50,100]\n\nparams = {"alpha":alpha,"l1_ratio":l1_ratio}\n# Apply GridSearchCV to get optimal Parameters\n\ngrid_cv = GridSearchCV(estimator=elastic_net,\n                       param_grid=params,\n                       cv=kf,\n                       refit=True,\n                       verbose=1)\n                       \n# Fitting the gridsearchcv model and get the best parameters\n\ngrid_cv.