# Setup

## Import data

Import pandas, then load the test and train data into their own DataFrames.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the competition CSVs; train and test each get their own DataFrame.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

# Quick sanity check: (rows, columns) of each frame.
print("Dimensions of train: {}".format(train.shape))
print("Dimensions of test: {}".format(test.shape))

In [None]:
test.head()

## Explore the data

In [None]:
# Use a wide default canvas for the plots drawn in this notebook.
plt.rcParams['figure.figsize'] = (12.0, 6.0)

# Raw sale price side by side with its log1p transform.
prices = pd.DataFrame(
    {
        "price": train["SalePrice"],
        "log(price + 1)": np.log1p(train["SalePrice"]),
    }
)

# Histogram of the raw price only; the last expression previews its first rows.
prices['price'].hist()
prices['price'].head()

In [None]:
list(train)

### Select some features that intuitively seem like they will have predictive power for house sale prices

SalePrice - the property's sale price in dollars. This is the target variable that you're trying to predict.  
Feature variables:

* MSSubClass: The building class
* Neighborhood: Physical locations within Ames city limits
* OverallQual: Overall material and finish quality
* OverallCond: Overall condition rating
* 1stFlrSF: First Floor square feet
* 2ndFlrSF: Second floor square feet
* KitchenQual: Kitchen quality
* SaleType: Type of sale
* SaleCondition: Condition of sale
* GrLivArea: Above grade (ground) living area square feet

In [None]:
# Target column followed by the hand-picked candidate features listed above.
columns = [
    'SalePrice',
    'MSSubClass',
    'Neighborhood',
    'OverallQual',
    'OverallCond',
    '1stFlrSF',
    '2ndFlrSF',
    'KitchenQual',
    'SaleType',
    'SaleCondition',
    'GrLivArea',
]

# Preview the first ten rows restricted to those columns.
train[columns].head(10)

In [None]:
import matplotlib.pyplot as plt

# Mean sale price per building class (MSSubClass), drawn as a bar chart.
# The aggregation is named by string ("mean") rather than passed as np.mean:
# recent pandas releases warn about NumPy callables in aggfunc, and the string
# form produces the same table.
class_pivot = pd.pivot_table(
    train[columns],
    index="MSSubClass",
    values="SalePrice",
    aggfunc=["mean"],
)
class_pivot.plot.bar()
plt.show()

In [None]:
# Mean sale price for each distinct first-floor area.
# "mean" is passed by name because recent pandas releases warn about NumPy
# callables in aggfunc; the resulting table is the same.
class_pivot = pd.pivot_table(
    train[columns],
    index="1stFlrSF",
    values="SalePrice",
    aggfunc=["mean"],
)
# 1stFlrSF is a square-footage measurement, so a bar chart would draw one bar
# (and one tick label) per distinct area. Plot the means as markers against the
# numeric index instead so the trend is readable.
class_pivot.plot(linestyle="none", marker=".")
plt.show()

In [None]:
# Mean sale price per overall quality rating, drawn as a bar chart.
# "mean" is passed by name because recent pandas releases warn about NumPy
# callables in aggfunc; the resulting table is the same.
class_pivot = pd.pivot_table(
    train[columns],
    index=["OverallQual"],
    values="SalePrice",
    aggfunc=["mean"],
)
class_pivot.plot.bar()
plt.show()

## Identify Categorical and Numeric features  
The following features are categorical  
* MSSubClass: The building class
* Neighborhood: Physical locations within Ames city limits
* KitchenQual: Kitchen quality
* SaleType: Type of sale
* SaleCondition: Condition of sale  

The following features are numeric  
* OverallQual: Overall material and finish quality
* OverallCond: Overall condition rating
* 1stFlrSF: First Floor square feet
* 2ndFlrSF: Second floor square feet

### Convert the categorical variables into indicator (dummy) columns with the `get_dummies` method



In [None]:
def create_dummies(df, column_name):
    """Return ``df`` with indicator columns for ``column_name`` appended.

    One new column is added per distinct value found in ``df[column_name]``,
    each named ``<column_name>_<value>``. The original column is kept and the
    input frame is not modified; a new DataFrame is returned.
    """
    indicator_cols = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat([df, indicator_cols], axis=1)

# Expand every categorical feature into indicator columns, on train and test
# alike, one feature at a time.
for categorical in ("MSSubClass", "Neighborhood", "KitchenQual", "SaleType", "SaleCondition"):
    train = create_dummies(train, categorical)
    test = create_dummies(test, categorical)

# Show the resulting column names.
list(train)

## Prepare data for modelling

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer
import seaborn as sns

In [None]:
# Model inputs: four numeric features plus the KitchenQual indicator columns.
columns = [
    'OverallQual',
    'OverallCond',
    '1stFlrSF',
    '2ndFlrSF',
    'KitchenQual_Ex',
    'KitchenQual_Fa',
    'KitchenQual_Gd',
    'KitchenQual_TA',
]

# The test set restricted to the model inputs; from now on this frame is
# referred to as the holdout data.
holdout = test[columns]
holdout.head()

from sklearn.model_selection import train_test_split

# Feature matrix and target taken from the labelled training data.
all_X = train[columns]
all_y = train['SalePrice']

# Set 20% of the labelled rows aside for validation; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    all_X,
    all_y,
    test_size=0.20,
    random_state=0,
)

In [None]:
all_X.head()

In [None]:
all_y.head()

In [None]:
X_train.head(10)

In [None]:
X_test.head(10)

In [None]:
y_train.head(10)

In [None]:
y_test.head(10)

### Linear model without Regularization

In [None]:
n_folds = 5
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
scorer = make_scorer(mean_squared_error,greater_is_better = False)


def _rmse_cv(model, X, y):
    """Return the per-fold RMSE of ``model`` cross-validated on ``X``/``y``.

    The KFold splitter itself is handed to ``cross_val_score``. Previously
    ``KFold(...).get_n_splits(...)`` was passed, which is just the integer
    number of folds, so the ``shuffle=True`` / ``random_state=42`` settings
    were silently dropped.
    """
    kf = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf)
    # The scorer reports negated MSE, so flip the sign before the square root.
    return np.sqrt(-neg_mse)


def rmse_CV_train(model):
    """Return an array of ``n_folds`` RMSE values from cross-validating on the training split."""
    return _rmse_cv(model, X_train, y_train)


def rmse_CV_test(model):
    """Return an array of ``n_folds`` RMSE values from cross-validating on the validation split.

    Note that this cross-validates *within* ``X_test``/``y_test``; it does not
    score predictions of a model fitted on the training split.
    """
    return _rmse_cv(model, X_test, y_test)

In [None]:
# Ordinary least squares fitted on the training split.
lr = LinearRegression().fit(X_train, y_train)

# Predictions on both splits, kept for the diagnostic plots.
train_pre = lr.predict(X_train)
test_pre = lr.predict(X_test)

# Mean cross-validated RMSE on each split.
print('rmse on train', rmse_CV_train(lr).mean())
print('rmse on test', rmse_CV_test(lr).mean())

## Visualising our model's outputs

In [None]:
# Residuals (predicted - actual) plotted against the predicted values for the
# training and validation splits.
plt.scatter(train_pre, train_pre - y_train, c = "blue",  label = "Training data")
plt.scatter(test_pre,test_pre - y_test, c = "black",  label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
# Zero-residual reference line. axhline spans the full width of the axes; the
# earlier fixed extent of 10.5-13.5 only suits log-scale prices and is
# effectively invisible here because SalePrice is modelled in raw dollars.
plt.axhline(y = 0, color = "red")
plt.show()

In [None]:
# Predicted values against real values for the training and validation splits.
plt.scatter(train_pre, y_train, c = "blue",  label = "Training data")
plt.scatter(test_pre, y_test, c = "black",  label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
# Identity line (perfect prediction). Its extent is taken from the observed
# prices; the earlier fixed 10.5-13.5 range only suits log-scale prices and is
# effectively invisible when SalePrice is in raw dollars.
line_lo = min(y_train.min(), y_test.min())
line_hi = max(y_train.max(), y_test.max())
plt.plot([line_lo, line_hi], [line_lo, line_hi], c = "red")
plt.show()

In [None]:
# Model inputs: four numeric features plus the KitchenQual indicator columns.
columns = [
    'OverallQual',
    'OverallCond',
    '1stFlrSF',
    '2ndFlrSF',
    'KitchenQual_Ex',
    'KitchenQual_Fa',
    'KitchenQual_Gd',
    'KitchenQual_TA',
]

# The test set restricted to the model inputs; from now on this frame is
# referred to as the holdout data.
holdout = test[columns]

# Full labelled data: feature matrix and target.
all_X = train[columns]
all_y = train['SalePrice']

In [None]:
# Show the container type of the features, the target and the holdout frame.
for obj in (all_X, all_y, holdout):
    print(type(obj))

In [None]:
all_y.head()

In [None]:
holdout.head()

# Applying Linear Regression to holdout

In [None]:
# Refit on every labelled row, then predict the holdout set.
lr = LinearRegression().fit(all_X, all_y)
holdout_predictions = pd.DataFrame({'SalePrice': lr.predict(holdout)})

# Attach the house identifiers from the test data.
holdout_predictions['Id'] = test['Id']

In [None]:
holdout_predictions[['Id', 'SalePrice']].head()

## Checking for SalePrice values less than 0

In [None]:
holdout_predictions[['Id', 'SalePrice']][holdout_predictions['SalePrice'] < 0]

## Set any SalePrice values less than 0 to a default value of 500

In [None]:
# Replace every negative predicted price with the default value of 500.
holdout_predictions['SalePrice'] = holdout_predictions['SalePrice'].mask(
    holdout_predictions['SalePrice'] < 0, 500
)

# Re-run the negative-price filter; it should now select no rows.
holdout_predictions.loc[holdout_predictions['SalePrice'] < 0, ['Id', 'SalePrice']]

In [None]:
# Write the submission file: Id and SalePrice columns only, no index column.
holdout_predictions.to_csv(
    "houseprices_submission.csv",
    columns=['Id', 'SalePrice'],
    index=False,
)