In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
style.use('ggplot')
%matplotlib inline

from subprocess import check_output

# Show which files are available in the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

In [None]:
# Read the training and test sets from the input directory.
train, test = (
    pd.read_csv('../input/train.csv'),
    pd.read_csv('../input/test.csv'),
)

In [None]:
# Preview the first five rows of the training set.
train.head(n=5)

In [None]:
# Print the column summary of `train` (info() prints and returns None), then
# display the (rows, columns) shape of both the training and the test set.
train.info(), train.shape, test.shape

## NANs

In [None]:
# NANs
# Build a per-column summary of missing values in the training set:
#   Missing - number of null entries in the column
#   Percent - share of rows that are null, as a fraction in [0, 1]
null_mask = train.isnull()
total = null_mask.sum().sort_values(ascending=False)
# count() on the boolean mask gives the number of rows per column, so the
# ratio is the fraction of missing entries (previously this repeated `total`).
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
# concat aligns on the column-name index, so the two sort orders need not match.
missing = pd.concat([total, percent], axis=1, keys=['Missing', 'Percent'])
missing.head(20)

When more than 15% of a variable's values are missing, we delete that variable and pretend it never existed.
We also drop every other column that has missing data, except *Electrical*, where deleting the observation with the missing value will suffice.

In [None]:
# Drop every column of the training set that has more than one missing value.
# `columns=` is used instead of a positional axis argument, which recent
# pandas versions no longer accept.
cols_with_gaps = missing[missing['Missing'] > 1].index
train = train.drop(columns=cols_with_gaps)
# For 'Electrical' it is enough to remove the rows where the value is missing.
train = train.drop(index=train.loc[train['Electrical'].isnull()].index)
# check if any NANs remaining
train.isnull().sum().max()

In [None]:
# Rows of the missing-value summary for columns with more than one null entry.
has_multiple_gaps = missing['Missing'] > 1
mis = missing.loc[has_multiple_gaps]
mis

In [None]:
# Drop from the test set the columns that were dropped from the training set.
# Only columns actually present in `test` are selected, and `columns=` replaces
# the positional axis argument that recent pandas versions reject.
test = test.drop(columns=[col for col in test.columns if col in mis.index])

In [None]:
# remaining missing values in test set
# The 15 test-set columns with the most missing values still left.
test.isna().sum().sort_values(ascending=False).head(15)

In [None]:
# Impute the remaining missing values in the test set.
# Every column that still contains a null is handled (not just the first 15),
# so no NaN can reach the models further down.
miss = test.columns[test.isnull().any()]
fill_values = {
    # numeric columns -> median, everything else (categorical) -> most frequent value
    col: test[col].median()
    if pd.api.types.is_numeric_dtype(test[col])
    else test[col].mode()[0]
    for col in miss
}
# Fill on the frame itself: `test[col].fillna(..., inplace=True)` acts on a
# temporary object and does not update `test` under pandas Copy-on-Write.
test = test.fillna(value=fill_values)

## Outliers

In [None]:
# Scatter plot of living area against sale price, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
ax.set_ylabel('SalePrice')
ax.set_xlabel('GrLivArea')

In [None]:
# Side-by-side regression fits: all rows (left) versus rows with
# GrLivArea below 4500 (right).
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))

ax1.set_title('With Outliers')
sns.regplot(x='GrLivArea', y='SalePrice', data=train, ax=ax1)

without_large_areas = train.loc[train['GrLivArea'] < 4500]
ax2.set_title('Without Outliers')
sns.regplot(x='GrLivArea', y='SalePrice', data=without_large_areas, ax=ax2)

This is a remarkable change: simply removing two points has visibly shifted the fitted line, despite the weight of the remaining data. Furthermore, the confidence interval of the fit has narrowed considerably.

In [None]:
# Index labels of the rows treated as outliers and removed from the training set.
# NOTE(review): the name suggests these came from a Bonferroni outlier test - confirm the source.
bonf_outlier = [88, 462, 523, 588, 632, 968, 1298, 1324]
train = train.drop(index=bonf_outlier)

## Our response:  SalePrice


In [None]:
# Plot the distribution of the target variable SalePrice.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot`/`displot` once the installed seaborn version is confirmed.
sns.distplot(train['SalePrice'])

The target variable is right-skewed.

In [None]:
# Quantify the asymmetry of the target's distribution.
saleprice_skew = train['SalePrice'].skew()
print('Skewness: ', saleprice_skew)

### Correlation matrix

In [None]:
# Heatmap of the pairwise correlations between the numeric features.
f, ax = plt.subplots(figsize=(12, 9))
# `train` still holds string-valued categorical columns at this point, and
# recent pandas versions raise when corr() meets them, so restrict the frame
# to numeric/boolean columns explicitly.
numeric_train = train.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_train.corr(), cmap="YlGnBu", ax=ax)

The variables `TotalBsmtSF` and `1stFlrSF` are highly correlated, and so are the variables `GarageArea` and `GarageCars`. They may give rise to the problem of multicollinearity.

In [None]:
# Correlation matrix of the numeric features. Non-numeric columns are excluded
# explicitly because recent pandas versions raise when corr() meets strings.
c = train.select_dtypes(include=['number', 'bool']).corr()
# The ten features most positively correlated with SalePrice (SalePrice itself included).
c['SalePrice'].sort_values(ascending=False).head(10)

In [None]:
# One-hot encode the categorical variables of both data sets.
train, test = pd.get_dummies(train), pd.get_dummies(test)
# Keep only the columns present in both frames (inner join on the column
# axis) so that train and test end up with identical feature columns.
final_train, final_test = train.align(other=test, axis=1, join='inner')

In [None]:
# Feature matrices: the aligned frames without the 'Id' column.
X_train = final_train.drop(columns='Id')
X_test = final_test.drop(columns='Id')
# The target is read from `train`, not from the aligned frame.
y_train = train['SalePrice']

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, RidgeCV

In [None]:
# Ordinary least squares; the displayed score is R^2 on the training data.
linreg = LinearRegression().fit(X_train, y_train)
linreg.score(X_train, y_train)

In [None]:
# Ridge regression with a fixed regularisation strength; score is R^2 on the training data.
linrid = Ridge(alpha=20.0).fit(X_train, y_train)
linrid.score(X_train, y_train)

In [None]:
# Ridge regression that picks its regularisation strength by cross-validation
# from the candidate list below.
candidate_alphas = [0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60]
rid = RidgeCV(alphas=candidate_alphas).fit(X_train, y_train)
alpha = rid.alpha_
print('best alpha',alpha)
rid.score(X_train, y_train)

In [None]:
# Lasso regression; its test-set predictions are kept in `y_pred` for the
# submission, and the displayed score is R^2 on the training data.
linlasso = Lasso(alpha=2.0, max_iter=10000).fit(X_train, y_train)
y_pred = linlasso.predict(X_test)
linlasso.score(X_train, y_train)

In [None]:
# Pair each test-set Id with its predicted SalePrice and write the result
# to 'house.csv' without the index column.
submission = test[['Id']].assign(SalePrice=y_pred)
submission.to_csv('house.csv', index=False)