The goal of your analysis is to identify the most important features of houses that affect the sale prices.

In [1]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats
import pandas as pd
import numpy as np
from statsmodels.multivariate.pca import PCA
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Read the training set from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='train.csv')
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Explore data and understand what the fields mean.

In [None]:
# Print every column name so the available fields can be scanned at a glance.
print(data.columns.tolist())

Data cleaning & manipulation. Apply the following techniques as appropriate:

    Adjust skewed data distribution.
    Remove columns with high proportion of missing values.
    Remove records with missing values.
    Feature reduction.
    Convert categorical data to numerical.

Remove/take care of missing values

In [None]:
# Number of missing values per column, largest count first.
null_cols = data.isna().sum().sort_values(ascending=False)
# Show only the columns that actually contain missing values.
null_cols.loc[null_cols.gt(0)]

In [None]:
# Drop Alley, PoolQC, Fence and MiscFeature: too many of their values are missing.
# Rebinding `data` (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError for columns that
# have already been removed.
data = data.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence'], errors='ignore')

In [None]:
# FireplaceQu
# A missing FireplaceQu is interpreted as "no fireplace". Check that reading
# explicitly: rows without a quality rating must report zero fireplaces in total.
# (Previously this sum was computed and silently discarded.)
missing_fireplace_quality = data['FireplaceQu'].isna()
assert data.loc[missing_fireplace_quality, 'Fireplaces'].sum() == 0, \
    'Some rows report fireplaces but have no FireplaceQu value'
# Put 'NF' where there is no fireplace.
data['FireplaceQu'] = data['FireplaceQu'].fillna('NF')

In [None]:
# LotFrontage
# Impute missing lot frontage with the median of the observed values.
lot_frontage_median = data['LotFrontage'].median()
data['LotFrontage'] = data['LotFrontage'].fillna(lot_frontage_median)

In [None]:
# Garage
# Categorical garage descriptors: a missing value is replaced by the placeholder 'NG'.
garage_cat_cols = ['GarageCond', 'GarageType', 'GarageFinish', 'GarageQual']
data[garage_cat_cols] = data[garage_cat_cols].fillna('NG')

# GarageYrBlt is numeric. Filling it with the string 'NG' would turn it into a
# mixed-type object column, which drops it from the numeric analysis and makes
# get_dummies create one dummy column per distinct year. Keep it numeric by falling
# back to the year the house was built; the 'NG' categories above still flag
# the rows whose garage information was missing.
data['GarageYrBlt'] = data['GarageYrBlt'].fillna(data['YearBuilt'])

In [None]:
# Bsmt
# Replace missing values in each basement descriptor with the placeholder 'NB'
# (presumably "no basement" -- confirm against the data description).
for bsmt_col in ['BsmtExposure', 'BsmtFinType2', 'BsmtFinType1', 'BsmtCond', 'BsmtQual']:
    data[bsmt_col] = data[bsmt_col].fillna('NB')

In [None]:
# MasVnr
# MasVnrArea: missing values take the column median.
masvnr_area_median = data['MasVnrArea'].median()
data['MasVnrArea'] = data['MasVnrArea'].fillna(masvnr_area_median)
# MasVnrType: missing values take the most frequent category (the mode).
masvnr_type_mode = data['MasVnrType'].mode()[0]
data['MasVnrType'] = data['MasVnrType'].fillna(masvnr_type_mode)

In [None]:
# Electrical
# Category frequencies (not displayed: this is not the last expression of the cell).
data['Electrical'].value_counts()

# Missing values take the most frequent category (the mode).
electrical_mode = data['Electrical'].mode()[0]
data['Electrical'] = data['Electrical'].fillna(electrical_mode)

In [None]:
# Recount the missing values per column after imputation, largest count first.
null_cols = data.isna().sum(axis=0).sort_values(ascending=False)
# Any column still listed here has missing values left to handle.
null_cols[null_cols.gt(0)]

Outliers

In [8]:
# Names of the numeric columns, obtained through the public select_dtypes API
# instead of the private DataFrame._get_numeric_data helper.
num_data_cols = data.select_dtypes(include=np.number).columns

In [16]:
# Draw one box plot per numeric column to spot outliers visually.
# (The loop previously contained only comments, which is a SyntaxError.)
n_plot_cols = 4
n_plot_rows = int(np.ceil(len(num_data_cols) / n_plot_cols))
fig, axes = plt.subplots(
    n_plot_rows, n_plot_cols, figsize=(16, 3 * n_plot_rows), squeeze=False
)
flat_axes = axes.ravel()
for ax, col in zip(flat_axes, num_data_cols):
    sns.boxplot(x=data[col], ax=ax)
    ax.set_title(col)
    ax.set_xlabel('')
# Hide the unused panels of the last grid row.
for ax in flat_axes[len(num_data_cols):]:
    ax.set_visible(False)
fig.tight_layout()
plt.show()

Examine the relationships between the sales price and other features in the dataset. 
Use data visualization techniques to help you gain intuitive understanding of the relationships.

In [None]:
# Summary statistics of the sale price.
data['SalePrice'].describe()

In [None]:
# Price distribution
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the equivalent figure.
sns.histplot(data['SalePrice'], kde=True, stat='density')
plt.show()

# It seems there is some skewness
stats.skew(data['SalePrice'], axis=0, bias=True)

In [None]:
# Correlation of SalePrice with every numeric variable, sorted ascending.
# The frame still holds text columns at this point, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so restrict it to the numeric columns first.
numeric_data = data.select_dtypes(include=np.number)
print(numeric_data.corr().loc['SalePrice', :].sort_values())

In [None]:
# Pairwise scatter plots for the columns listed in `cols`.
sns.set()
cols = [
    'SalePrice',
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'GarageArea',
    'TotalBsmtSF',
    'FullBath',
    'YearBuilt',
    'TotRmsAbvGrd',
]
pairplot_data = data[cols]
sns.pairplot(pairplot_data, height=2.5)
plt.show()

Make an informed guess about which features should be investigated in depth.

In [None]:
# OUTLIERS
# Keep only the rows whose 1stFlrSF, TotalBsmtSF and GrLivArea are all below 4000.
below_limit = (
    data['1stFlrSF'].lt(4000)
    & data['TotalBsmtSF'].lt(4000)
    & data['GrLivArea'].lt(4000)
)
data = data[below_limit]

We should investigate in depth:

    OverallQual
    GarageCars
    1stFlrSF
    YearBuilt
    FullBath
    GrLivArea

as they are the features most correlated with our target variable SalePrice. 
We do not keep every highly correlated feature, because some pairs carry nearly the same information: 
GarageCars/GarageArea; 1stFlrSF/TotalBsmtSF

In [None]:
# Regression
# Ordinary least squares of SalePrice on the six selected features plus an intercept.
# The predictors are kept as a DataFrame (no .values) so the summary labels each
# coefficient with its column name instead of the anonymous x1..x6.
regression_features = ['OverallQual', 'GarageCars', '1stFlrSF', 'YearBuilt', 'FullBath', 'GrLivArea']
X = sm.add_constant(data[regression_features])

Y = data['SalePrice']

model = sm.OLS(Y, X).fit()
# In-sample fitted values.
predictions = model.predict(X)

print_model = model.summary()
print(print_model)

In [None]:
# Scatter SalePrice against each of the first four columns in `cols`, one panel each.
cols = ['1stFlrSF', 'YearBuilt', 'TotalBsmtSF', 'GrLivArea', 'SalePrice']
df = data[cols]
fig, axes = plt.subplots(figsize=(16, 8), nrows=1, ncols=4)
for i, panel in enumerate(axes[:4]):
    feature = df.iloc[:, i]
    # Make this panel the current axes for the pyplot calls below.
    plt.sca(panel)
    plt.scatter(feature, df['SalePrice'], alpha=0.8)
    plt.xlabel(feature.name)
    plt.ylabel('SalePrice')

In [None]:
# Conversion of categorical data: one-hot encode the non-numeric columns.
data = pd.get_dummies(data=data)

There are no more missing values, so we can now standardize the data.

In [None]:
#Standardization
def standardize(x):
    """Center each column of ``x`` on zero and scale it to unit variance.

    Parameters
    ----------
    x : array-like or pandas DataFrame
        Numeric data; statistics are computed along axis 0 (per column).

    Returns
    -------
    Same kind of object as ``x - mean``: the z-scores ``(x - mean) / std``
    using the population standard deviation (``np.std`` default, ddof=0).
    Columns with zero variance are returned as all zeros instead of NaN/inf.
    """
    mean = np.mean(x, axis=0)
    std = np.std(x, axis=0)
    # A constant column has std == 0; dividing by it would produce NaN (0/0).
    # Divide by 1 instead so the centered column stays at 0.
    safe_std = np.where(std == 0, 1.0, std)
    return (x - mean) / safe_std
# Categorical data must already be converted to numbers before this step.
standardized = data.pipe(standardize)
standardized

Present your findings in statistical summary and/or data visualizations.

In [None]:
# Fit a two-component PCA on the standardized data.
pc = PCA(standardized, ncomp=2)
# Multiply the transposed data by the two factor series: the result has one row
# per column of `standardized` and one column per component.
product = np.asarray(standardized.T) @ np.asarray(pc.factors)

In [None]:
# Display the array computed above: one row per column of `standardized`, one column per component.
product

In [None]:
# One row per feature (column of `standardized`), one column per principal component.
# The feature names are kept as the index so each point can be traced back to a
# feature, e.g. df_pca.sort_values('pc1').
df_pca = pd.DataFrame(product, index=standardized.columns, columns=['pc1', 'pc2'])
plt.scatter(df_pca.pc1, df_pca.pc2)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.title('Features projected on the first two principal components')
plt.show()