In [None]:
#invite people for the Kaggle party
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px

%matplotlib inline
#Load datasets for demonstrations
 


In [None]:
# Load the house-price data set; the relative path is resolved against the
# notebook's current working directory.
house_data = pd.read_csv('houseprices.csv')


In [None]:
# Summary statistics: first for the SalePrice column alone, then for the frame.
sale_price_summary = house_data['SalePrice'].describe()
print(sale_price_summary)
frame_summary = house_data.describe()
print(frame_summary)

In [None]:
# Distribution plot
def distribution_plot(data):
    """Plot the distribution of a numeric Series together with a normal fit.

    Draws a density-normalised histogram with a KDE overlay and adds the
    normal curve whose parameters are fitted to the data with
    ``scipy.stats.norm.fit``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to plot. ``data.name`` is used in the title. Missing
        values are dropped before plotting and fitting.
    """
    values = data.dropna()
    # `sns.distplot` is deprecated; `histplot` plus an explicit normal fit
    # reproduces what `distplot(..., fit=norm)` used to draw.
    ax = sns.histplot(x=values, stat='density', kde=True)
    mu, sigma = norm.fit(values)
    grid = np.linspace(values.min(), values.max(), 200)
    ax.plot(grid, norm.pdf(grid, mu, sigma), color='black')
    # The histogram is normalised, so the y-axis shows a density, not a count.
    ax.set_ylabel('Density')
    ax.set_title(f'{data.name} distribution')

distribution_plot(house_data['SalePrice'])

In [None]:
# Skewness and kurtosis of the sale price.
sale_price = house_data['SalePrice']
print("Skewness: %f" % sale_price.skew())
print("Kurtosis: %f" % sale_price.kurt())

In [None]:

# Describe the column with the dtype it was loaded with ...
overall_qual_loaded = house_data['OverallQual']
print(overall_qual_loaded.describe())

# ... then keep a categorical version of it for the following cells.
OverallQual = overall_qual_loaded.astype('category')

# Peek at the first rows (use .tail() for the last ones).
OverallQual.head()

In [None]:
# Summary of the categorical OverallQual series built in the previous cell.
OverallQual.describe()


In [None]:
# Name, number of distinct values and the distinct values of the column.
column = OverallQual
print('Column Name:{}\nCardinality:{}\nValues:{}'.format(
    column.name,
    column.nunique(),
    column.unique(),
))

In [None]:
# Frequency table: one row per category with its number of occurrences.
# The index and the counts are named explicitly instead of renaming the
# columns produced by reset_index(), because those default names differ
# between pandas versions ('index'/'OverallQual' vs 'OverallQual'/'count').
OverallQual.value_counts().rename_axis('OverallQual').reset_index(name='amount')


In [None]:
def getPlotsforCatFeature(series,figX=25,figY=17):
    """Show a pie chart and a count plot of a Series side by side.

    Parameters
    ----------
    series : pandas.Series
        Values to summarise; ``series.name`` is used in both titles.
    figX, figY : number
        Width and height of the figure passed to ``plt.subplots``.
    """
    fig, (pie_ax, count_ax) = plt.subplots(1, 2, figsize=(figX, figY))

    # Left panel: share of each value.
    series.value_counts().plot.pie(autopct='%1.6f%%', ax=pie_ax)
    pie_ax.set_title(f'{series.name}')
    pie_ax.set_ylabel('')

    # Right panel: absolute counts. The Series is passed as `x=` because
    # newer seaborn interprets the first positional argument as `data`.
    sns.countplot(x=series, ax=count_ax)
    count_ax.set_title(f'Count plot for {series.name}')
    plt.show()

getPlotsforCatFeature(OverallQual,25,20)

In [None]:
# Scatter plot of living area against sale price.
#
# Alternatively you could use the following function:
#
#     def scatterplot(seriesX, seriesY):
#         data = pd.concat([seriesY, seriesX], axis=1)
#         data.plot.scatter(x=seriesX.name, y=seriesY.name)
#
#     scatterplot(house_data['GrLivArea'], house_data['SalePrice'])
#
# The alternative is kept as `#` comments: as a bare string literal at the end
# of the cell it was the cell's last expression and got echoed as output.
house_data.plot.scatter(x='GrLivArea', y='SalePrice');

In [None]:
# Box plot of a numeric column grouped by a categorical one.
num = 'SalePrice'
cat = 'OverallQual'
df = house_data

# Only the two plotted columns are needed.
data = df[[num, cat]]
f, ax = plt.subplots(figsize=(8, 6))
box_ax = sns.boxplot(x=cat, y=num, data=data, ax=ax)
# Fix the y-range so the boxes stay readable.
box_ax.set_ylim(0, 800000);

In [None]:
def boxplot(x, y, **kwargs):
    """Draw a seaborn box plot of ``y`` grouped by ``x`` with vertical x tick labels.

    Extra keyword arguments are accepted but not used.
    """
    sns.boxplot(x=x, y=y)
    plt.xticks(rotation=90)

def fillMissingCatColumns(data,categorical):
    """Convert the given columns to ``category`` dtype, in place.

    Columns that contain missing values additionally get a ``'MISSING'``
    category, and their missing entries are replaced by it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; its columns are overwritten.
    categorical : iterable of str
        Names of the columns to convert.
    """
    for col_name in categorical:
        as_category = data[col_name].astype('category')
        data[col_name] = as_category
        if as_category.isnull().any():
            with_placeholder = as_category.cat.add_categories(['MISSING'])
            data[col_name] = with_placeholder.fillna('MISSING')
    
def getboxPlots(data,var,categorical):
    """Draw one box plot of ``var`` per categorical column, two per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is modified in place: ``fillMissingCatColumns``
        converts the categorical columns and fills their missing values.
    var : str
        Name of the column plotted on the y-axis.
    categorical : list of str
        Names of the columns to group by, one facet each.
    """
    fillMissingCatColumns(data, categorical)
    # Long format: one row per (variable, value) pair, keeping `var` alongside.
    melted = pd.melt(data, id_vars=var, value_vars=categorical)
    # `height` is the current name of the FacetGrid `size` argument, which
    # newer seaborn versions no longer accept.
    grid = sns.FacetGrid(melted, col="variable", col_wrap=2, sharex=False, sharey=False, height=15)
    grid.map(boxplot, "value", var)
    

# Work on a copy: getboxPlots rewrites the categorical columns of its input.
data = house_data.copy()
categorical = [name for name, dtype in data.dtypes.items() if dtype == 'object']
getboxPlots(data, 'SalePrice', categorical)

In [None]:
def getCorrHeatMap(dataFrame,figSize=[12,9],target='SalePrice'):
    """Print the strongest correlations with ``target`` and draw a correlation heat map.

    Prints, from the ten columns most correlated with ``target``, those whose
    correlation exceeds 0.5, then plots the full correlation matrix.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Source data; only numeric and boolean columns are correlated.
    figSize : sequence of two numbers
        Figure width and height.
    target : str
        Column whose correlations are printed.
    """
    # Select the numeric columns explicitly: recent pandas raises on
    # non-numeric columns in DataFrame.corr() instead of dropping them.
    corrmat = dataFrame.select_dtypes(include=['number', 'bool']).corr()
    strongest = corrmat[target].sort_values(ascending=False).head(10)
    print(strongest[strongest > 0.5])
    f, ax = plt.subplots(figsize=(figSize[0], figSize[1]))
    sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)

getCorrHeatMap(house_data)

In [None]:
def getZoomedCorrHeatMap(dataFrame,featureCount,target,figSize=[12,9]):
    """Draw an annotated heat map of the columns most correlated with ``target``.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Source data; only numeric and boolean columns are correlated.
    featureCount : int
        Number of columns (largest correlation with ``target``) to show.
    target : str
        Column used to rank the other columns.
    figSize : sequence of two numbers
        Figure width and height.
    """
    # Select the numeric columns explicitly: recent pandas raises on
    # non-numeric columns in DataFrame.corr() instead of dropping them.
    corrmat = dataFrame.select_dtypes(include=['number', 'bool']).corr()
    cols = corrmat.nlargest(featureCount, target)[target].index
    f, ax = plt.subplots(figsize=(figSize[0], figSize[1]))
    # Reuse the correlations already computed by pandas; np.corrcoef on the
    # raw values would yield NaN for every column that has a missing value.
    cm = corrmat.loc[cols, cols].values
    sns.set(font_scale=1.25)
    sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                yticklabels=cols.values, xticklabels=cols.values, ax=ax)
    plt.show()

getZoomedCorrHeatMap(house_data,10,'SalePrice',[10,8])

In [None]:
def getMissingValuesInfo(df):
    """Summarise missing values per column.

    Returns a DataFrame indexed by 'Feature Name' with the columns
    'Total Missing Count' and '% of Total Observations' (rounded to two
    decimals), sorted by descending count and restricted to the columns that
    have at least one missing value.
    """
    missing_counts = df.isnull().sum().sort_values(ascending=False)
    missing_share = (missing_counts / len(df) * 100).round(2)
    summary = pd.concat(
        [missing_counts, missing_share],
        axis=1,
        keys=['Total Missing Count', '% of Total Observations'],
    )
    summary.index.name = 'Feature Name'
    has_missing = summary['Total Missing Count'] > 0
    return summary.loc[has_missing]

# Missing-value summary for the whole data set.
getMissingValuesInfo(df=house_data)

In [None]:
# Visualizing missing counts
null_counts = house_data.isnull().sum()
# Keep only the columns that have missing values, in ascending order of count.
missing = null_counts[null_counts > 0].sort_values()
print('missing',missing)
missing_fig, missing_ax = plt.subplots(figsize=(15, 5))
missing.plot.bar(ax=missing_ax)
plt.show()

In [None]:
# Show only the columns that contain missing values.
house_data[missing.index]

In [None]:
# Heat map of the null mask, restricted to the columns with missing values.
missing_columns = missing.index
fig, ax = plt.subplots(figsize=(25, 9))
null_mask = house_data[missing_columns].isnull()
sns.heatmap(null_mask, cbar=False, cmap="YlGnBu_r", ax=ax)
plt.show()
# The lighter cells mark the missing values in the data frame.

In [None]:
def distplots(data,num_features):
    """Plot the distribution of several numeric columns on a grid, four per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    num_features : list-like of str or pandas.DataFrame
        The columns to plot, given either as column names or as a DataFrame
        whose columns are the ones to plot.
    """
    # Normalise the argument to a plain list of column names.
    columns = list(getattr(num_features, 'columns', num_features))
    melted = pd.melt(data, value_vars=columns)
    grid = sns.FacetGrid(melted, col="variable", col_wrap=4, sharex=False, sharey=False)
    # histplot with a KDE on a density scale replaces the deprecated distplot.
    grid.map(sns.histplot, "value", kde=True, stat="density")


num_features = house_data.select_dtypes(include=['int64','float64'])
distplots(house_data,num_features)

In [None]:
# Descriptive statistics restricted to the int64/float64 columns.
numeric_dtypes = ['int64', 'float64']
num_features = house_data.select_dtypes(include=numeric_dtypes)
num_features.describe()

In [None]:
# Descriptive statistics restricted to the object-typed columns.
categorical_features = house_data.select_dtypes(include=['object'])
categorical_features.describe()

In [None]:
# Listing unique values in categorical columns
def printUniqueValues(df,cardinality=1000):
    """Print the distinct values of every object-typed column with few of them.

    For each object column whose number of distinct (non-null) values is at
    most ``cardinality``, prints the right-aligned column name, that number
    and the array of unique values.
    """
    object_columns = df.select_dtypes(include=object).columns
    for name in object_columns:
        distinct = df[name].nunique()
        if distinct > cardinality:
            continue
        print('{:>12}: {} {}'.format(name, distinct, df[name].unique()))


# Only list the object columns with at most 10 distinct values.
printUniqueValues(house_data, cardinality=10)


In [None]:
# Build a profiling report for the whole data set. The module is imported
# under its full name because the next cell calls pandas_profiling.ProfileReport too.
import pandas_profiling
profile_report = pandas_profiling.ProfileReport(house_data)
# Uncomment to also write the report to an HTML file.
#profile_report.to_file("profile_report.html")
# Last expression of the cell, so the report is displayed as the cell output.
profile_report

In [None]:
# Pandas profiling also works on a selection of features.
# Here only the SalePrice feature of the housing data set is profiled.
series = house_data['SalePrice']
# One-column frame named after the series.
df = series.to_frame()
pandas_profiling.ProfileReport(df)