In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


## 1. Understand the dataset:

In [None]:
# Load the dataset, using the 'Id' column as the row index.
DATA_PATH = './Dataset/PEP1.csv'
df = pd.read_csv(DATA_PATH, index_col='Id')
df.head()

### Printing column names

In [None]:
# Column labels of the loaded frame ('Id' is the index, so it is not listed).
df.columns

### Identify the shape of the dataset

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

### check for null values

In [None]:
# Flag every column that holds at least one missing value, then list the
# flagged column names in alphabetical order.
has_null = df.isna().any()
mask = has_null.tolist()
columns_with_null = df.columns[mask]
columns_with_null.sort_values()

In [None]:
# Overall count of missing cells in the whole frame.
print('total number of null values:', df.isna().sum().sum(), '\n')

# Per-column missing counts (largest first), restricted to the columns that
# actually contain nulls.
missing_number = df[columns_with_null].isna().sum().sort_values(ascending=False)
# Multiply by 100 so the values are true percentages (0-100) and match the
# 'missing percentage' column label.
missing_percentage = (missing_number / len(df) * 100).round(2)
missing_info = pd.concat([missing_number, missing_percentage], axis=1, keys=['missing number', 'missing percentage'])
missing_info


##### The columns PoolQC, MiscFeature, Alley, Fence and FireplaceQu have a significant percentage of missing data

### Identify variables with unique values

In [None]:
# Print the distinct values found in each column, one column per line.
for col, values in df.items():
    print(col, ":", values.unique())

## Select the numerical and categorical variables

In [None]:
# Split the columns by dtype: int64/float64 columns are treated as numerical,
# object-dtype columns as categorical.
numerical_col = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_col = df.select_dtypes(include='object').columns.tolist()

# Labels use the same "<name>: \n" layout for both lists.
print('Numerical: \n', numerical_col, '\n')
print('Categorical: \n', categorical_col, '\n')
print('# of numerical columns', len(numerical_col))
print('# of categorical columns', len(categorical_col))

## 3. EDA numerical variables

In [None]:
# Sub-frame holding only the columns listed in numerical_col.
df_numerical = df[numerical_col]

# 3a. Missing value treatment

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing numeric values with the mean of their column.
imputer = SimpleImputer(strategy='mean')

# fit_transform returns a bare NumPy array, so restore both the column
# labels and the original 'Id' index. Without the index the result would be
# numbered 0..n-1 and any later index-based operation (e.g. DataFrame.join)
# would pair rows with the wrong records.
imputed_numerical = pd.DataFrame(
    imputer.fit_transform(df_numerical),
    columns=df_numerical.columns,
    index=df_numerical.index,
)

print(imputed_numerical.head())


# 3b Identify the skewness and distribution

In [None]:
# Skewness of every numerical column, computed column-wise (axis=0) on the
# un-imputed data with missing values skipped (skipna=True).
df_numerical.skew(axis = 0, skipna=True)

In [None]:
# Skewness of the SalePrice column, printed with six decimal places.
sale_price_skew = df['SalePrice'].skew()
print(f"Skewness: {sale_price_skew:f}")

In [None]:
# Display the SalePrice column.
df['SalePrice']

In [None]:
# Pairwise plots across df with kernel-density estimates on the diagonal.
# NOTE(review): df is passed without `vars`, so this presumably draws a grid
# over every numeric column and may be very slow — confirm this is intended.
sns.pairplot(df, diag_kind='kde')
plt.show()

In [None]:
from scipy.stats import skew
# Skewness of the imputed numerical data via SciPy; bias=False requests the
# bias-corrected statistic. The printed result carries no column labels.
print(skew(imputed_numerical, bias=False))

In [None]:
# Kernel-density estimate of SalePrice from the imputed data, drawn with a
# base-10 logarithmic scale (log_scale=10).
sns.displot(data=imputed_numerical, x='SalePrice', kind="kde", aspect=1.4, log_scale=10 )

In [None]:
from scipy.stats import kurtosis
# Kurtosis of the imputed numerical data, using SciPy's default settings.
print(kurtosis(imputed_numerical))

# 3c Identify significant variables using a correlation matrix 

In [None]:
# Pairwise correlation matrix of the imputed numerical columns
# (DataFrame.corr with its default settings).
numerical_corr = imputed_numerical.corr()
# Render the matrix as a colour-graded table using the 'coolwarm' colormap.
numerical_corr.style.background_gradient(cmap='coolwarm')
# Plain-text alternative: print(numerical_corr)

In [None]:
# Collect the numerical features whose absolute correlation with SalePrice
# is at least 0.5.
salePrice_corr = numerical_corr['SalePrice']

# Build a label-based boolean mask instead of looping over integer positions:
# `series[i]` on a label-indexed Series is deprecated positional access.
# SalePrice itself is dropped so the target is never reported as a feature.
is_strong = salePrice_corr.drop(labels='SalePrice').abs() >= 0.5
salePrice_corr_feat = set(is_strong[is_strong].index)

print('Variables highly correlated with SalePrice:, \n', salePrice_corr_feat)


# 3d. Pair plot for distribution and density

In [None]:
import seaborn as sns
# Numerical features kept for the pair plot and for the combined frame.
cols = [
    'FullBath', 'YearRemodAdd', 'GarageArea', '1stFlrSF', 'GrLivArea',
    'TotRmsAbvGrd', 'OverallQual', 'YearBuilt', 'TotalBsmtSF', 'GarageCars',
]
significant_numerical_var = imputed_numerical.loc[:, cols]

# Plots for every pair of columns in `cols`, with KDE curves on the diagonal.
sns.pairplot(data=df, vars=cols, diag_kind='kde')
plt.show()

# 4 EDA of Categorical Variables

In [None]:
# Sub-frame holding only the columns listed in categorical_col, followed by
# those column names in sorted order.
df_categorical = df[categorical_col]
df_categorical.columns.sort_values()

# A. Missing value treatment

In [None]:
# Flag the categorical columns that hold at least one missing value, then
# list the flagged column names in alphabetical order.
categorical_has_null = df_categorical.isna().any()
mask = categorical_has_null.tolist()
categorical_with_null = df_categorical.columns[mask]
categorical_with_null.sort_values()

In [None]:
# Total number of missing cells across all categorical columns.
total_categorical_nulls = df_categorical.isna().sum().sum()
print('total number of categorical null values:', total_categorical_nulls, '\n')


# Per-column null counts, largest first.
null_counts = df[categorical_with_null].isna().sum()
missing_number_categorical = null_counts.sort_values(ascending=False)
# Share of rows missing, rounded to 4 decimals and then scaled to percent.
missing_percentage_categorical = (missing_number_categorical / len(df)).round(4) * 100
missing_info_categorical = pd.DataFrame({
    'missing number': missing_number_categorical,
    'missing percentage': missing_percentage_categorical,
})
missing_info_categorical

### dropping variables with large percentage of missing values

In [None]:
# Columns with a large percentage of missing values are removed outright.
sparse_columns = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
df_categorical = df_categorical.drop(columns=sparse_columns)
# Number of missing cells that remain after the drop.
df_categorical.isna().sum().sum()

In [None]:
# Fill the gaps in each categorical column with that column's mode.
# mode() can return several rows when values tie; the first row is used.
column_modes = df_categorical.mode().iloc[0]
df_categorical = df_categorical.fillna(value=column_modes)
df_categorical.head()

In [None]:
# Attach the SalePrice column, looked up by df_categorical's own row index.
sale_price = df.loc[df_categorical.index, 'SalePrice'].copy()
df_categorical['SalePrice'] = sale_price
df_categorical.head()

# Bar plots of SalePrice per category for bivariate analysis

In [None]:
# Bar plots of SalePrice against each categorical feature, two per figure.
# SalePrice is removed up front so it can never be plotted against itself,
# and slicing in pairs copes with an odd number of features (indexing
# `columns[i + 1]` directly would run past the end of the column list).
plot_features = [c for c in df_categorical.columns if c != 'SalePrice']

for i in range(0, len(plot_features), 2):
    pair = plot_features[i:i + 2]
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    for ax, feature in zip(axes, pair):
        sns.barplot(y='SalePrice', x=feature, data=df_categorical, ax=ax)
        # Rotate after plotting so the rotation applies to the final labels.
        ax.tick_params(axis='x', labelrotation=90)
    # Blank the right-hand panel when the last pair holds a single feature.
    for ax in axes[len(pair):]:
        ax.set_visible(False)
    fig.tight_layout()
    plt.show()

# 4C Identify significant variables using p-values and Chi-Square values


In [None]:
import numpy as np
import scipy.stats as stats
from scipy.stats import chi2_contingency

class ChiSquare:
    """Chi-square test of independence between two columns of a dataframe.

    Each call to ``TestIndependence`` stores the latest test results on the
    instance and files the tested column under ``important_var`` or
    ``unimportant_var`` depending on its p-value.
    """

    def __init__(self, dataframe):
        """Keep a reference to ``dataframe`` and reset every result holder."""
        self.df = dataframe

        # Results of the most recent test.
        self.p = None      # p-value
        self.chi2 = None   # chi-square test statistic
        self.dof = None    # degrees of freedom

        # Column names accumulated across calls.
        self.important_var = []
        self.unimportant_var = []

        # Observed / expected contingency tables of the most recent test.
        self.dfObserved = None
        self.dfExpected = None

    def _select_significant_var(self, colX, alpha):
        """File ``colX`` as important when the stored p-value is below ``alpha``,
        otherwise as unimportant."""
        bucket = self.important_var if self.p < alpha else self.unimportant_var
        bucket.append(colX)

    def TestIndependence(self, colX, colY, alpha=0.05):
        """Run a chi-square independence test of ``colX`` against ``colY``.

        Both columns are cast to strings and cross-tabulated (``colY`` values
        as rows, ``colX`` values as columns). The statistic, p-value, degrees
        of freedom and both contingency tables are stored on the instance,
        and ``colX`` is then classified using ``alpha`` as the cut-off.
        """
        col_values = self.df[colX].astype(str)
        row_values = self.df[colY].astype(str)

        observed = pd.crosstab(row_values, col_values)
        self.dfObserved = observed

        result = stats.chi2_contingency(observed.values)
        self.chi2, self.p, self.dof, expected = result

        self.dfExpected = pd.DataFrame(
            expected,
            index=observed.index,
            columns=observed.columns,
        )

        self._select_significant_var(colX, alpha)


# Initialise the tester on the categorical frame (which also carries SalePrice).
cT = ChiSquare(df_categorical)

# Test every categorical feature against SalePrice. The target itself is
# skipped: testing a column against itself is trivially "significant" and
# would list SalePrice among the important features.
for var in df_categorical.columns:
    if var == "SalePrice":
        continue
    cT.TestIndependence(colX=var, colY="SalePrice")

print(cT.important_var)

In [None]:
# Keep only the significant categorical variables, plus the SalePrice target.
significant_columns = [
    'MSZoning', 'Street', 'LotShape', 'LotConfig', 'Neighborhood',
    'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'Heating', 'CentralAir', 'KitchenQual',
    'GarageFinish', 'SaleType', 'SaleCondition', 'SalePrice',
]
df_categorical = df_categorical.loc[:, significant_columns]
df_categorical.head(2)

In [None]:
# Hand-picked categorical variables to carry forward.
# NOTE(review): the selection criterion was given only as "lower than 0.5";
# confirm which statistic that threshold refers to.
selected_significant_var = ['Neighborhood', 'BldgType', 'HeatingQC', 'GarageType', 'LandSlope', 'HouseStyle', 'Exterior2nd', 'LotShape', 'BsmtFinType1', 'Exterior1st']

# Most of these columns are no longer present in df_categorical (it has been
# narrowed to a fixed list of significant columns), so selecting from it
# raises KeyError. Take them from the raw frame instead and apply the same
# per-column mode imputation used for the categorical data.
selected_raw = df[selected_significant_var]
significant_categorical_var = selected_raw.fillna(selected_raw.mode().iloc[0])
significant_categorical_var

# 5 Combine all the significant categorical and numerical variables


In [None]:
# Combine the retained categorical columns (plus SalePrice) with the
# significant numerical columns. DataFrame.join matches rows on the index.
# NOTE(review): confirm significant_numerical_var carries the same 'Id' index
# as df_categorical; rows whose labels differ would be paired incorrectly or
# filled with NaN.
final = df_categorical.join(significant_numerical_var)
final

In [None]:
# Box plots of SalePrice for every categorical feature, laid out on a grid.
box_features = [item for item in df_categorical.columns if item != 'SalePrice']

# Size the grid from the number of features rather than hard-coding 3 x 6,
# so adding or removing a column cannot overrun the axes array. The figure
# height keeps the original 7.5 units per row (45 for six rows).
grid_cols = 3
grid_rows = max(1, -(-len(box_features) // grid_cols))  # ceiling division
fig, axs = plt.subplots(
    ncols=grid_cols,
    nrows=grid_rows,
    figsize=(35, 7.5 * grid_rows),
    squeeze=False,
)
axs = axs.flatten()

for index, item in enumerate(box_features):
    sns.boxplot(y='SalePrice', x=item, data=df_categorical, ax=axs[index])

# Hide any grid cells that did not receive a plot.
for unused_ax in axs[len(box_features):]:
    unused_ax.set_visible(False)

plt.show()