<a href="https://colab.research.google.com/github/greatkay-olowo/boston-housing-eda/blob/main/EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Load the housing dataset from 'hou_all.csv' (path is relative to the
# notebook's working directory).
df = pd.read_csv('hou_all.csv')

# Data dictionary for the dataset, kept as a bare string literal. Being the
# last expression of the cell, it is also echoed as the cell's output.
# NOTE(review): the names below are upper-case, while later cells access
# lower-case columns ('zn', 'chas', 'medv', 'black', ...) - confirm the
# header used in the CSV.
"""
CRIM: Per capita crime rate by town
ZN: Proportion of residential land zoned for lots over 25,000 sq. ft
INDUS: Proportion of non-retail business acres per town
CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
NOX: Nitric oxide concentration (parts per 10 million)
RM: Average number of rooms per dwelling
AGE: Proportion of owner-occupied units built prior to 1940
DIS: Weighted distances to five Boston employment centers
RAD: Index of accessibility to radial highways
TAX: Full-value property tax rate per $10,000
PTRATIO: Pupil-teacher ratio by town
B: 1000(Bk — 0.63)², where Bk is the proportion of [people of African American descent] by town
LSTAT: Percentage of lower status of the population
MEDV: Median value of owner-occupied homes in $1000s
"""

In [None]:
# Preview the first 10 rows of the dataset.
df.head(n=10)

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Remove the 'Unnamed: 14' column, then re-check the frame summary.
df = df.drop(columns=['Unnamed: 14'])
df.info()

In [None]:
# Inspect a random sample of 20 rows. A fixed random_state keeps the rows
# shown (and the observation drawn from them) identical on every re-run.
# Observation: zn and chas contain many zeros.
df.sample(20, random_state=42)

In [None]:
# Replace the zeros in zn and chas with NaN so the share of such entries can
# be measured as a null percentage.
# The columns are rebuilt and assigned back to the frame: calling
# `df.zn.replace(..., inplace=True)` works on a temporary Series, which warns
# in recent pandas and leaves `df` untouched under Copy-on-Write.
# NOTE(review): per the data dictionary, chas is a 0/1 dummy variable, so its
# zeros are genuine values rather than missing data - confirm that treating
# them as NaN is intended.
df = df.assign(
    zn=df['zn'].replace(0, np.nan),
    chas=df['chas'].replace(0, np.nan),
)
df.info()

In [None]:
# Percentage of null entries in each column.
null_counts = df.isna().sum()
null_counts.div(len(df)).mul(100)

# Both zn and chas have a high ratio of NaN.

In [None]:
# Discard the zn and chas columns, then re-check the frame summary.
df = df.drop(columns=['zn', 'chas'])
df.info()

In [None]:
# Show the distribution of medv as a density histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' (plus cut=3 for the KDE) is its documented replacement.
sns.histplot(df['medv'], bins=30, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Exclude the rows whose medv is 50.0 or more, which are treated as outliers.
is_top_value = df['medv'].ge(50.0)
df = df[~is_top_value]
df.info()

In [None]:
# Show the distribution of medv again, after removing the rows at 50 that
# were treated as outliers.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat='density' (plus cut=3 for the KDE) is its documented replacement.
sns.histplot(df['medv'], bins=30, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
  # Print, for every column, the percentage of values flagged as outliers by
  # the 1.5 * IQR rule.
  # A value is an outlier only when it lies strictly outside the fences
  # [q1 - 1.5 * IQR, q3 + 1.5 * IQR]. With inclusive comparisons, a column
  # whose IQR is 0 would have its majority value (equal to both fences)
  # reported as an outlier; strict comparisons also match how box plots draw
  # their fliers.
  for k, v in df.items():
      q1 = v.quantile(0.25)
      q3 = v.quantile(0.75)
      iqr = q3 - q1
      lower_fence = q1 - 1.5 * iqr
      upper_fence = q3 + 1.5 * iqr
      outliers = v[(v < lower_fence) | (v > upper_fence)]
      # An empty frame has no outliers; avoid dividing by zero.
      perc = len(outliers) * 100.0 / len(df) if len(df) else 0.0
      print("Column %s outliers = %.2f%%" % (k, perc))

In [None]:
# Draw one box plot per column on a 2 x 6 grid to visualise the outliers.
fig, axs = plt.subplots(nrows=2, ncols=6, figsize=(10, 10))
axs = axs.flatten()
for position, column in enumerate(df.columns):
    # Each column gets the next free axis of the flattened grid.
    sns.boxplot(data=df, y=column, ax=axs[position])
plt.tight_layout(pad=0.5, w_pad=5, h_pad=5.0)

In [None]:
# Annotated heatmap of the pairwise column correlations, rounded to two
# decimal places.
correlations = df.corr().round(2)
plt.figure(figsize=(20, 10))
sns.heatmap(correlations, annot=True)

In [None]:
# Scatter medv against each of the listed features, one panel per feature.
# hue is intentionally not set: medv is continuous and is already the y axis,
# and pairplot treats hue levels as categories, so hue='medv' would create one
# colour and one legend entry for every distinct medv value.
sns.pairplot(
    data=df,
    x_vars=['indus', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat'],
    y_vars=['medv'],
)
# medv is strongly correlated with both rm and lstat.

In [None]:
# Drop the columns that do not show a strong correlation with medv.
# DataFrame.drop returns a new frame, so the result has to be assigned back;
# without the assignment df would keep every column.
df = df.drop(columns=['indus', 'nox', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black'])
df.info()

In [None]:
# Relationship plot of rm against lstat, with the points coloured by medv.
sns.relplot(data=df, x='rm', y='lstat', hue='medv')