In [1]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

sns.set_style('darkgrid')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [4]:
# Relative path to the housing CSV that the next cell loads with pandas.
boston_file = '../datasets/housing.csv'

In [5]:
# Read the housing CSV into a DataFrame (no column is promoted to the index)
# and preview the first five rows.
df = pd.read_csv(boston_file, index_col=None)
df.head()

Unnamed: 0.1,Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0,0.00632,18.0,2.31,0,0.538,6.575,65.2,409,1,296,15.3,396.9,4.98,24.0
1,1,0.02731,0.0,7.07,0,0.469,6.421,78.9,49671,2,242,17.8,396.9,9.14,21.6
2,2,0.02729,0.0,7.07,0,0.469,7.185,61.1,49671,2,242,17.8,392.83,4.03,34.7
3,3,0.03237,0.0,2.18,0,0.458,6.998,45.8,60622,3,222,18.7,394.63,2.94,33.4
4,4,0.06905,0.0,2.18,0,0.458,7.147,54.2,60622,3,222,18.7,396.9,5.33,36.2


In [None]:
###### One-off cleanup, kept for reference: the original file had an extra column, so it was dropped and the CSV was saved back.
#df.drop('Unnamed: 0',axis=1, inplace=True)
#df.head(2)
#df.to_csv(boston_file,index=False)

In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Look at the first DIS value to see its raw representation before the
# conversion performed in the next cell.
df.DIS[0]

In [None]:
# DIS is stored as text that uses a comma as the decimal separator; swap the
# comma for a dot and convert the column to float.
# Casting to str first keeps this cell safe to re-run: values that are already
# floats round-trip unchanged instead of failing because floats have no
# `.split` method. Malformed values still raise on the final float cast.
df['DIS'] = (
    df['DIS']
    .astype(str)
    .str.replace(',', '.', regex=False)  # literal replacement, not a regex
    .astype(float)
)
df.head()

In [None]:
# List the distinct RAD values to spot entries that are not plain numbers
# before the column is converted in the next cell.
df.RAD.unique()

In [None]:
def _rad_to_float(raw):
    """Return NaN for the '?' placeholder, otherwise the value as a float."""
    if raw == '?':
        return np.nan
    return float(raw)


# Convert RAD to floats, turning the '?' placeholder into a missing value.
df['RAD'] = df['RAD'].map(_rad_to_float)
df.head()

In [None]:
# Count missing values per column (any '?' placeholders in RAD were turned
# into NaN by the previous cell).
df.isnull().sum()

In [None]:
# Discard every row that still contains at least one missing value.
df = df.dropna()

In [None]:
# Replace the terse original column codes with descriptive snake_case labels.
readable_names = {
    'CRIM': 'rate_of_crime',
    'ZN': 'residential_zone_pct',
    'INDUS': 'business_zone_pct',
    'CHAS': 'borders_river',
    'NOX': 'oxide_concentration',
    'RM': 'average_rooms',
    'AGE': 'owner_occup_pct',
    'DIS': 'dist_to_work',
    'RAD': 'access_to_highway',
    'TAX': 'property_tax',
    'PTRATIO': 'student_teacher_ratio',
    'B': 'black_stat',
    'LSTAT': 'pct_underclass',
    'MEDV': 'home_median_value',
}
df = df.rename(columns=readable_names)

In [None]:
# Show the first two rows to confirm the new column names.
df.head(2)

In [None]:
# Summary statistics for the columns of the cleaned frame.
df.describe()

In [None]:
# rate of crime
# Notched, vertical seaborn boxplot of the crime-rate column.
fig, ax = plt.subplots(figsize=(6, 4))

box_style = dict(orient='v', fliersize=8, linewidth=1.5,
                 notch=True, saturation=0.5)
sns.boxplot(df.rate_of_crime, ax=ax, **box_style)

ax.set_title('Rate of crime boxplot\n', fontsize=20)
ax.set_ylabel('rate_of_crime', fontsize=16)

plt.show()

In [None]:
# Plain-matplotlib alternative: seaborn is not needed for a basic boxplot.
# `sym` controls the outlier markers:
#   sym='gD' -> outliers drawn in green
#   sym=''   -> outliers hidden
#   sym='rs' -> outliers drawn in red
# `vert` picks the orientation: False draws the box horizontally, True vertically.
data = df.rate_of_crime
plt.figure()
plt.boxplot(data, notch=False, sym='rs', vert=True)
plt.title(data.name)

In [None]:
# One box per column, all drawn on a shared axis in the columns' raw units.
# The frame is passed as `data=` so seaborn treats it as wide-form input;
# older seaborn releases bind the first positional argument to `x` instead.
fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(data=df, ax=ax)

In [None]:
###### Plot every variable in the dataset with a simple loop; outliers are shown in red ###
import pdb  # not used in this cell
numb = len(df.columns)

# Two rows of panels. Ceiling division guarantees enough slots when the number
# of columns is odd, and keeps the grid size an integer (add_subplot rejects
# the float that `numb/2` produces on Python 3).
n_rows = 2
n_cols = (numb + n_rows - 1) // n_rows

# Start from an empty figure so no extra full-size Axes sits behind the grid.
fig = plt.figure(figsize=(numb, numb))
for i, column in enumerate(df.columns):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    # sym='rs' draws outliers in red; vert=False lays the box out horizontally.
    ax.boxplot(df[column], notch=False, sym='rs', vert=False)
    ax.set_title(column)

# Keep titles and tick labels of neighbouring panels from overlapping.
fig.tight_layout()

In [None]:
##### Standardize all the variables in One step. But be careful !
####   All the variables must be numeric for this to work !!
# Z-score every column in one vectorized step: subtract the column mean, then
# divide by the column standard deviation.
# Every column must be numeric for this to work.
col_means = df.mean()
col_stds = df.std()
df_norm = df.sub(col_means).div(col_stds)

In [None]:
# Horizontal violins of the standardized columns, one per variable, each with
# an inner box summary.
# The figure size is derived from the plotted frame itself, so this cell does
# not rely on a counter defined in another cell.
n_vars = len(df_norm.columns)
fig, ax = plt.subplots(figsize=(n_vars * 2, n_vars * 2))

# `fliersize` and `notch` are boxplot options, not violinplot parameters, so
# they are not passed here.
sns.violinplot(data=df_norm, orient='h', scale='width',
               linewidth=3, saturation=0.5, inner='box', ax=ax)
plt.show()

In [None]:
# Pairwise correlation matrix of the columns (pandas' default method),
# plotted as a heatmap in the next cell.
corr = df.corr()

In [None]:
# Annotated heatmap of the correlation matrix on a large square canvas.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, ax=ax)