In [1]:
boston_df = read.csv('/home/jovyan/Project3/data/bostonhousing.csv', header=FALSE, sep='')

colnames(boston_df) = c('crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black','lstat', 'medv')

In [4]:
library(dplyr, warn.conflicts = FALSE)
library(moments)

In [18]:
stats = data.frame(feature=colnames(boston_df))
stats['mean_'] = sapply(boston_df, mean)
stats['sd_'] = sapply(boston_df, sd)
stats['skewness_'] = sapply(boston_df, skewness)
stats['kurtosis_'] = sapply(boston_df, kurtosis)
stats

feature,mean_,sd_,skewness_,kurtosis_
crim,3.61352356,8.6015451,5.2076524,39.752786
zn,11.36363636,23.322453,2.2190631,6.979949
indus,11.13677866,6.8603529,0.2941463,1.766782
chas,0.06916996,0.253994,3.3957993,12.531453
nox,0.55469506,0.1158777,0.7271442,2.924136
rm,6.28463439,0.7026171,0.4024147,4.861027
age,68.57490119,28.1488614,-0.5971856,2.029986
dis,3.79504269,2.1057101,1.0087788,3.471299
rad,9.54940711,8.7072594,1.0018335,2.129479
tax,408.23715415,168.5371161,0.6679683,1.857015


# MANY OF THE TOOLS WE WILL USE WILL ASSUME NORMAL DATA

In [19]:
library(ggplot2)

In [20]:
library(repr)
options(repr.plot.width=10, repr.plot.height=2) 

In [21]:
#source('multiplot.r')

In [25]:
original_v_scaled <- function (feature) {
    original_feature <- as.vector(boston_df[[feature]])
    scaled_feature <- as.vector(boston_sc_df[[feature]])
    p1 <- qplot(original_feature, geom="histogram", bins=200, alpha=I(.4),)+
        geom_vline(aes(xintercept=mean(original_feature)), color="red", linetype="dashed", size=1)+
        geom_vline(aes(xintercept=median(original_feature)), color="blue", linetype="dashed", size=1)+
        ggtitle(paste("Original Distribution of", feature))
    p2 <- qplot(scaled_feature, geom="histogram", bins=200, alpha=I(.4),)+
        geom_vline(aes(xintercept=mean(scaled_feature)), color="red", linetype="dashed", size=1)+
        geom_vline(aes(xintercept=median(scaled_feature)), color="blue", linetype="dashed", size=1)  +
        ggtitle(paste("Scaled Distribution of", feature))  
    multiplot(p1, p2, cols=2)
}


In [26]:
original_v_scaled('crim')
original_v_scaled('zn')
original_v_scaled('indus')
original_v_scaled('chas')
original_v_scaled('nox')
original_v_scaled('rm')
original_v_scaled('age')
original_v_scaled('dis')
original_v_scaled('rad')
original_v_scaled('tax')
original_v_scaled('ptratio')
original_v_scaled('black')
original_v_scaled('lstat')
original_v_scaled('medv')

ERROR: Error in original_v_scaled("crim"): could not find function "multiplot"


Another way we can verify this is via a test of skewness.

To perform this test we can use the scipy.stats.skewtest.

This function tests the null hypothesis that the skewness of the population that the sample was drawn from is the same as that of a corresponding normal distribution. Remember, a low p-value means reject the null hypothesis i.e the data is skewed!

In [24]:
skewness_ = data.frame(feature=colnames(boston_sc_df))
skewness_['skew_'] = sapply(boston_df, skewness)
skewness_['skew_sc'] = sapply(boston_sc_df, skewness)
skewness_

ERROR: Error in is.data.frame(x): object 'boston_sc_df' not found


## Deskew by taking the log of the data

In [None]:
boston_log_df = log(boston_df)
boston_log_sc_df = data.frame(scale(boston_log_df))

In [None]:
skewness_['skew_log_'] = sapply(boston_log_df, skewness)
skewness_['skew_log_sc'] = sapply(boston_log_sc_df, skewness)
skewness_

In [None]:
original_v_scaled_v_log <- function (feature) {
    original_feature <- as.vector(boston_df[[feature]])
    scaled_feature <- as.vector(boston_sc_df[[feature]])
    log_feature <- as.vector(boston_log_df[[feature]])
    log_scaled_feature <- as.vector(boston_log_sc_df[[feature]])
    p1 <- qplot(original_feature, geom="histogram", bins=200, alpha=I(.4),)+
        geom_vline(aes(xintercept=mean(original_feature)), color="red", linetype="dashed", size=1)+
        geom_vline(aes(xintercept=median(original_feature)), color="blue", linetype="dashed", size=1)+
        ggtitle(paste("Original Distribution of", feature))
    p2 <- qplot(scaled_feature, geom="histogram", bins=200, alpha=I(.4),)+
        geom_vline(aes(xintercept=mean(scaled_feature)), color="red", linetype="dashed", size=1)+
        geom_vline(aes(xintercept=median(scaled_feature)), color="blue", linetype="dashed", size=1)  +
        ggtitle(paste("Scaled Distribution of", feature))  
    p3 <- qplot(log_feature, geom="histogram", bins=200, alpha=I(.4),)+
        geom_vline(aes(xintercept=mean(log_feature)), color="red", linetype="dashed", size=1)+
        geom_vline(aes(xintercept=median(log_feature)), color="blue", linetype="dashed", size=1)+
        ggtitle(paste("Log Distribution of", feature))
    p4 <- qplot(log_scaled_feature, geom="histogram", bins=200, alpha=I(.4),)+
        geom_vline(aes(xintercept=mean(log_scaled_feature)), color="red", linetype="dashed", size=1)+
        geom_vline(aes(xintercept=median(log_scaled_feature)), color="blue", linetype="dashed", size=1)  +
        ggtitle(paste("Log Scaled Distribution of", feature))  
    multiplot(p1, p2, p3, p4, cols=4)
}


In [3]:
original_v_scaled_v_log('crim')
original_v_scaled_v_log('zn')
original_v_scaled_v_log('indus')
original_v_scaled_v_log('chas')
original_v_scaled_v_log('nox')
original_v_scaled_v_log('rm')
original_v_scaled_v_log('age')
original_v_scaled_v_log('dis')
original_v_scaled_v_log('rad')
original_v_scaled_v_log('tax')
original_v_scaled_v_log('ptratio')
original_v_scaled_v_log('black')
original_v_scaled_v_log('lstat')
original_v_scaled_v_log('medv')

ERROR: Error in eval(expr, envir, enclos): could not find function "original_v_scaled_v_log"
