In [1]:
%matplotlib notebook

In [2]:
# Import dependencies
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from scipy.stats import sem

In [3]:
# Import the California housing data set and get description
california_dataset = fetch_california_housing()

print(california_dataset.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [4]:
# Read California housing data into a Pandas dataframe
housing_data = pd.DataFrame(data=california_dataset.data,columns=california_dataset.feature_names)
housing_data['MEDV'] = california_dataset.target
housing_data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MEDV
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [7]:
# Create a sample set of 10, each with 30 vehicles
samples = [housing_data.sample(20) for x in range(0,25)]
# Generate the plot data for each sample 
means = [sample['MEDV'].mean() for sample in samples]
standard_errors = [sem(sample['MEDV']) for sample in samples]
x_axis = np.arange(0, len(samples), 1) + 1
# Setting up the plot
fig, ax = plt.subplots()
ax.errorbar(x_axis, means, standard_errors, fmt="o")
ax.set_xlim(0, len(samples) + 1)
# ax.set_ylim(20,28)
ax.set_xlabel("Sample Number")
ax.set_ylabel("MEDV")
plt.show()

<IPython.core.display.Javascript object>

In [9]:
# Create a bunch of samples, each with sample size of 20
nsamples = 25
div = 20
samples = [housing_data.sample(div) for x in range(0,nsamples)]

In [10]:
# Calculate means
means = [s['MEDV'].mean() for s in samples]
# Calculate standard error on means
sems = [sem(s['MEDV']) for s in samples]

In [16]:
sems

[0.2729848453770156,
 0.22838438829260957,
 0.19478186990093627,
 0.1575593538956034,
 0.23390848525370816,
 0.20619849245199073,
 0.22283556452189013,
 0.24064318940136897,
 0.20617074681463726,
 0.30175041119361073,
 0.2740658224006502,
 0.2566024860254875,
 0.19362286015262817,
 0.24277663335978844,
 0.2331598897978087,
 0.2441269005100249,
 0.3179850057213886,
 0.22970610828667687,
 0.2560098275616479,
 0.2015583743967828,
 0.26217621509808603,
 0.27622399368940354,
 0.27643346781675576,
 0.3214228445856428,
 0.227053099226637]

In [11]:
# Plot sample means with error bars
fig, ax = plt.subplots()
ax.errorbar(np.arange(0, len(samples), 1)+1,means, yerr=sems, fmt="o", color="b",
            alpha=0.5, label="Mean of House Prices")
ax.set_xlim(0, len(means)+1)
ax.set_xlabel("Sample Number")
ax.set_ylabel("Mean of Median House Prices ($100,000)")
plt.legend(loc="best", fontsize="small", fancybox=True)
plt.show()

<IPython.core.display.Javascript object>

In [12]:
# Calculate the range of SEM values
print(f"The range of SEM values in the sample set is {max(sems)-min(sems)}")

The range of SEM values in the sample set is 0.1638634906900394


In [13]:
# Determine which sample's mean is closest to the population mean
print(f"The smallest SEM observed was {min(sems)}")
samp_index = sems.index(min(sems))
print(f"The sample with the smallest SEM is sample {samp_index+1}")

The smallest SEM observed was 0.1575593538956034
The sample with the smallest SEM is sample 4


In [10]:
# Compare to the population mean
print(f"The mean of the sample 5 is {samples[samp_index]['MEDV'].mean()}")
print(f"The mean of the population data set is {housing_data['MEDV'].mean()}")

The mean of the sample 5 is 1.5549000000000004
The mean of the population data set is 2.068558169089147
