# 2. Statistical Learning

In [1]:
import janitor as jn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import interpolate

In [2]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

## 2.1 What Is Statistical Learning?

### FIGURE 2.1.

In [3]:
# advertising = pd.read_csv('http://faculty.marshall.usc.edu/gareth-james/ISL/Advertising.csv')
advertising = pd.read_csv('../data-islr/Advertising.csv', index_col=0)
advertising.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data-islr/Advertising.csv'

In [None]:
_, axes = plt.subplots(1, 3, figsize=(12, 5), constrained_layout=True)

# One panel per advertising medium: sales against the budget for that medium,
# with the fitted simple linear regression line drawn by regplot.
for ax, medium in zip(axes, ['TV', 'radio', 'newspaper']):
    sns.regplot(data=advertising,
                x=medium,
                y='sales',
                scatter_kws={'color': 'red'},
                ax=ax)

### FIGURE 2.2.

In [None]:
# income = pd.read_csv('http://faculty.marshall.usc.edu/gareth-james/ISL/Income1.csv')
income = pd.read_csv('../data-islr//Income1.csv', index_col=0)
income.head()

In [None]:
_, axes = plt.subplots(1, 2, figsize=(10, 5), constrained_layout=True)

# Both panels show the raw observations; only the right one gets the fit.
for ax in axes:
    g = sns.scatterplot(data=income,
                        x='Education',
                        y='Income',
                        color='red',
                        ax=ax)
    g.set(xlabel='Years of Education')

education = income.Education.to_numpy()

# return_sorted=False keeps the fitted values in the same row order as
# `income`. The default returns rows sorted by Education, which would pair
# each observation with the wrong fitted value in the residual segments below
# whenever the data are not already sorted.
fitted = sm.nonparametric.lowess(endog=income.Income,
                                 exog=income.Education,
                                 frac=0.5,
                                 return_sorted=False)

# Two-column (Education, fitted Income) array sorted by Education, so the
# smooth curve is drawn left to right.
order = np.argsort(education, kind='stable')
lowess = np.column_stack([education[order], fitted[order]])

axes[1].plot(lowess[:, 0], lowess[:, 1])

# Vertical segments from each observation to its own fitted value.
axes[1].vlines(x=education,
               ymin=np.fmin(income.Income, fitted),
               ymax=np.fmax(income.Income, fitted))

### FIGURE 2.3.

3D plotting is never an easy job

In [None]:
# income = pd.read_csv('http://faculty.marshall.usc.edu/gareth-james/ISL/Income2.csv')
income2 = pd.read_csv('../data-islr//Income2.csv', index_col=0)
income2.head()

In [None]:
# Ordinary least squares fit of Income on Education, Seniority and their
# interaction, using the Income2 data loaded above.
formula = "Income ~ Education + Seniority + Education*Seniority"
result = smf.ols(formula=formula, data=income2).fit()
# Show the estimated coefficients.
print(result.params)

In [None]:
result.summary()

### FIGURE 2.4.

In [None]:
# Square figure holding a single 3D axes.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')

# The two predictors (x, y) and the response (z) from the Income2 data.
xx = income2['Education']
yy = income2['Seniority']
zz = income2['Income']

def generate_mesh2d(array):
    """Build one grid axis per input sequence, for use with ``np.meshgrid``.

    Each axis starts at the minimum of its input and ends at its maximum.
    The target step is ``(max - min) / 30``; every gap between two consecutive
    sorted input values is split into ``floor(gap / step)`` equal
    sub-intervals, but always at least one. Every distinct input value is
    therefore itself a grid node, so data points lie exactly on the mesh.

    Parameters
    ----------
    array : sequence of array-like
        One 1-D collection of numbers per axis (e.g. ``[xx, yy]``).

    Returns
    -------
    list of numpy.ndarray
        One strictly increasing 1-D grid axis per input. An empty input
        yields an empty axis; a constant input yields a single node.
    """
    new_array = []

    for values in array:
        values_sorted = np.sort(np.asarray(values))
        if values_sorted.size == 0:
            new_array.append(np.array([]))
            continue

        # Target spacing: roughly 30 steps across the whole data range.
        x_space = (values_sorted[-1] - values_sorted[0]) / 30

        grid = [values_sorted[0]]
        for low, high in zip(values_sorted[:-1], values_sorted[1:]):
            space = high - low
            if space == 0:
                # Duplicate value: it is already on the grid. Skipping it
                # also avoids dividing by a zero step for constant input.
                continue
            # At least one interval, so that `high` is always added even when
            # the gap is narrower than the target spacing.
            n_interval = max(int(space // x_space), 1)
            grid.extend(np.linspace(low, high, n_interval + 1)[1:])

        new_array.append(np.array(grid))

    return new_array

# Grid axes for Education and Seniority, expanded to a 2-D mesh.
XX, YY = generate_mesh2d([xx, yy])
X, Y = np.meshgrid(XX, YY)

# Evaluate the fitted OLS model (`result`) at every mesh node and reshape the
# predictions back to the mesh layout.
exog = pd.DataFrame({'Education': X.ravel(), 'Seniority': Y.ravel()})
Z = np.array(result.predict(exog=exog)).reshape(X.shape)

# Fitted surface: a translucent yellow sheet with a thin black wireframe on top.
ax.plot_surface(X=X, Y=Y, Z=Z, rcount=1, ccount=1, color='#F2D711', alpha=0.8)
ax.plot_wireframe(X=X,
                  Y=Y,
                  Z=Z,
                  rstride=1,
                  cstride=1,
                  lw=0.2,
                  color='k',
                  alpha=1)

# Observed data points in red.
ax.scatter3D(xs=xx, ys=yy, zs=zz, color='r', alpha=1)

# Draw the error lines: join each observation to the surface value at the
# nearest mesh node (nearest XX entry and nearest YY entry).
for value1, value2, value3 in zip(xx, yy, zz):
    idxx = np.argmin(np.abs(XX - value1))
    idxy = np.argmin(np.abs(YY - value2))
    X0 = XX[idxx]
    Y0 = YY[idxy]
    # Z is indexed [row, column] = [YY index, XX index], matching np.meshgrid.
    Z0 = Z[idxy, idxx]
    ax.plot3D((value1, X0), (value2, Y0), (value3, Z0), 'black')

# NOTE(review): `_axinfo` is a private matplotlib attribute; 'juggled'
# presumably changes where the z axis is drawn — confirm against the
# matplotlib version in use.
ax.zaxis._axinfo['juggled'] = (1, 2, 0)
ax.set_xlabel('Years of Education')
ax.set_ylabel('Seniority')
ax.set_zlabel('Income')
# Opaque white panes with black edges, and no grid lines.
ax.xaxis.pane.set_alpha(1)
ax.yaxis.pane.set_alpha(1)
ax.zaxis.pane.set_alpha(1)
ax.xaxis.pane.set_edgecolor('black')
ax.yaxis.pane.set_edgecolor('black')
ax.zaxis.pane.set_edgecolor('black')
ax.xaxis.pane.set_fc('white')
ax.yaxis.pane.set_fc('white')
ax.zaxis.pane.set_fc('white')
ax.grid(0)
plt.show()

### FIGURE 2.5.

### FIGURE 2.6.

### FIGURE 2.7.

### FIGURE 2.8.

## 2.2 Assessing Model Accuracy

## 2.3 Lab: Introduction to Python

Python uses functions to perform operations. To run a function called `funcname`, we type `funcname(input1, input2)`, where the inputs (or _arguments_) `input1` and `input2` tell Python how to run the function. A function can have any number of inputs. For example, to create a vector of numbers, we use the function `np.array()` from the module `NumPy`. To do this, we pass a `list` as an argument to the function; lists are a built-in data type in Python. Here we assume you are already familiar with basic Python objects.

In [None]:
# Define a new vector x
x = np.array([1, 6, 2])
x

Typing `help(funcname)` will cause Jupyter to pop up a help panel with additional information about the function `funcname`.

In [None]:
# help(np.array)

We now make a second vector `y`.

In [None]:
y = np.array([1, 4, 3])

We can tell Python to add two sets of numbers together. It will then add the first number from `x` to the first number from `y`, and so on. However, `x` and `y` should be the same length. We can check their length using the `len()` function.

In [None]:
len(x)

In [None]:
len(y)

In [None]:
x + y

The "magic" `%whos` command allows us to look at a list of all the objects, such as data and functions, that we have saved so far.

In [None]:
%whos

The command `del` can be used to delete any objects that we don't want.

In [None]:
# del x

In [None]:
%whos

It's also possible to remove all objects at once.

In [None]:
# %reset

In [None]:
# %whos

The `np.matrix()` function can be used to create a matrix of numbers. Before we use the `np.matrix()` function we can learn more about it.

In [None]:
# help(np.matrix)

The help reveals that the `matrix()` function can take a number of inputs, but for now we focus on how to build a simple matrix. To build a matrix, we can input a list of lists as a parameter. Each list is a row.

In [None]:
x = np.matrix([[1, 3], [2, 4]])
x

If `data` is a string, it is interpreted as a matrix with commas or spaces separating columns, and semicolons separating rows.

In [None]:
x = np.matrix('1, 3; 2, 4')
x

The `np.sqrt()` function returns the square root of each element of a vector or a matrix. The function `np.power(x, 2)` raises each element of `x` to the power of 2; any powers are possible, including fractional or negative powers.

In [None]:
np.sqrt(x)

In [None]:
np.power(x, 2)

In [None]:
from numpy import random

The `random.normal()` function generates a vector of random normal variables, with third argument `size` the sample size. Each time we call this function, we will get a different answer. Here we create two correlated sets of numbers, `x` and `y`, and use the `np.corrcoef()` function to compute the correlation matrix between them.

In [None]:
x = random.normal(size=50)
y = x + random.normal(size=50, loc=50, scale=.1)
np.corrcoef(x, y)

By default, `random.normal` creates random variables from standard normal distribution with mean of 0 and standard deviation of 1. However, the mean and standard deviation can be altered using the `loc` and `scale` arguments, as illustrated above. Sometimes we want our code to reproduce the exact same set of random numbers; we can use the `random.seed()` function to do this. The `random.seed()` function takes an (arbitrary) integer argument.

In [None]:
random.seed(1303)
random.normal(size=50)

We use `random.seed()` throughout the labs whenever we perform calculations involving random quantities in order to obtain reproducible results.

The `np.mean()` and `np.var()` functions can be used to compute the mean and variance of a vector of numbers. Applying `np.sqrt()` to the output of `np.var()` will give the standard deviation (or we can use `np.std()`).

In [None]:
random.seed(3)
y = random.normal(size=100)
np.mean(y)

In [None]:
np.var(y)

In [None]:
np.std(y)

### Graphics

The `plt.plot` function is the primary way to plot data in Python. For instance, `plt.plot(x, y, 'o')` produces a scatterplot of the numbers in `x` versus the numbers in `y`; `plt.scatter(x, y)`, used below, does the same. There are many additional options that can be passed in to the `plt.plot()` function, and many other functions that alter the appearance of the plot. For example, the `plt.xlabel()` function puts a label on the x-axis. The third argument of `plt.plot()` is a format string made up of a marker, a line style and a color: `fmt = '[marker][line][color]'`.

In [None]:
import matplotlib.pyplot as plt

In [None]:
x = random.normal(size=100)
y = random.normal(size=100)
plt.scatter(x, y)
plt.xlabel('This is the x-axis')
plt.ylabel('This is the y-axis')
plt.title('Plot of X vs Y')

We will often want to save the output of a Python plot. We do this with the `plt.savefig()` function. We can choose the type of format to output by changing the extension of the file name. For instance, to create a pdf, we use `plt.savefig('output.pdf')`, and to create a jpeg, we use `plt.savefig('output.jpeg')`.

In [None]:
plt.scatter(x, y, c='g')
# plt.savefig('Figure.pdf')

The function `np.arange` can be used to create a sequence of numbers. For instance, `np.arange(a, b)` makes a vector of integers between `a` and `b`, excluding `b`. Another function, `np.linspace(a, b, n)`, makes a sequence of `n` numbers that are equally spaced between `a` and `b`.

In [None]:
np.arange(1, 11)

In [None]:
np.arange(1, 11, 2)

In [None]:
x = np.linspace(-np.pi, np.pi, 50)
x

We will now create some more sophisticated plots. For instance, the `plt.contour()` function produces a contour plot in order to represent three-dimensional data; it is like a topographical map. It takes three arguments:
1. A vector of the `x` values (the first dimension),
2. A vector of the `y` values (the second dimension), and
3. A matrix of the `z` values (the third dimension) for each pair of (`x`, `y`) coordinates, with one row per `y` value and one column per `x` value.

As with the `plt.plot()` function, there are many other inputs that can be used to fine-tune the output of the `plt.contour()` function. To learn more about these, take a look at the help file by typing `?plt.contour`.

In [None]:
y = x
# plt.contour(x, y, Z) expects Z with shape (len(y), len(x)): rows follow y
# and columns follow x. Broadcasting a column of cos(y) against a row of
# 1 + x**2 gives f[j, i] = cos(y[j]) / (1 + x[i]**2) in exactly that layout,
# as a plain ndarray rather than an np.matrix.
f = np.cos(y)[:, np.newaxis] / (1 + x**2)
_ = plt.contour(x, y, f)

In [None]:
fa = (f - f.T) / 2
_ = plt.contour(x, y, fa, 15)

In [None]:
np.shape(fa)

The `plt.contourf()` function works the same way as `plt.contour()`, except that it produces a color-coded plot whose colors depend on the `z` value. This is known as a heatmap, and is sometimes used to plot temperature in weather forecasts. Alternatively, a 3D axes can be used to produce a three-dimensional plot.

In [None]:
_ = plt.contourf(x, y, fa)

Passing `projection='3d'` to `fig.add_subplot()` creates a 3D axes; the `elev` and `azim` arguments of its `view_init()` method control the angle from which the plot is viewed.

In [None]:
X, Y = np.meshgrid(x, y)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.view_init(elev=30)
ax.plot_wireframe(X, Y, fa)

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.view_init(elev=30, azim=20)
ax.plot_wireframe(X, Y, fa)

### Indexing Data

We often wish to examine part of a set of data. Suppose that our data is stored in the matrix `A`.

In [None]:
A = np.arange(1, 17).reshape(4, 4)
A = A.T
A

Then, type

In [None]:
A[1, 2]

will select the element corresponding to the second row and the third column. The first number after the open-bracket symbol [ always refers to the row, and the second number always refers to the column.

In [None]:
A[[0, 2]][:, [1, 3]]

In [None]:
A[0:3, 1:4]

In [None]:
A[0:2, :]

In [None]:
A[:, 0:2]

The last two examples include either `:` for the columns or `:` for the rows. These indicate that Python should include all columns or all rows, respectively. Python treats a single row or column of a matrix as a vector.

In [None]:
A[0,]

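Unlike R, NumPy has no negative-index syntax for excluding rows or columns. The tilde `~` is the bitwise NOT operator, so `~k` equals `-k - 1` and counts from the end (`~0` is the last row, `~2` the third from last); it selects those rows rather than excluding them. To exclude the rows or columns that you specify, use `np.delete()` or a boolean mask.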

In [None]:
# NumPy has no R-style exclusion index: `~k` is bitwise NOT (index -k - 1) and
# selects a row counted from the end. np.delete drops rows 0 and 2, keeping
# the remaining rows.
np.delete(A, [0, 2], axis=0)

In [None]:
# Drop rows 0 and 2, then keep only column 1 (as a 2-D column).
np.delete(A, [0, 2], axis=0)[:, [1]]

In [None]:
A

The `np.shape()` function outputs the number of rows followed by the number of columns of a given matrix

In [None]:
np.shape(A)

### Loading Data

For most analyses, the first step involves importing a data set into Python. The `pd.read_csv()` function (a comma-separated counterpart of the more general `pd.read_table()`) is one of the primary ways to do this. The help file contains details about how to use this function. We can export data using the `.to_{format}` methods.

Before attempting to load a data set, we must make sure that Python knows to search for the data in the proper directory. We can do this by specifying the path in the argument of the `pd.read_csv()` function. We begin by loading in the Auto data set. The following command will load the `Auto.csv` file into Python and store it as an object called `auto`, in a format referred to as a `data frame`.

In [None]:
import pandas as pd

In [None]:
# auto = pd.read_csv('http://faculty.marshall.usc.edu/gareth-james/ISL/Auto.data')
# Read from the same data directory as every other data set in this notebook.
auto = pd.read_csv('../data-islr/Auto.csv')
auto.head()

In [None]:
auto.shape

Note that `Auto.csv` is simply a text file, which you could alternatively open on your computer using a standard text editor. It is often a good idea to view a data set using a text editor or other software such as Excel before loading it into Python.

The data set also includes a number of missing observations, indicated by a question mark '?'. Missing values are a common occurrence in real data sets.

In [None]:
auto.values

In [None]:
auto.query("horsepower=='?'")

One option is to replace these values with `NaN`.

In [None]:
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
auto = auto.replace('?', np.nan)

The `shape` attribute tells us that the data has 397 observations, or rows, and nine variables, or columns. There are various ways to deal with the missing data. In this case, only five of the rows contain missing observations, and so we choose to use the `dropna` method to simply remove these rows.

In [None]:
# Drop the rows with missing values. horsepower was read as text because of
# the '?' placeholders, so convert it to numbers for the plots and summaries
# that follow.
auto = auto.dropna().astype({'horsepower': float})

In [None]:
auto.shape

Once the data are loaded correctly, we can use the `columns` attribute to check the variable names.

In [None]:
auto.columns

### Additional Graphical and Numerical Summaries

We can use the plot() function to produce scatterplots of the quantitative variables. However, simply typing the variable names will produce an error message, because Python does not know to look in the Auto data set for those variables.

To refer to a variable, we must type the data set and the variable name joined with a `.` symbol.

In [None]:
import seaborn as sns

In [None]:
sns.scatterplot(data=auto, x='cylinders', y='mpg')

In [None]:
auto.cylinders = auto.cylinders.astype('category')

In [None]:
sns.boxplot(data=auto, x='cylinders', y='mpg')

In [None]:
sns.histplot(data=auto, x='mpg', bins=15, color='red')

In [None]:
sns.pairplot(auto)

In [None]:
var_list = ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration']

sns.pairplot(data=auto, vars=var_list)

In [None]:
g = sns.scatterplot(data=auto, x='horsepower', y='mpg')

# Label every point with its mpg value, placed at the point's own coordinates.
for hp_value, mpg_value in zip(auto.horsepower, auto.mpg):
    plt.annotate(mpg_value, (hp_value, mpg_value))

The `describe()` method produces a numerical summary of each variable in a particular data set.

In [None]:
auto.describe(include='all')

In [None]:
auto.name.describe()

In [None]:
auto.mpg.describe()

## 2.4 Exercises

### Conceptual

#### 1. 

For each of parts (a) through (d), indicate whether we would generally expect the performance of a flexible statistical learning method to be better or worse than an inflexible method. Justify your answer.

(a) 
Better. As the level of flexibility increases, the curve fit the observed data more closely and a better fit would be obtained with the large sample size.

(b)
Worse. The flexible statistical learning method would overfit the data with extremely large number of predictors p and small number of observations n.

(c)
Better. If the relationship between the predictors and response is highly non-linear, an inflexible statistical learning method would lead to high training MSE, while a flexible method may fit the data better.

(d)
Worse. A flexible statistical learning method would fit the noise in the error terms and increase variance.

#### 2. 

Explain whether each scenario is a classification or regression problem, and indicate whether we are most interested in inference or prediction. Finally, provide n and p.

(a)
Regression problem. We are interested in inference, say, how the CEO salary is affected as profit, number of employees and industry change.
n - 500 firms in the US
p - profit, number of employees, industry

(b)
Classification problem. We are interested in prediction: we want to predict whether a new product will be a success or a failure.
n - 20 similar products previously launched
p - price charged, marketing budget, comp. price, ten other variables

(c)
Regression problem. We are interested in inference; that is to say, we want to know how the % change in the US dollar relates to the weekly changes in the world stock markets.
n - 52 weeks of 2012 weekly data
p - % change in US market, % change in British market, % change in German market

#### 3. 

We now revisit the bias-variance decomposition.

(a)

(b)
all 5 lines >= 0

i. (squared) bias - decreases monotonically because increases in flexibility
yield a closer fit

ii. variance - increases monotonically because increases in flexibility yield
overfit

iii. training error - decreases monotonically because increases in flexibility
yield a closer fit

iv. test error - concave up curve because increase in flexibility yields a closer
fit before it overfits

v. Bayes (irreducible) error - defines the lower limit, the test error is bounded 
below by the irreducible error due to variance in the error (epsilon) in the output 
values (0 <= value). When the training error is lower than the irreducible error,
overfitting has taken place.
The Bayes error rate is defined for classification problems and is determined by 
the ratio of data points which lie at the 'wrong' side of the decision boundary, 
(0 <= value < 1).

#### 4. 

You will now think of some real-life applications for statistical learning.

#### 5. 

What are the advantages and disadvantages of a very flexible (versus a less flexible) approach for regression or classification? Under what circumstances might a more flexible approach be preferred to a less flexible approach? When might a less flexible approach be preferred?

The advantages for a very flexible approach for regression or classification are obtaining a better fit for non-linear models, decreasing bias.

The disadvantages of a very flexible approach for regression or classification are that it requires estimating a greater number of parameters and may follow the noise too closely (overfit), increasing variance.

A more flexible approach would be preferred to a less flexible approach when we are interested in prediction and not the interpretability of the results.

A less flexible approach would be preferred to a more flexible approach when we are interested in inference and the interpretability of the results.

#### 6. 

Describe the differences between a parametric and a non-parametric statistical learning approach. What are the advantages of a parametric approach to regression or classification (as opposed to a non-parametric approach)? What are its disadvantages?

A parametric approach reduces the problem of estimating f down to one of estimating a set of parameters because it assumes a form for f.

A non-parametric approach does not assume a functional form for f and so requires a very large number of observations to accurately estimate f.

The advantages of a parametric approach to regression or classification are the simplifying of modeling f to a few parameters and not as many observations are
required compared to a non-parametric approach.

The disadvantages of a parametric approach to regression or classification are a potential to inaccurately estimate f if the form of f assumed is wrong or
to overfit the observations if more flexible models are used.

#### 7. 

The table below provides a training data set containing six observations, three predictors, and one qualitative response variable.

(a)

In [4]:
# Test point (X1 = X2 = X3 = 0) followed by the six training observations.
obs0 = np.array([0, 0, 0])
obs1 = np.array([0, 3, 0])
obs2 = np.array([2, 0, 0])
obs3 = np.array([0, 1, 3])
obs4 = np.array([0, 1, 2])
obs5 = np.array([-1, 0, 1])
obs6 = np.array([1, 1, 1])

obs = [obs1, obs2, obs3, obs4, obs5, obs6]

# Euclidean distance from each training observation to the test point,
# keyed by the observation's label.
dist = {}
for number, point in enumerate(obs, start=1):
    label = 'obs' + str(number)
    offset = point - obs0
    dist[label] = np.sqrt(np.sum(offset**2))
    print(label + ':', dist[label])

obs1: 3.0
obs2: 2.0
obs3: 3.1622776601683795
obs4: 2.23606797749979
obs5: 1.4142135623730951
obs6: 1.7320508075688772


(b), (c)

In [5]:
sorted(dist.items(), key=lambda item: item[1])

[('obs5', 1.4142135623730951),
 ('obs6', 1.7320508075688772),
 ('obs2', 2.0),
 ('obs4', 2.23606797749979),
 ('obs1', 3.0),
 ('obs3', 3.1622776601683795)]

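(b) K = 1: prediction = Green (the single nearest neighbour is obs5).

(c) K = 3: prediction = Red (majority vote among the three nearest neighbours obs5, obs6 and obs2).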

(d)

Small. A small K would be flexible for a non-linear decision boundary,
whereas a large K would try to fit a more linear boundary because it takes more
points into consideration.

### Applied

#### 8. College dataset

(a)

In [6]:
import janitor as jn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [7]:
# college = pd.read_csv('http://faculty.marshall.usc.edu/gareth-james/ISL/College.csv')
college = pd.read_csv('../data-islr//college.csv')
college.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data-islr//college.csv'

(b)

In [8]:
college = college.rename_column('Unnamed: 0', 'College')
college.head()

NameError: name 'college' is not defined

(c) i

In [9]:
college.describe()

NameError: name 'college' is not defined

(c) ii

In [10]:
sns.pairplot(college.iloc[:, 1:11])

NameError: name 'college' is not defined

(c) iii

In [11]:
sns.boxplot(data=college, x='Private', y='Outstate')

NameError: name 'college' is not defined

(c) iv

In [12]:
# Elite flags colleges whose Top10perc exceeds 50. assign() returns a new
# frame, so rebind `college`; the later cells read `college.Elite` and must
# not depend on the frame being mutated in place.
college = college.assign(Elite=college.Top10perc > 50)

NameError: name 'college' is not defined

In [None]:
college.Elite.astype('bool')

In [None]:
# Pass the frame and column names by keyword, as in the Private/Outstate
# boxplot above; current seaborn releases do not accept x and y positionally.
sns.boxplot(data=college, x='Elite', y='Outstate');

(c) v

In [13]:
# Normalization
def norm(df):
    """Standardize ``df`` column-wise: subtract the mean, divide by the std."""
    centered = df - df.mean()
    return centered / df.std()


features = 13
college_norm = norm(college.iloc[:, 2:features + 1])
college_norm.head()

NameError: name 'college' is not defined

`pivot_longer()` (from pyjanitor, the method-chaining counterpart of `pandas.melt()`) unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set.

In [14]:
college_melt = college_norm.pivot_longer(values_to='vals')
college_melt.head()

NameError: name 'college_norm' is not defined

In [None]:
sns.displot(data=college_melt,
            x='vals',
            col='variable',
            col_wrap=4,
            kde=True,
            stat="density",
            common_norm=False)

In [None]:
# One panel per variable. histplot with kde=True and stat='density' draws the
# density-scaled histogram plus KDE curve that the deprecated sns.distplot
# used to produce.
g = sns.FacetGrid(college_melt, col='variable', col_wrap=4)
g.map(sns.histplot, 'vals', kde=True, stat='density')
g.set(xlim=(-4, 4))

#### 9. Auto dataset

In [15]:
auto = pd.read_csv('../data-islr/Auto.csv')
auto.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data-islr/Auto.csv'

In [None]:
# Treat the '?' placeholders as missing values and drop the affected rows.
# (np.nan replaces the np.NaN alias, which was removed in NumPy 2.0.)
auto = auto.replace('?', np.nan).dropna()

In [None]:
# (a) Which of the predictors are quantitative, and which are qualitative?
# Every column of the data set is assigned to one of the two groups; `year`
# is numeric and therefore listed with the quantitative variables.
datatypes = {
    'quant': [
        'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
        'acceleration', 'year'
    ],
    'qual': ['origin', 'name']
}

# horsepower was read as text because of the '?' placeholders, so cast every
# quantitative column to float. (np.float_ was removed in NumPy 2.0.)
auto_quant = auto[datatypes['quant']].astype(np.float64)

In [None]:
# (b) What is the range of each quantitative predictor?
# (c) What is the mean and standard deviation of each quantitative predictor?

pd.DataFrame({
    'range': auto_quant.max() - auto_quant.min(),
    'mean': auto_quant.mean(),
    'std': auto_quant.std()
})

In [None]:
# (d) Now remove the 10th through 85th observations.
# What is the range, mean, and standard deviation of each predictor
# in the subset of the data that remains?

# The 10th-85th observations (1-based, inclusive) sit at 0-based positions
# 9..84, i.e. the slice 9:85, which covers 76 rows.
auto_quant_d = auto_quant.drop(auto_quant.index[9:85])
# results as dataframe
pd.DataFrame({
    'range': auto_quant_d.max() - auto_quant_d.min(),
    'mean': auto_quant_d.mean(),
    'std': auto_quant_d.std()
})

In [None]:
# (e) Using the full data set, investigate the predictors graphically,
# using scatterplots or other tools of your choice. Create some plots
# highlighting the relationships among the predictors. Comment on your findings.

# combine numeric quantitive data with original qualitive data
auto_n = pd.concat([auto_quant, auto[datatypes['qual']]], axis=1)
# pairplot grid
sns.pairplot(auto_n)

(f)

Looking at the plots above, displacement, horsepower and weight seem most strongly correlated with mpg. There also seems to be a discernible relationship with cylinders and origin, which we might try including in our analysis.

Acceleration is not strongly correlated with mpg, so we might consider dropping this feature.

#### 10. Boston dataset

(a)

In [16]:
from sklearn.datasets import load_boston

In [17]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on an older scikit-learn — confirm the pinned
# version or switch to another source for the data.
boston_raw = load_boston()
# List the fields available on the returned dataset object.
print(boston_raw.keys())

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])


In [18]:
print(boston_raw.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [19]:
boston_data = np.column_stack([boston_raw.data, boston_raw.target])
col_names = np.append(boston_raw.feature_names, 'MEDV')

# create the data frame
boston = pd.DataFrame(boston_data, columns=col_names)
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [20]:
boston.shape

(506, 14)

(b)

In [None]:
sns.pairplot(boston)

(c)

In [None]:
corr_matrix = boston.corr()
corr_matrix.CRIM.sort_values(ascending=False)

(d)

In [None]:
sns.histplot(boston.CRIM)

In [None]:
sns.histplot(boston.TAX)

In [None]:
sns.histplot(boston.PTRATIO)

In [None]:
# (e) Number of tracts that bound the Charles River (CHAS == 1).
(boston.CHAS == 1).sum()

(f)

In [None]:
boston.PTRATIO.median()

(g)

In [None]:
boston.MEDV.min(), boston.MEDV.idxmin()

In [None]:
# Look the row up from the data instead of hard-coding its position, so the
# cell stays correct if the frame is filtered or reordered.
print(boston.loc[boston.MEDV.idxmin()])

(h)

In [None]:
# Tracts averaging more than seven rooms per dwelling.
(boston.RM > 7).sum()

In [None]:
# Tracts averaging more than eight rooms per dwelling.
(boston.RM > 8).sum()

In [None]:
UpperClass = boston.query('RM>8')
UpperClass.describe()