In [None]:
# %load standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns

from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed the
# old names, so prefer the new name and fall back to the legacy one on older installs.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')

# Chapter 1 - Introduction

- [General](#General-Concepts)
- [Notation](#Notation)
- [Some Data](#Some-Data)

### General Concepts

Since we'll be using Python, as we go through the ideas and concepts I will sprinkle in uses of `numpy`, `pandas`, and other core libraries and graphics functions. Remember, any time you have a question about a function, just type `?<function_name>` and you'll get all the detail you'll ever want in most cases.

The ISLR text starts with the following high-level concepts in the intro, so let's discuss each one briefly before moving on:
- supervised vs. unsupervised
- continuous/quantitative vs. categorical/qualitative
- regression vs. classification

### Notation

#### Dimensions
- $n$ is number of samples
- $p$ is number of features

In [None]:
n = 100  # number of samples
p = 3    # number of features

#### Vectors (notation in text, not in notebook...)
- general: $x_i$
- when dimension is $n$, $\mathbf{x}$

In [None]:
# np.random.randn takes the output dimensions as arguments; a single argument
# gives a 1-D vector of that length
a, b = np.random.randn(p), np.random.randn(p)

# `+` and `*` on numpy arrays act element by element
elementwise_sum = a + b
elementwise_product = a * b

print(f"a   = {a}")
print(f"b   = {b}")
print(f"a+b = {elementwise_sum}")
print(f"a*b = {elementwise_product}")

#### Dot (inner) product
$a\cdot b = \sum a_i \cdot b_i = a_0 \cdot b_0 + a_1 \cdot b_1 + \cdots$

In [None]:
# inner product: the sum of the element-wise products of a and b
print(f"a . b = {a.dot(b)}")

#### Matrix notation

$X = \begin{pmatrix}
 x_{0,0} & x_{0,1} & \dots & x_{0,p-1}\\
 x_{1,0} & x_{1,1} & \dots & x_{1,p-1}\\
 \vdots & \vdots & \ddots & \vdots \\
 x_{n-1,0} & x_{n-1,1} & \dots & x_{n-1,p-1}\\
 \end{pmatrix}$

In [None]:
# two arguments to randn give 2-D data: n rows by p columns
X = np.random.randn(n, p)

print(f"dim(X)={X.shape}\n")
print(f"first 10 rows of X\n\n{X[:10]}")

In [None]:
# a single element of X: row index 3, column index 1 (indices are 0-based)
X[3,1]

In [None]:
# np.matrix overloads `*` as the matrix product, which is what the examples below rely on
# (for plain ndarrays `*` is element-wise and `@` is the matrix product).
A = np.matrix([[1,2],[3,4]])
B = np.matrix([[5,6],[7,8]])

print(f"A\n{A}\n")
print(f"B\n{B}\n")
print(f"A+B\n{A+B}\n")
print(f"A*B, remember C_ij = row(A,i) . col(B,j)\n{A*B}\n")

print(f"B*A\n{B*A}\n")

print("NOTE: A*B != B*A in general\n")

# label the output as the transpose itself; A is not equal to its own transpose here
print(f"transpose(A)\n{np.transpose(A)}")

**NOTE**: to multiply matrices $D$ and $E$, the inner dimensions must agree. If $D$ is $q \times r$, then $E$ must be $r \times s$ for the product $D*E$ to be defined, and the result is $q \times s$.

In [None]:
# D is 2x3 and E is 3x4, so the inner dimensions (3 and 3) agree and D*E is 2x4
D = np.matrix([
    [1, 2, 3],
    [4, 5, 6],
])
E = np.matrix([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
])

print(f"dim(D)={D.shape}\n")
print(f"D\n{D}\n")

print(f"dim(E)={E.shape}\n")
print(f"E\n{E}\n")

print(f"D*E\n{D*E}\n")

... but ...

In [None]:
try:
    E*D
except ValueError as err:
    # Catch only the shape-mismatch error numpy raises for incompatible operands,
    # so unrelated failures (or a keyboard interrupt) are not silently swallowed.
    print(f"This results in error because dimensions don't match:\n\n{err}")

### Some Data

In [None]:
wage = pd.read_csv('Data/Wage.csv')
# rebind n and p to the size of the real data set: (rows, columns)
n, p = wage.shape
wage.head()

In [None]:
# summary statistics of wage (by default pandas reports only the numeric columns of a mixed frame)
wage.describe()

In [None]:
# pairwise plots of wage's variables, coloured by education;
# the trailing ';' suppresses the text repr of the returned object
sns.pairplot(wage, hue="education", palette="pastel");

In [None]:
# same pair plot restricted to age and wage, with larger panels;
# education is kept only so it can drive the colouring
sns.pairplot(
    wage[["age", "wage", "education"]],
    hue="education",
    palette="pastel",
    height=4,
    aspect=1.3,
);

In [None]:
# zoom in on the rows with wage above 250
high_wage_rows = wage.loc[wage["wage"] > 250, ["age", "wage", "education"]]
sns.pairplot(high_wage_rows, hue="education", palette="pastel", height=4, aspect=1.3);

In [None]:
# box plot of wage for each education level
sns.boxplot(data=wage, x="education", y="wage", palette="pastel");

In [None]:
# violin plot of wage for each race category
sns.violinplot(data=wage, x="race", y="wage", palette="pastel");

In [None]:
# scatter of logwage against age with seaborn's fitted regression line overlaid
sns.regplot(data=wage, x="age", y="logwage");

In [None]:
# Correlate the numeric columns only. Since pandas 2.0, DataFrame.corr() no longer drops
# non-numeric columns silently and raises on string columns such as education and race;
# selecting the numeric dtypes first behaves the same on old and new pandas.
wage.select_dtypes(include="number").corr()

In [None]:
# IPython help syntax: a leading `?` shows the documentation of the named object
?pd.DataFrame.corr

In [None]:
# the two columns whose correlation is worked out by hand below
x, y = wage["age"], wage["wage"]

In [None]:
# correlation coefficient: covariance of x and y divided by the product of their standard deviations
cov_xy = np.cov(x, y)[0, 1]  # off-diagonal entry of the 2x2 covariance matrix
cov_xy / (x.std() * y.std())

In [None]:
def ms(v):
    """Return the ``(mean, standard deviation)`` pair of ``v``.

    ``v`` may be any object exposing ``.mean()`` and ``.std()`` methods; the
    normalisation used for the standard deviation is whatever ``v.std()``
    defaults to for that type.
    """
    centre = v.mean()
    spread = v.std()
    return centre, spread

# (mean, std) summaries of the two series
xs, ys = ms(x), ms(y)

In [None]:
# xs and ys are each a (mean, std) tuple; show them one per line
print(xs,ys,sep="\n")

In [None]:
# ... as standardised variables (z-scores): subtract the mean, then divide by the standard deviation
(x_mean, x_std), (y_mean, y_std) = xs, ys
xx = (x - x_mean) / x_std
yy = (y - y_mean) / y_std

In [None]:
# overlay the distributions of xx and yy on the same axes, with a legend to tell them apart
# NOTE(review): sns.distplot has been deprecated since seaborn 0.11; consider moving to
# sns.histplot(..., kde=True, stat="density") once the minimum seaborn version allows — confirm
sns.distplot(xx, label='xx');
sns.distplot(yy, label='yy');
plt.legend();

In [None]:
# With both variables standardised, the correlation coefficient is simply their dot product
# divided by (number of observations - 1).
# The count is taken from xx itself rather than from the global `n`, which this notebook binds
# twice (first to 100, later to len(wage)) and so gives a wrong answer if cells run out of order.
np.dot(xx, yy) / (len(xx) - 1)