In [None]:
# %load standard_import.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns

from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline
plt.style.use('seaborn-white')

# Some Stats

- Expectation: 
$$\begin{eqnarray}\mu_X & =  & E[X] \\ & = & \sum w_i \cdot x_i\end{eqnarray}$$ where $x_i$ are samples of $X$, $w_i$ is a weight, or probability, of $x_i$ showing up, and $\sum w_i = 1$.

Usually, we look at $\mu_X$ as just the mean, and $w_i=\frac{1}{N}$, so $\mu_X = \frac{1}{N}\sum x_i$. Below, $\mu_X$ refers to the mean.

- Covariance: 
$$\begin{eqnarray}\sigma_{XY}^2 & = & E[(X-E[X])\cdot(Y-E[Y])] \\ & = & E[XY] - E[X]E[Y] \\ & = & \mu_{XY} - \mu_X\mu_Y\end{eqnarray}$$

- Variance:
$$\begin{eqnarray}\sigma_X^2 & = & Var[X] \\ & = & E[(X-E[X])^2] \\ & = & E[X^2]-E[X]^2 \\ & = & \mu_{X^2}-\mu_X^2\end{eqnarray}$$

- Standard deviation: Square root of variance, i.e., $\sigma_X$.

- Correlation:
$$r_{XY}=\frac{\sigma_{XY}^2}{\sigma_X\cdot\sigma_Y}$$

In [None]:
# Read the Wage data set (path is relative to the notebook's working directory)
# and display summary statistics for it.
wage = pd.read_csv(filepath_or_buffer='Data/Wage.csv')
wage.describe()

In [None]:
# The two variables whose correlation is studied below: x is age, y is wage.
x, y = wage["age"], wage["wage"]

The full, written-out computation of $r_{xy}$, straight from the definitions above!

In [None]:
# Pearson r built directly from the definitions, using population (1/N) moments:
#   covariance  = E[XY] - E[X]E[Y]
#   std. dev.   = sqrt(E[(X - E[X])^2])
mu_x = x.mean()
mu_y = y.mean()
cov_xy = (x * y).mean() - mu_x * mu_y
sigma_x = np.sqrt(((x - mu_x) ** 2).mean())
sigma_y = np.sqrt(((y - mu_y) ** 2).mean())
cov_xy / (sigma_x * sigma_y)

A bit simpler this way...

In [None]:
# Off-diagonal entry of the 2x2 covariance matrix, divided by the product of
# the two standard deviations.
np.cov(x, y)[0][1] / (y.std() * x.std())

... or this way!

In [None]:
# Pairwise Pearson correlations of the numeric columns.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
# silently and raises if any are present (presumably this CSV has categorical
# text columns - TODO confirm), so select the numeric columns explicitly.
wage.select_dtypes(include="number").corr()

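It is sometimes nice to look at these variables in "standardized" form (zero mean, unit variance), and we achieve this by "normalizing" the sample vectors, i.e., subtracting the mean and dividing by the standard deviation. Note that this rescales the data but does not change the shape of its distribution: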

In [None]:
# Standardize each sample: subtract its mean, then divide by its standard
# deviation (pandas' default .std(), i.e. the n-1 sample estimate).
xx = x.sub(x.mean()).div(x.std())
yy = y.sub(y.mean()).div(y.std())

In [None]:
# Scatter of age against log-wage; the low alpha keeps dense regions readable.
# NOTE(review): this plots "logwage" while the correlation cells above use
# "wage" - confirm which column is intended.
sns.scatterplot(
    x="age",
    y="logwage",
    data=wage,
    alpha=0.1,
);

Visualizing the distributions tells the fuller story, though...

In [None]:
# sns.distplot is deprecated (seaborn >= 0.11) and scheduled for removal;
# histplot with kde=True and stat="density" draws the same density-scaled
# histogram with a KDE overlay. alpha=0.4 matches distplot's default
# transparency so the two overlapping histograms stay distinguishable.
sns.histplot(xx, kde=True, stat="density", alpha=0.4, label='xx');
sns.histplot(yy, kde=True, stat="density", alpha=0.4, label='yy');
plt.legend();

When in standard normal form, the correlation coefficient is easier to think about: $E[xx\cdot yy]$

In [None]:
# Sample correlation of the standardized vectors: sum(xx * yy) / (n - 1).
n = xx.shape[0]
np.dot(xx, yy) * (1 / (n - 1))

In [None]:
# np.correlate is the signal-processing cross-correlation: in its default
# 'valid' mode, two equal-length inputs yield a single raw dot product
# sum(xx * yy), NOT a correlation coefficient. Dividing by (n - 1) turns it
# into Pearson's r for these standardized vectors, matching the cell above.
np.correlate(xx, yy) / (len(xx) - 1)

In [None]:
# 2x2 covariance matrix of x and y: variances on the diagonal, the covariance
# off the diagonal.
np.cov(m=x, y=y)