# Principal Component Analysis - an example in 2 dimensions

This example is provided in the excellent book by Daniel S. Wilks, "Statistical Methods in the Atmospheric Sciences", Academic Press.

In [None]:
# Define the tools we are going to need today
%matplotlib inline
import matplotlib.pyplot as plt  # plotting library
import numpy as np  # numerical library
from matplotlib.mlab import PCA
import pandas as pd
import seaborn as sns
sns.set_context('talk')
sns.set_style('ticks')
pd.options.display.float_format = '{:.2f}'.format
plt.rcParams['figure.figsize'] = (12, 5)  # Default plot size

## Data 

In [None]:
# Download the example data set from its URL and parse it into a DataFrame
import io, requests
url = 'https://www.dropbox.com/s/h5ldd6gvx5ub808/wilks_data.csv?dl=1'
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of handing an error page to the CSV parser.
response.raise_for_status()
s = response.content
df = pd.read_csv(io.StringIO(s.decode('utf-8')), index_col=0)

These are the daily minimum temperatures at two locations close to New York. They are in Fahrenheit so we convert them:

In [None]:
# Keep only the two minimum-temperature series and convert Fahrenheit to Celsius
station_columns = ['Ithaca Min Temp', 'Canandaigua Min Temp']
df = df.loc[:, station_columns].sub(32).div(1.8)

In [None]:
# Line plot of both series; the y label is set on the axes that pandas returns
df.plot().set_ylabel('Min Temp °C');

## Center and plot 

In [None]:
# Remove each column's mean so that both series are centred on zero
df = df.sub(df.mean(), axis='columns')

In [None]:
# Scatter plot of the two centred series on a square, symmetric window
f, ax = plt.subplots(nrows=1, figsize=(8, 8))
df.plot.scatter(x='Ithaca Min Temp', y='Canandaigua Min Temp', s=50, ax=ax)
ax.set_xlim(-15, 15)
ax.set_ylim(-15, 15);

We see that the standard deviation is higher for Ithaca than Canandaigua, but the two correlate well:

In [None]:
# Sample standard deviation of each column (ddof=1 is the pandas default)
df.std(ddof=1)

In [None]:
# Pearson correlation matrix of the columns (Pearson is the pandas default)
df.corr(method='pearson')

## Principal component analysis: eigenvectors, or "EOF"

This will do the computations for us:

In [None]:
# Fit the PCA on the two station columns only, without standardizing.
# Selecting the columns by name keeps this cell re-runnable: a later cell adds
# 'pc1' and 'pc2' to df, and plain df.values would then feed four columns into
# the decomposition.
pc = PCA(df[['Ithaca Min Temp', 'Canandaigua Min Temp']].values, standardize=False)

The two eigenvectors e1 and e2 are found in the Wt attribute:

In [None]:
# Print the two eigenvectors, one labelled row of pc.Wt per line
for label, eigenvector in zip(('e1', 'e2'), (pc.Wt[0], pc.Wt[1])):
    print(label, eigenvector)

In [None]:
# Scatter plot of the centred data with both eigenvectors drawn from the origin
f, ax = plt.subplots(nrows=1, figsize=(8, 8))
df.plot.scatter(x='Ithaca Min Temp', y='Canandaigua Min Temp', s=50, ax=ax)
ax.set_xlim(-15, 15)
ax.set_ylim(-15, 15)
# Stretch the unit-length eigenvectors by `fac` so they are visible on the plot;
# the first one is drawn with its sign flipped.
fac = 14.
e1, e2 = pc.Wt[0], pc.Wt[1]
ax.arrow(0, 0, -fac * e1[0], -fac * e1[1], head_width=0.5, head_length=0.8)
ax.arrow(0, 0, fac * e2[0], fac * e2[1], head_width=0.5, head_length=0.8);

## Principal components

The two principal component time series can be retrieved too:

In [None]:
# Store the two principal component series (columns of pc.Y) next to the data
df['pc1'], df['pc2'] = pc.Y[:, 0], pc.Y[:, 1]

In [None]:
# Squared correlation between every pair of columns
df.corr().pow(2)

In [None]:
# Line plot of the two principal component series
df.loc[:, ['pc1', 'pc2']].plot();

## Display the principal components in the original phase space

In [None]:
# Map each principal component back into the original (x, y) space: the outer
# product of an eigenvector with its component series gives one row of x
# coordinates and one row of y coordinates. np.outer takes the series length
# from the data, so no sample count is hard-coded.
pcx1, pcy1 = np.outer(pc.Wt[0], pc.Y[:, 0])
pcx2, pcy2 = np.outer(pc.Wt[1], pc.Y[:, 1])

In [None]:
# Scatter plot of the centred data, the eigenvectors and the back-projected components
f, ax = plt.subplots(nrows=1, figsize=(8, 8))
df.plot.scatter(x='Ithaca Min Temp', y='Canandaigua Min Temp', s=50, ax=ax)
ax.set_xlim(-15, 15)
ax.set_ylim(-15, 15)
# Stretch the unit-length eigenvectors by `fac` so they are visible on the plot;
# the first one is drawn with its sign flipped.
fac = 14.
e1, e2 = pc.Wt[0], pc.Wt[1]
ax.arrow(0, 0, -fac * e1[0], -fac * e1[1], head_width=0.5, head_length=0.8)
ax.arrow(0, 0, fac * e2[0], fac * e2[1], head_width=0.5, head_length=0.8)
# The principal components, one colour per component
ax.scatter(pcx1, pcy1, c='C3', s=50)
ax.scatter(pcx2, pcy2, c='C2', s=50);