In [None]:
import pandas as pd
import numpy as np
import numpy.linalg as la

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

import matplotlib.pyplot as plt
from matplotlib import cm as cm
import seaborn as sns
sns.set(font_scale=2)
plt.style.use('seaborn-whitegrid')
%matplotlib inline

from io import StringIO

# Principal Component Analysis

In machine learning and data science, Principal Component Analysis (PCA) is a method of analysing datasets to obtain an orthogonal basis that best captures most of the variance of our data.  It is often used to remove extra features or dimensions in large-dimensional datasets, because in a similar vein to SVD, PCA will give us a set of axes with both large contributions and small contributions to our data.

## Example 1: Breast cancer dataset

Importing the data set for characteristics of tumor cells (this is the same dataset used for the MP)

In [None]:
# Column names for the tumor data file: two identifier columns followed by one
# column per (measurement, statistic) pair, in measurement-major order.
# NOTE(review): the standard WDBC file lists all ten means first, then the ten
# standard errors, then the ten worst values -- confirm that this
# measurement-major ordering matches the layout of breast-cancer-train.dat.
params = ["radius", "texture", "perimeter", "area",
          "smoothness", "compactness", "concavity",
          "concave points", "symmetry", "fractal dimension"]
stats = ["(mean)", "(stderr)", "(worst)"]
labels = ["patient ID", "Malignant/Benign"]
labels += [p + " " + s for p in params for s in stats]

# The file is read without a header row; the names built above label the columns.
tumor_data = pd.read_csv("additional_files/breast-cancer-train.dat", header=None, names=labels)

In [None]:
# Display the loaded table to inspect its columns.
tumor_data

### 1) Creating a smaller subset of points:
Selecting a subset of the data for better visualization and understanding of the method. We will start with six patients and only two of the features:

In [None]:
# Small working subset: the diagnosis column plus two features, restricted to
# the six rows at positions 272-277.
subset_columns = ["Malignant/Benign", 'smoothness (mean)', 'radius (mean)']
new_data = pd.DataFrame(tumor_data[subset_columns].iloc[272:278])
new_data

In [None]:
# Scatter plot of the six selected patients, coloured by diagnosis.
# x, y and data are passed by keyword: recent seaborn releases no longer accept
# them positionally.
g1 = sns.lmplot(x='smoothness (mean)', y='radius (mean)', data=new_data,
                hue="Malignant/Benign", scatter_kws={"s": 180}, fit_reg=False, height=8)
ax = g1.axes[0,0]
ax.axis('equal')

# Label every point with its row label, nudged slightly off the marker.
# Only `fontsize` is given: `size` is an alias of the same text property and
# matplotlib rejects a call that supplies both.
for i in new_data.index:
    x = new_data.loc[i, 'smoothness (mean)'] + 0.1
    y = new_data.loc[i, 'radius (mean)'] + 0.1
    ax.text(x, y, str(i), horizontalalignment='left', color='black', weight='semibold', fontsize=16)

### 2) Shift the dataset to center the data:
The first step is to determine the "center" of the dataset (the mean value of each feature):

In [None]:
# "Center" of the subset: the mean of each feature, printed together with each
# feature's standard deviation.
smooth_col = new_data['smoothness (mean)']
radius_col = new_data['radius (mean)']

mean_smooth = smooth_col.mean()
mean_radius = radius_col.mean()

print(mean_smooth, mean_radius)
print(smooth_col.std(), radius_col.std())

In [None]:
# Same scatter plot, now with the coordinate axes and the feature means drawn in.
# x, y and data are passed by keyword: recent seaborn releases no longer accept
# them positionally.
g1 = sns.lmplot(x='smoothness (mean)', y='radius (mean)', data=new_data,
                hue="Malignant/Benign", scatter_kws={"s": 180}, fit_reg=False, height=8)
ax = g1.axes[0,0]
plt.xlim(-2,24)
plt.ylim(-2,24)

# Label every point with its row label. Only `fontsize` is given: `size` is an
# alias of the same text property and matplotlib rejects a call with both.
for i in new_data.index:
    x = new_data.loc[i, 'smoothness (mean)'] + 0.1
    y = new_data.loc[i, 'radius (mean)'] + 0.1
    ax.text(x, y, str(i), horizontalalignment='left', color='black', weight='semibold', fontsize=16)

# Red marker at the mean point of the data
ax.scatter(mean_smooth,mean_radius,  s=180, c='r', marker=(5, 2))
# Black lines: the original coordinate axes
ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')
# Dashed magenta lines: the mean of each feature
ax.axhline(y=mean_radius, color='m',linestyle='--')
ax.axvline(x=mean_smooth, color='m',linestyle='--')

And then we "center" the dataset, such that each feature has zero mean

In [None]:
# Subtract each feature's own mean so the shifted columns are centred at zero.
for feature in ('smoothness', 'radius'):
    raw_column = new_data[feature + ' (mean)']
    new_data[feature + ' (zero mean)'] = raw_column - raw_column.mean()

# Sanity check: print the mean of each shifted column.
print(new_data['smoothness (zero mean)'].mean())
print(new_data['radius (zero mean)'].mean())

In [None]:
# Inspect the shifted smoothness column.
new_data['smoothness (zero mean)']

Plot the centered data:

In [None]:
# Scatter plot of the centred data.
# x, y and data are passed by keyword: recent seaborn releases no longer accept
# them positionally.
g1 = sns.lmplot(x='smoothness (zero mean)', y='radius (zero mean)', data=new_data,
                hue="Malignant/Benign", scatter_kws={"s": 180}, fit_reg=False, height=8)
ax = g1.axes[0,0]
ax.axis('equal')

# Label every point with its row label. Only `fontsize` is given: `size` is an
# alias of the same text property and matplotlib rejects a call with both.
for i in new_data.index:
    x = new_data.loc[i, 'smoothness (zero mean)'] + 0.1
    y = new_data.loc[i, 'radius (zero mean)'] + 0.1
    ax.text(x, y, str(i), horizontalalignment='left', color='black', weight='semibold', fontsize=16)

# Red marker at the origin, which is now the mean of the data
ax.scatter(0,0,  s=200, c='r', marker=(5, 2))

ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')

### 3) Get covariance matrix

PCA wants to find the directions of maximum variance. For that, we will need to first define the covariance matrix:

$ M $: total number of data points

$ N $ : number of features

$Cov({\bf A}) = \frac{1}{M-1} {\bf A}^T {\bf A} $

In [None]:
# centered data
# A keeps only the two zero-mean columns of new_data
# (one row per patient, one column per feature).
A = new_data[['smoothness (zero mean)', 'radius (zero mean)']]
A

In [None]:
# M = number of rows of A (data points), N = number of columns (features)
M,N = A.shape

In [None]:
# Using the definition
# Cov(A) = A^T A / (M - 1); A is built from the zero-mean columns, so no
# further mean subtraction is applied here.
cov_matrix = (1/(M-1))*A.T@A
print(cov_matrix)

In [None]:
# Or using the pandas built-in method (shown for comparison with cov_matrix above)
A.cov()

The variances are on the diagonal (co-variance of a variable with itself), and the sum of the 2 values is the overall variability:

In [None]:
# Total variability: the trace (sum of the diagonal entries) of the covariance matrix.
np.trace(cov_matrix.values)

In [None]:
# Share of the total variance carried by the first feature of A: its diagonal
# entry of the covariance matrix divided by the sum of the diagonal.
# Computed from cov_matrix rather than from numbers re-typed from earlier output.
cov_matrix.iloc[0, 0] / np.diag(cov_matrix).sum()

PCA replaces the original variables with new variables, called principal components, which are orthogonal (i.e. they have zero covariance with each other) and have variances in decreasing order. To accomplish this, we will use the diagonalization of the covariance matrix:

In [None]:
# Eigen-decomposition of the covariance matrix:
# l holds the eigenvalues and column j of u is the eigenvector paired with l[j].
# NOTE(review): la.eig does not promise sorted eigenvalues; the cells below
# treat index 0 as the largest one -- confirm against the printed output.
l,u = la.eig(cov_matrix)
print(l)
print(u)

$$cov({\bf A}) = \begin{bmatrix} -0.40237101 & -0.91547669 \\ -0.91547669 &  0.40237101 \end{bmatrix} \begin{bmatrix} 27.71798127 & 0 \\ 0 &  3.6414725  \end{bmatrix} \begin{bmatrix} -0.40237101 & -0.91547669 \\ -0.91547669 &  0.40237101 \end{bmatrix}^T$$

In [None]:
# Share of the total variance explained by the direction of largest variance:
# the largest eigenvalue divided by the sum of all eigenvalues.
# Computed from `l` rather than from numbers re-typed from earlier output.
l.max() / l.sum()

Note that the diagonal sum is still 31.359, which says that the two components account for all the variability.


PCA finds, in the data space, the dimension (direction) with the largest variance out of the overall variance.


In this example, if we reduce the dimension space to include only one variable, the first principal component 27.718, accounts for 88% of the variability

Hence, the largest eigenvalue of the covariance matrix corresponds to the largest variance of the dataset, and the associated eigenvector is the direction of maximum variance. For our example:

In [None]:
# Centred data with the eigenvectors of the covariance matrix drawn on top.
# x, y and data are passed by keyword: recent seaborn releases no longer accept
# them positionally.
g1 = sns.lmplot(x='smoothness (zero mean)', y='radius (zero mean)', data=new_data,
                hue="Malignant/Benign", scatter_kws={"s": 180}, fit_reg=False, height=8)
ax = g1.axes[0,0]
ax.axis('equal')

# Label every point with its row label. Only `fontsize` is given: `size` is an
# alias of the same text property and matplotlib rejects a call with both.
for i in new_data.index:
    x = new_data.loc[i, 'smoothness (zero mean)'] + 0.1
    y = new_data.loc[i, 'radius (zero mean)'] + 0.1
    ax.text(x, y, str(i), horizontalalignment='left', color='black', weight='semibold', fontsize=16)

ax.scatter(0,0,  s=200, c='r', marker=(5, 2))

ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')

s = 3  # arrow length multiplier, so the eigenvectors are long enough to see

# Column J of u is the J-th principal direction: red for J = 0, magenta for J = 1.
for J, arrow_colour in ((0, 'r'), (1, 'm')):
    x = u[0,J]
    y = u[1,J]
    ax.arrow(0,0,s*x,s*y,color='black',head_width=0.1, head_length=0.1, fc=arrow_colour, ec=arrow_colour, lw=5)

### 4) Singular value decomposition

We know that the eigenvectors of ${\bf A}^T{\bf A}$ are the right singular vectors of ${\bf A}$, that is, the columns of ${\bf V}$ from the SVD of ${\bf A}$ (equivalently, the rows of ${\bf V}^T$).

Hence, instead of having to calculate the covariance matrix and solve an eigenvalue problem, we will instead get the reduced form of the SVD!

In [None]:
# Show the centred data matrix that is about to be decomposed.
A

In [None]:
# Reduced SVD of the centred data matrix.
U, S, Vt = la.svd(A, full_matrices=False)

# Squared singular values are the eigenvalues of A^T A, i.e. the covariance
# eigenvalues multiplied by (M - 1); the proportions between them are the same.
variances = np.square(S)
print(variances)

# principal directions: the first and second rows of Vt
pc1_vec, pc2_vec = Vt[0], Vt[1]

Note that we don't need to worry about the constant value from the covariance matrix that we are disregarding. The variance values change, but their proportionality remains:

In [None]:
# Fraction of the total variance captured by the first principal direction
variances[0]/variances.sum()

### 5) Plotting the principal directions using the right singular vectors

In [None]:
# Centred data with the principal directions from the SVD drawn on top.
# x, y and data are passed by keyword: recent seaborn releases no longer accept
# them positionally.
g1 = sns.lmplot(x='smoothness (zero mean)', y='radius (zero mean)', data=new_data,
                hue="Malignant/Benign", scatter_kws={"s": 180}, fit_reg=False, height=8)
ax = g1.axes[0,0]

# Label every point with its row label. Only `fontsize` is given: `size` is an
# alias of the same text property and matplotlib rejects a call with both.
for i in new_data.index:
    x = new_data.loc[i, 'smoothness (zero mean)'] + 0.1
    y = new_data.loc[i, 'radius (zero mean)'] + 0.1
    ax.text(x, y, str(i), horizontalalignment='left', color='black', weight='semibold', fontsize=16)

ax.scatter(0,0,  s=200, c='r', marker=(5, 2))
ax.axis('equal')

ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')

s = 3  # arrow length multiplier, so the direction vectors are long enough to see

# First principal direction in red, second in magenta.
for direction, arrow_colour in ((pc1_vec, 'r'), (pc2_vec, 'm')):
    x = direction[0]
    y = direction[1]
    ax.arrow(0,0,s*x,s*y,color='black',head_width=0.1, head_length=0.1, fc=arrow_colour, ec=arrow_colour, lw=5)


### 6) Cumulative explained variance

In [None]:
# Squared singular values computed in the SVD cell
variances

In [None]:
# The two principal directions (rows of Vt)
print(pc1_vec)
print(pc2_vec)

In this example, the largest variance is 138.6 and the direction of this principal component is given by the vector `pc1_vec`.

The second largest variance is 18.2 and the direction of this principal component is given by the vector `pc2_vec`.

In a general problem, we would have many principal components. How can we easily visualize these components and decide how many we will keep in our reduced feature space? 

In [None]:
# Percentage of the total variance explained by each principal component, and
# the running total of those percentages.
tot = sum(variances)
var_exp = [(i / tot)*100 for i in variances]
cum_var_exp = np.cumsum(var_exp)

plt.bar(range(len(var_exp)),var_exp, align='center', label='individual explained variance')
plt.step(range(len(var_exp)), cum_var_exp, 'r', where='mid', label='cumulative explained variance')
# The plotted values are percentages (multiplied by 100 above), not ratios.
plt.ylabel('Explained variance (%)')
plt.xlabel('Principal components')
# The label= arguments above are only shown once a legend is drawn.
plt.legend(loc='best');


In [None]:
# Change of basis: express every centred data point in the principal directions
# (the columns of Vt.T), then store the two new coordinates next to the data.
Xstar = np.dot(A.values, Vt.T)
for k, pc_name in enumerate(('pc1', 'pc2')):
    new_data[pc_name] = Xstar[:, k]

In [None]:
# The six patients plotted in principal-component coordinates.
# x, y and data are passed by keyword: recent seaborn releases no longer accept
# them positionally.
g1 = sns.lmplot(x='pc1', y='pc2', data=new_data, hue="Malignant/Benign",
                fit_reg=False, height=8, scatter_kws={"s": 180})
ax = g1.axes[0,0]
ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')
ax.axis('equal')

# Label every point with its row label. Only `fontsize` is given: `size` is an
# alias of the same text property and matplotlib rejects a call with both.
for i in new_data.index:
    x = new_data.loc[i, 'pc1'] + 0.1
    y = new_data.loc[i, 'pc2'] + 0.1
    ax.text(x, y, str(i), horizontalalignment='left', color='black', weight='semibold', fontsize=16)


### 7) Complete dataset

But since we have only two features here, PCA is not really helping! Let's go back to the original example:

In [None]:
# First rows of the full tumor table
tumor_data.head()

In [None]:
# Every column from position 2 onward (i.e. skipping the first two columns),
# taken out of pandas via .values
A_large = tumor_data.iloc[:,2:].values

type(A_large)

In [None]:
# Mean taken without an axis argument.
# NOTE(review): a per-feature mean would be A_large.mean(axis=0), matching the
# axis=0 used for the standard deviation and centering below -- confirm which
# was intended.
A_large.mean()

#### Center the mean

In [None]:
# Standard deviation of each column (axis=0 reduces over the rows)
A_large.std(axis=0)

In [None]:
# Standardise every column: subtract its mean and divide by its standard
# deviation. (Centering only, without the division, would be
# X = A_large - A_large.mean(axis=0).)
feature_mean = A_large.mean(axis=0)
feature_std = A_large.std(axis=0)
X = (A_large - feature_mean) / feature_std

# Check the per-column standard deviation of the result.
print(X.std(axis=0))

In [None]:
# Reduced SVD of the standardised data; the squared singular values are
# proportional to the variance along each principal direction.
u,s,vt = la.svd(X,full_matrices=False)

variances = s**2

# Percentage of the total variance per component, and its running total.
tot = sum(variances)
var_exp = [(i / tot)*100 for i in variances]
cum_var_exp = np.cumsum(var_exp)

plt.bar(range(len(var_exp)),var_exp, align='center', label='individual explained variance')
plt.step(range(len(var_exp)), cum_var_exp, 'r', where='mid', label='cumulative explained variance')
# The plotted values are percentages (multiplied by 100 above), not ratios.
plt.ylabel('Explained variance (%)')
plt.xlabel('Principal components')
# The label= arguments above are only shown once a legend is drawn.
plt.legend(loc='best');

In [None]:
# Cumulative explained variance (in percent) after 1, 2, ... components
cum_var_exp

#### Get the most important principal directions, and transform the original dataset

In [None]:
# Keep the first three rows of vt and transpose them, so that Vstar has one
# column per retained principal direction.
Vstar = vt[:3,:].T

In [None]:
# Coordinates of every row of X in the three retained principal directions
Xstar=(X@Vstar) # change of basis

Xstar.shape

In [None]:
# Work on a copy so the original table stays untouched, and append the three
# principal-component coordinates as new columns pc1, pc2, pc3.
tumor_data_new = tumor_data.copy()

for k in range(3):
    tumor_data_new['pc' + str(k + 1)] = Xstar[:, k]

In [None]:
# All patients in the plane of the first two principal components, coloured by
# diagnosis. x, y and data are passed by keyword: recent seaborn releases no
# longer accept them positionally.
g1 = sns.lmplot(x='pc1', y='pc2', data=tumor_data_new, hue="Malignant/Benign",
                fit_reg=False, height=8, scatter_kws={"s": 180})
ax = g1.axes[0,0]
ax.axhline(y=0, color='k')
ax.axvline(x=0, color='k')
ax.axis('equal')


In [None]:
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# 3D scatter of the first three principal components.
# Figure.gca() no longer accepts keyword arguments in current matplotlib, so the
# 3D axes are created with add_subplot(..., projection='3d') instead.
threedee = plt.figure().add_subplot(111, projection='3d')
threedee.scatter(tumor_data_new['pc1'], tumor_data_new['pc2'], tumor_data_new['pc3'],c = tumor_data_new["Malignant/Benign"])

#### Plot the weight for each feature in the first principal component:

In [None]:
# Column names at positions 2..31, used as x tick labels for the weights plot
feature_names = tumor_data_new.columns[2:32]

In [None]:
# Transpose so that each principal direction is a column of V
V = vt.T



In [None]:
# Bar chart of the entries of the first column of V: one bar per feature.
plt.figure(figsize=(14,6))
plt.bar(feature_names,V[:,0])
# Rotate the feature names so they do not overlap
plt.xticks(rotation=90);
plt.title('importance of each attribute in ${\\bf p}_1$');

## Example 2: Principal Components of FIFA Dataset
We will be looking at the [FIFA 2018 Dataset](https://www.kaggle.com/thec03u5/fifa-18-demo-player-dataset/kernels).  While this is a video game, the developers strive to make their game as accurate as possible, so this data reflects the skills of the real-life players.

Let's load the data frame using `pandas`.

In [None]:
# Load the FIFA table; the first CSV column is used as the row index.
df = pd.read_csv("additional_files/FIFA_2018.csv",encoding = "ISO-8859-1",index_col = 0, low_memory = False)

We can take a brief look at the data by calling `df.head()`.  The first 34 columns are attributes that describe the behavior (e.g. aggression) or the skills (e.g. ball control), of each player.  The final columns show the player's position, name, nationality, and the club they play for.

The four positions are forward (FWD), midfielder (MID), defender (DEF), and goalkeeper (GK).

In [None]:
# First rows of the FIFA table
df.head()

A higher number signifies that an attribute is more prevalent for that player.  Looking at the above rankings, Player 0 (Cristiano Ronaldo) has very good ball control and composure, but is not overly aggressive.

#### Correlation Matrix
We can visualize the correlation matrix for these variables across all players using a "heatmap".  Calling `df.corr()` provides this correlation matrix, and `seaborn.heatmap` will do the plotting.

In [None]:
# Correlation heatmap of the numeric attributes.
# The non-numeric columns (position, name, ...) are dropped explicitly: recent
# pandas releases raise in DataFrame.corr() when string columns are present,
# whereas older releases skipped them silently.
plt.figure(figsize=(15,8))
numeric_columns = df.select_dtypes(include=[np.number])
# `linewidths` is the documented seaborn.heatmap parameter for the cell borders.
sns.heatmap(numeric_columns.corr(), vmin=-1.0, vmax=1.0, linewidths=0.25, cmap='coolwarm');

This heatmap is dark red whenever two variables are positively correlated, and dark blue when they are negatively correlated.  For example, "Sprint Speed" and "Acceleration" are positively correlated.  "Balance" and "Strength" are negatively correlated however.

Notice across the diagonal, all rectangles are dark red.  This is to be expected, as any variable is perfectly correlated with itself.

Also notice that all Goal-Keeping skills are positively correlated with each other, but are negatively correlated with nearly all the other variables.  Maybe we can compress these into a single component/feature through principal component analysis.

# Principal Component analysis

Recall that Principal Component Analysis (PCA) projects high-dimensional data into a low-dimensional representation by finding directions of maximal variance.

Let's first create a new dataframe that includes only the attributes of each player (and not the last four columns of `df`). Store this new dataframe as a variable `X`. 


In [None]:
# All columns except the last four, copied so later edits to df do not affect X
X = df.iloc[:,:-4].copy()
X

We can get all the attribute names and store them as `labels` by using `.columns.values`

In [None]:
# Attribute names as a NumPy array, in the same order as the columns of X
labels = X.columns.values
labels

To perform PCA, we first shift the data so that each attribute has zero mean, then compute the Singular Value Decomposition (SVD) of the resulting data matrix.

Create the data frame `A` where each attribute has zero mean.  Should we ensure each row has zero mean, or each column?

In [None]:
# Subtract each column's mean, then show the column means of the result as a check.
A = X - X.mean()
A.mean()

Now compute the SVD of the resulting matrix.  Make sure you compute the *reduced* SVD.

Once you have computed the SVD, you can plot the fraction of explained variance for each singular value

\begin{equation}
\frac{\sigma_i^2}{\sum_{k=1}^r\sigma_k^2} \hspace{7mm} i = 1,2,\dots,r
\end{equation}

as well as the cumulative explained variance

\begin{equation}
\frac{\sum_{k=1}^i \sigma_k^2}{\sum_{k=1}^r\sigma_k^2} \hspace{7mm} i = 1,2,\dots,r
\end{equation}

You can create a bar plot of the fraction of explained variance for each singular value using `plt.bar`, and a standard line plot for the cumulative explained variance.

In [None]:
# Reduced SVD of the centred attribute matrix; the columns of V are the
# principal directions.
U, S, Vt = np.linalg.svd(A, full_matrices = False)
V = Vt.T

# Fraction of the total variance explained by each component, and its running total.
variance = S**2
sum_var = sum(variance)
var_exp = [v/sum_var for v in variance]
cum_var = np.cumsum(var_exp)

# Size the x axis from the data instead of hard-coding the number of components.
n_components = len(var_exp)

plt.figure(figsize=(10,6))
plt.bar(range(n_components),var_exp,label='individual explained variance')
plt.plot(range(n_components),cum_var,'ro-', label='cumulative explained variance')
plt.legend(loc=5)
plt.xlabel("components")
# The plotted values are fractions between 0 and 1, not percentages.
plt.ylabel("fraction of variance")
plt.show()

In [None]:
# Cumulative fraction of explained variance after 1, 2, ... components
cum_var

You should see from the graph that the first principal component is responsible for nearly 60% of the variance, and the first two principal components have well over 70%.

Recall from the SVD that $\mathbf{A}\mathbf{v}_i = \sigma_i\mathbf{u}_i$.  Writing the columns of $\mathbf{A}$ as $\mathbf{a}_k$, this means that:

\begin{equation}
\mathbf{v}_i^{(1)}\begin{bmatrix}\vdots \\ \mathbf{a}_1 \\ \vdots\end{bmatrix} + \mathbf{v}_i^{(2)}\begin{bmatrix}\vdots \\ \mathbf{a}_2 \\ \vdots\end{bmatrix} + \dots + \mathbf{v}_i^{(n)}\begin{bmatrix}\vdots \\ \mathbf{a}_n \\ \vdots\end{bmatrix} = \sigma_i\mathbf{u}_i
\end{equation}

where $\mathbf{v}_i^{(j)}$ is the $j$-th component of $\mathbf{v}_i$. Thus if we define the principal components as 

$${\bf p}_i = \sigma_i\mathbf{u}_i,$$ 


the $i$-th column of $\mathbf{V}$ describes the projection of each attribute onto that principal direction.

We can visualize the weight of each attribute to a given principal component by plotting the entries of the corresponding column of ${\bf V}$. For example, the plot below illustrates the "importance" of each attribute to the first principal component (${\bf p}_1$)



In [None]:
# Bar chart of the first column of V: one bar per attribute name in `labels`.
plt.figure(figsize=(14,6))
plt.bar(labels,V[:,0])
# Rotate the attribute names so they do not overlap
plt.xticks(rotation=90);
plt.title('importance of each attribute in ${\\bf p}_1$');

In [None]:
# Same chart for the second column of V (second principal direction).
plt.figure(figsize=(14,6))
plt.bar(labels,V[:,1])
# Rotate the attribute names so they do not overlap
plt.xticks(rotation=90);
plt.title('importance of each attribute in ${\\bf p}_2$');

Now, let's add two new columns to the original dataframe `df`, with headers `pc1` and `pc2`. 

Use the expression above to evaluate the first two principal components ${\bf p}_1$ and ${\bf p}_2$

In [None]:
# clear
# Principal components p_i = sigma_i * u_i: column i of U scaled by S[i],
# stored as new columns of df.
df['pc1'] = U[:,0]*S[0]
df['pc2'] = U[:,1]*S[1]
df.head()

In [None]:
# First rows of df, which now include the pc1 and pc2 columns
df.head()

Let's plot the data with these first two principal components.

In [None]:
# Every player in the plane of the first two principal components, coloured by
# position; small translucent markers keep the dense cloud readable.
scatter_style = {'s': 14, 'alpha': 0.5}
g = sns.lmplot(data=df, x="pc1", y="pc2", hue="Position",
               fit_reg=False, legend=True, height=11, aspect=2,
               scatter_kws=scatter_style)

# Dashed reference lines through the origin
ax = g.axes[0,0]
ax.axvline(x=0, color='k', ls='--')
ax.axhline(y=0, color='k', ls='--')
plt.show()

It looks like the first principal axis determines whether a player is a goalkeeper or not.  We should double-check to make sure.

What are the attributes of $A$ that are most positively correlated with the first principal component? 

We can answer that by looking at the plot of coefficients above. Or we can do this in a systematic way, by sorting the entries of the column of ${\bf V}$ and finding the ones with highest positive values. 

Find the first 5 attributes, and print their corresponding weights.

In [None]:
# Attribute indices sorted by their weight in the first principal direction
# (ascending), so the last five entries are the most positive weights.
ind = np.argsort(V[:,0])
print(ind)
print(labels[ind[-5:]])
# The exercise also asks for the weights themselves, in the same order as the names.
print(V[ind[-5:],0])

You can see that all the goalkeeper attributes are positively correlated with the first principal component.  However, all other attributes, beginning with "Strength" are negatively correlated.  Try plotting the projection of "GK reflexes" onto the first two principal components

In [None]:
# Attribute names, to read off the position of each one
labels

In [None]:
# Position of the "Finishing" attribute within labels
np.where(labels == "Finishing")

In [None]:
# Position of the "GK reflexes" attribute within labels
np.where(labels == "GK reflexes")

In [None]:
# Players in the (pc1, pc2) plane with two attribute directions drawn on top.
g = sns.lmplot(x = "pc1", y = "pc2", data = df, hue = "Position", fit_reg=False, height=11, aspect=2, legend=True,
            markers=["o", "x","^","s"],palette=dict(FWD="g", GK="orange", MID="r", DEF="m"))
ax = g.axes[0,0]
ax.axvline(x=0,color='k', ls = '--')
ax.axhline(y=0,color='k', ls = '--')

s = 400   # this will scale the size of the arrow plot

# The attributes are looked up by name rather than by hard-coded position, so
# each arrow is guaranteed to match the attribute named here.
for attribute in ("GK reflexes", "Finishing"):
    J = np.where(labels == attribute)[0][0]
    x = V[J,0]    # weight of this attribute in the first principal direction
    y = V[J,1]    # weight of this attribute in the second principal direction

    # make an arrow from the origin to a point at (x,y)
    ax.arrow(0,0,s*x,s*y,color='black',width=1)
    ax.text(x*s*1.05,y*s*1.05,labels[J],fontsize=24)

If you plot any other of the GK attributes, they will essentially overlap with GK reflexes.  Check that, by changing the variable `J` above to take the values (11,12,13,14).

Make the same plot as above, but now take a look at other attributes. In the same figure, plot the projections for the attributes in columns [1,8,9,16,28,31]. 

Do you think the results make sense? 

In [None]:
#clear

# Players in the (pc1, pc2) plane, one marker and colour per position.
position_palette = dict(FWD="g", GK="orange", MID="r", DEF="m")
g = sns.lmplot(data=df, x="pc1", y="pc2", hue="Position", palette=position_palette,
               markers=["o", "x", "^", "s"], fit_reg=False, height=11, aspect=2, legend=True)
ax = g.axes[0,0]
ax.axvline(x=0, color='k', ls='--')
ax.axhline(y=0, color='k', ls='--')

s = 300   # multiplier that stretches the arrows to the scale of the scatter

# One arrow per selected attribute: its weights in the first two principal
# directions give the arrow's direction, and its name is written past the tip.
for J in [1,8,9,16,28,31]:
    x, y = V[J,0], V[J,1]
    tip_x, tip_y = s*x, s*y
    ax.arrow(0, 0, tip_x, tip_y, color='black', width=1)
    ax.text(x*s*1.5, y*s*1.1, labels[J], fontsize=24)

## Remove data and re-do PCA

The first principal component seems to mainly dictate whether a player is a goal-keeper or not.  To find out more about the data, we can drop all goal-keepers and repeat PCA.

We first create a new data-frame with all goal-keepers removed:

In [None]:
# Keep only the rows whose Position is not "GK"; .copy() makes df2 independent of df
df2 = df[df["Position"] != "GK"].copy()

Now we remove all the columns associated with the attributes that are mostly associated with goal-keepers. We also remove the columns with `pc1` and `pc2`

In [None]:
# Remove the goalkeeper attributes and the principal components computed on the
# full data set. The columns are named with the `columns=` keyword: passing the
# axis as a second positional argument is no longer accepted by recent pandas.
df2 = df2.drop(columns=['GK diving',
 'GK handling',
 'GK kicking',
 'GK positioning',
 'GK reflexes','pc1','pc2'])

Repeat all the steps from the previous analysis: shift to zero-mean, obtain svd, plot explained variances.

In [None]:
# clear

# Attribute columns only (everything except the last four columns of df2).
Y = df2.iloc[:,:-4].copy()

# Centre each attribute and take the reduced SVD; the columns of v are the
# principal directions.
B = Y - Y.mean()
u, s, vt = np.linalg.svd(B, full_matrices = False)
v = vt.T

# Fraction of the total variance explained by each component, and its running total.
variance = s**2
sum_var = sum(variance)
var_exp = [vv/sum_var for vv in variance]
cum_var = np.cumsum(var_exp)

# Size the x axis from the data instead of hard-coding the number of components.
n_components = len(var_exp)

plt.figure(figsize=(10,6))
plt.bar(range(n_components),var_exp,label='individual explained variance')
plt.plot(range(n_components),cum_var,'ro-',label='cumulative explained variance')
plt.legend(loc=0)
plt.xlabel("components")
# The plotted values are fractions between 0 and 1, not percentages.
plt.ylabel("fraction of variance")
plt.show()

Add the first two components to the data frame and plot them in a scatter plot.

In [None]:
# clear
# Principal components p_i = sigma_i * u_i for the reduced data set:
# column i of u scaled by the singular value s[i].
df2['pc1'] = u[:,0]*s[0]
df2['pc2'] = u[:,1]*s[1]

In [None]:
#clear

# Players of df2 in the plane of their first two principal components, one
# marker and colour per position.
position_palette = dict(FWD="g", MID="r", DEF="m")
g = sns.lmplot(data=df2, x="pc1", y="pc2", hue="Position", palette=position_palette,
               markers=["o", "^", "s"], fit_reg=False, height=11, aspect=2, legend=True)
ax = g.axes[0,0]
ax.axvline(x=0, color='k', ls='--')
ax.axhline(y=0, color='k', ls='--')

Plot the weights of each attribute corresponding to the principal component 1:

In [None]:
#clear

# Attribute names of the reduced data set, in the column order of Y
labels_new = Y.columns.values

# Bar chart of the first column of v: one bar per remaining attribute.
plt.figure(figsize=(14,6))
plt.bar(labels_new,v[:,0])
# Rotate the attribute names so they do not overlap
plt.xticks(rotation=90);
plt.title('importance of each attribute in ${\\bf p}_1$');

 In the same figure, plot the projections for the attributes in columns [4,9,11,12,14,3,12] onto principal components 1 and 2. 

In [None]:
#clear

# Players of df2 in the (pc1, pc2) plane with selected attribute directions on top.
g = sns.lmplot(x = "pc1", y = "pc2", data = df2, hue = "Position", fit_reg=False, height=11, aspect=2, legend=True,
            markers=["o", "^","s"],palette=dict(FWD="g",  MID="r", DEF="m"))
ax = g.axes[0,0]
ax.axvline(x=0,color='k', ls = '--')
ax.axhline(y=0,color='k', ls = '--')

# Arrow length multiplier. It has its own name so that it does not overwrite
# `s`, the singular values that the cell computing df2['pc1'] / df2['pc2'] reads.
arrow_scale = 300

for J in [4,9,11,12,14,3,12]:

    x = v[J,0]    # weight of attribute J in the first principal direction
    y = v[J,1]    # weight of attribute J in the second principal direction

    # make an arrow from the origin to a point at (x,y)
    ax.arrow(0,0,arrow_scale*x,arrow_scale*y,color='black',width=1)
    ax.text(x*arrow_scale*1.5,y*arrow_scale*1.1,labels_new[J],fontsize=28)

In [None]:
#clear

# Same plot restricted to the rows of df2 whose Position is 'DEF'.
g = sns.lmplot(x = "pc1", y = "pc2", data = df2[df2['Position'] == 'DEF'], hue = "Position", fit_reg=False, height=11, aspect=2, legend=True,
            markers=["s"],palette=dict(DEF="m"))
ax = g.axes[0,0]
ax.axvline(x=0,color='k', ls = '--')
ax.axhline(y=0,color='k', ls = '--')

# Arrow length multiplier. It has its own name so that it does not overwrite
# `s`, the singular values that the cell computing df2['pc1'] / df2['pc2'] reads.
arrow_scale = 300

for J in [4,9,11,12,14,3,12]:

    x = v[J,0]    # weight of attribute J in the first principal direction
    y = v[J,1]    # weight of attribute J in the second principal direction

    # make an arrow from the origin to a point at (x,y)
    ax.arrow(0,0,arrow_scale*x,arrow_scale*y,color='black',width=1)
    ax.text(x*arrow_scale*1.5,y*arrow_scale*1.1,labels_new[J],fontsize=28)

## Example 3: Food consumption in UK (using sklearn)

Example from:
http://setosa.io/ev/principal-component-analysis/

In [None]:
# Load the UK food table with the public pandas reader and display it.
data = pd.read_csv("additional_files/UK_foods.csv")
data

<img src="figures/Picture1.png" width="500"/>

In [None]:
# The unnamed first column holds the row names; keep them as a plain list.
headers = data['Unnamed: 0'].tolist()
print(headers)

# Everything except that first column; the remaining column names are the regions.
new_data = data.drop(columns=['Unnamed: 0'])
new_data.head()  # not the last expression of the cell, so nothing is displayed

regions = list(new_data.columns.values)
print(regions)

In [None]:
# Transpose the table so that each row is a region and each column one of the
# `headers`, then attach the region names as an extra column.
values_by_region = np.transpose(new_data.values)
food = pd.DataFrame(values_by_region, columns=headers)
food['region'] = regions
food

In [None]:
# Standard deviation of every food column. The string-valued 'region' column is
# left out explicitly: recent pandas releases raise when std() meets it, whereas
# older releases skipped it silently.
food[headers].std()

### This is when we want to try PCA!

In [None]:
# Input matrix for PCA: the food columns passed through sklearn's `scale`.
# To run PCA on the unscaled data instead (to match the results from the
# website), use the commented-out line below in place of the scaled one:
#X = pd.DataFrame(food[headers], columns=headers)
X = pd.DataFrame(scale(food[headers]), columns=headers)

In [None]:
# First rows of the PCA input matrix
X.head()

In [None]:
# Fit PCA on X, then project the same rows onto the fitted components.
pca = PCA()
pca.fit(X)
pca_samples = pca.transform(X)

In [None]:
# Bar chart of the explained-variance ratio reported by the fitted PCA object,
# one bar per principal component.
var_exp = pca.explained_variance_ratio_
component_index = np.arange(len(var_exp))
plt.bar(component_index, var_exp, align='center', label='individual explained variance');
plt.ylabel('Explained variance ratio');
plt.xlabel('Principal components');

In [None]:
# pca.components_ wrapped in a DataFrame whose columns carry the food names
components = pd.DataFrame(pca.components_, columns = headers) 

In [None]:
# First figure: row 0 of `components`, one bar per food
plt.figure()
plt.bar(headers,components.values[0])
plt.xticks(rotation=90)
plt.title('influence of original variables(food) upon pc1')
# Second figure: row 1 of `components`, one bar per food
plt.figure()
plt.bar(headers,components.values[1])
plt.xticks(rotation=90)
plt.title('influence of original variables(food) upon pc2')

In [None]:
# Projected samples as a table: one column per principal component, named
# pc1, pc2, ... according to how many components PCA actually returned
# (instead of assuming there are exactly four).
pc_names = ['pc' + str(k + 1) for k in range(pca_samples.shape[1])]
Xstar = pd.DataFrame(pca_samples, columns=pc_names)
Xstar['region'] = regions
Xstar

In [None]:
# One-dimensional view: the pc1 value of each region
sns.stripplot(x="pc1",y="region", data=Xstar, jitter=0.05, linewidth=1)

In [None]:
# Regions in the plane of the first two principal components.
# lmplot creates its own figure, so the separate plt.figure() call (which only
# produced an empty extra figure) is dropped. x, y and data are passed by
# keyword: recent seaborn releases no longer accept them positionally.
g = sns.lmplot(x='pc1', y='pc2', data=Xstar, hue='region', fit_reg=False)
ax = g.axes[0,0]
ax.axis('equal')
ax.set_xlabel('pc1')
ax.set_ylabel('pc2')