# Study of PCA/LDA

What can we learn from PCA and LDA of the dataset before and after feature selection?

In [35]:
from IPython.core.display import display, HTML
# Widen the notebook container to 90% of the page so wide outputs fit.
notebookCSS = "<style>.container { width:90% !important; }</style>"
display(HTML(notebookCSS))

import scipy.io as sio
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

%matplotlib notebook

In [36]:
# For each subject: stack N-sample windows of the raw features, gather the
# n-gram vectors, project both with PCA and LDA, and save every result to
# 'S<subject>_pca.mat'.

N = 5                    # consecutive samples stacked into one feature vector
MAX_LDA_COMPONENTS = 12  # requested LDA dimensionality (capped per subject below)
SHOW_EV_PLOTS = False    # set True to plot explained-variance curves per subject

for subject in range(1,6):
    matOut = {}
    matFile = 'S' + str(subject) + '_pca.mat'

    # load subject data
    emgHD = sio.loadmat('./hw_encode/S' + str(subject) + 'E1.mat')['emgHD']

    # collect all features, ngrams, and labels
    numGest, numTrial = emgHD.shape

    # Blocks are gathered in lists and concatenated once after the loops;
    # growing an array with np.concatenate on every trial copies all of the
    # previously collected data each time.
    featBlocks, featLabelBlocks = [], []
    hvBlocks, hvLabelBlocks = [], []
    for i in range(numGest):
        for j in range(numTrial):
            # Entry [3] is used as (samples x channels); transpose so that
            # samples run along axis 1.
            x = emgHD[i][j][3].T

            # Window count comes from this trial's own length rather than a
            # fixed 80 samples, so trials of any (even differing) length work.
            numWin = x.shape[1] - N + 1
            if numWin < 1:
                raise ValueError(
                    'subject %d, gesture %d, trial %d has %d samples; '
                    'at least %d are needed to stack %d-sample windows'
                    % (subject, i, j, x.shape[1], N, N))

            # Stack N time-shifted copies along the channel axis, so column k
            # holds samples k .. k+N-1 of every channel.
            shifted = [x[:, n:n + numWin] for n in range(N)]
            featBlocks.append(np.concatenate(shifted, axis=0))
            featLabelBlocks.append(np.full(numWin, float(i)))

            # Entry [4] is used as (dimension x examples); one label per column.
            hv = emgHD[i][j][4]
            hvBlocks.append(hv)
            hvLabelBlocks.append(np.full(hv.shape[1], float(i)))

    # Cast to float64 to match the dtype the original float accumulators produced.
    features = np.concatenate(featBlocks, axis=1).astype(np.float64, copy=False)
    featureLabels = np.concatenate(featLabelBlocks)
    ngrams = np.concatenate(hvBlocks, axis=1).astype(np.float64, copy=False)
    ngramLabels = np.concatenate(hvLabelBlocks)

    # arrange data into pandas dataframes (one row per example) for sklearn
    featCols = ['feature' + str(k) for k in range(features.shape[0])]
    featDF = pd.DataFrame(features.T,columns=featCols)
    hvCols = ['hv' + str(k) for k in range(ngrams.shape[0])]
    hvDF = pd.DataFrame(ngrams.T,columns=hvCols)

    # run PCA
    # Keep every available component. PCA cannot return more than
    # min(n_samples, n_features) components, so cap the request there instead
    # of asking for n_features unconditionally.
    featPCA = PCA(n_components=min(featDF.shape))
    featResults = featPCA.fit_transform(featDF.values)
    hvPCA = PCA(n_components=min(hvDF.shape))
    hvResults = hvPCA.fit_transform(hvDF.values)

    matOut['featPCA'] = featResults
    matOut['hvPCA'] = hvResults

    featEV = featPCA.explained_variance_ratio_
    featEVCum = np.cumsum(featEV)
    hvEV = hvPCA.explained_variance_ratio_
    hvEVCum = np.cumsum(hvEV)

    matOut['featPCAEV'] = featEV
    matOut['hvPCAEV'] = hvEV

    if SHOW_EV_PLOTS:
        # x axis is the fraction of components, so both curves share one scale
        featAxis = np.arange(1,len(featEV)+1)/len(featEV)
        hvAxis = np.arange(1,len(hvEV)+1)/len(hvEV)
        plt.figure()
        plt.plot(featAxis,featEV,label='features')
        plt.plot(featAxis,featEVCum,label='features (cumulative)')
        plt.plot(hvAxis,hvEV,label='hypervectors')
        plt.plot(hvAxis,hvEVCum,label='hypervectors (cumulative)')
        plt.xlabel('fraction of components')
        plt.ylabel('explained variance ratio')
        plt.title('Subject ' + str(subject))
        plt.legend()
        plt.show()

    # run LDA
    # LDA yields at most min(n_classes - 1, n_features) components; cap the
    # request so a subject with fewer gestures does not raise.
    featLDA = LDA(n_components=min(MAX_LDA_COMPONENTS, numGest - 1, features.shape[0]))
    featResults = featLDA.fit_transform(featDF.values,featureLabels)

    matOut['feat'] = features.T
    matOut['featLabel'] = featureLabels
    matOut['featLDA'] = featResults

    hvLDA = LDA(n_components=min(MAX_LDA_COMPONENTS, numGest - 1, ngrams.shape[0]))
    hvResults = hvLDA.fit_transform(hvDF.values,ngramLabels)

    matOut['hv'] = ngrams.T
    matOut['hvLabel'] = ngramLabels
    matOut['hvLDA'] = hvResults
    sio.savemat(matFile,matOut)
    
    