In [None]:
# In this notebook we compute the variance explained
# by the linear sparse PCA and the SDP sparse PCA
# on various datasets.

In [None]:
# To compute the variance explained, we follow the paper
# https://www.sciencedirect.com/science/article/pii/S0169743919303636

<h1>Functions and evaluation of explained variance by the SDP for sparse PCA and its linear relaxation</h1>

<h1>Table of contents</h1>   
    0. Load packages, load sparsePCArelaxations notebook, Functions.<br>
    1. Code for figure 5 <br>
    

<h2>0. Load packages, load sparsePCArelaxations notebook, Functions.</h2>

In [1]:
using RDatasets, NBInclude, MultivariateStats, StatsBase
using Statistics,  Plots, DataFrames

In [2]:
@nbinclude("sparsePCArelaxations.ipynb")


EigenRelSparsePCA (generic function with 1 method)

<h3>Functions</h3>

In [3]:
# Compute the fraction of variance explained by sparse principal components obtained
# from the semidefinite relaxation (SdpSparsePCA), following
# https://www.sciencedirect.com/science/article/pii/S0169743919303636
#@param vector k: k[i] is the target sparsity of component i; length(k) is the number of components.
#@param matrix data: the data set; it is used in matrix products (data*PCs), so a matrix is expected
#       (observations presumably in rows — the callers in this notebook pass a column-centered matrix).
#@param bool normalize: if true, the scatter matrix is divided by (n-1), n being the number of rows
#       of data; otherwise the raw scatter matrix is used.
#@returns tr(P*T'*T*P') / tr(data*data'), where P holds the loadings rounded to 4 digits
#         and T = data*P*pinv(P'*P).
function computeVarianceSDP(k,data,normalize::Bool)
    numRows = size(data, 1)
    scatter = scattermat(data)
    covMatrix = normalize ? (1/(numRows-1)) * scatter : scatter
    lastIdx = size(covMatrix, 1)

    # Subtract from C the rank-one term associated with direction v.
    deflate(C, v) = C - (transpose(v)*C*v)*v*transpose(v)

    # First component: last eigenvector column of the relaxation's solution
    # (eigen sorts eigenvalues in ascending order for symmetric input —
    # presumably the solution is symmetric; confirm against SdpSparsePCA).
    loading = eigen(SdpSparsePCA(covMatrix, k[1])[1]).vectors[:, lastIdx]
    loadings = loading
    covMatrix = deflate(covMatrix, loading)

    # Remaining components are computed on the successively deflated matrix.
    for idx in 2:length(k)
        loading = eigen(SdpSparsePCA(covMatrix, k[idx])[1]).vectors[:, lastIdx]
        loadings = hcat(loadings, loading)
        covMatrix = deflate(covMatrix, loading)
    end

    # Rounding turns near-zero entries of the loadings into exact zeros.
    loadings = round.(loadings, digits=4)
    scores = data*loadings*pinv(transpose(loadings)*loadings)

    return tr(loadings*transpose(scores)*scores*transpose(loadings))/tr(data*transpose(data))
end

computeVarianceSDP (generic function with 1 method)

In [4]:
# Compute the fraction of variance explained by sparse principal components obtained
# from the linear relaxation (EigenRelSparsePCA), following
# https://www.sciencedirect.com/science/article/pii/S0169743919303636
#@param vector k: k[i] is the target sparsity of component i; length(k) is the number of components.
#@param matrix data: the data set; it is used in matrix products (data*PCs), so a matrix is expected
#       (observations presumably in rows — the callers in this notebook pass a column-centered matrix).
#@param bool normalize: if true, the scatter matrix is divided by (n-1), n being the number of rows
#       of data; otherwise the raw scatter matrix is used.
#@returns tr(P*T'*T*P') / tr(data*data'), where P holds the loadings rounded to 4 digits
#         and T = data*P*pinv(P'*P).
function computeVarianceLP(k,data,normalize::Bool)
    numRows = size(data, 1)
    scatter = scattermat(data)
    covMatrix = normalize ? (1/(numRows-1)) * scatter : scatter
    lastIdx = size(covMatrix, 1)

    # Subtract from C the rank-one term associated with direction v.
    deflate(C, v) = C - (transpose(v)*C*v)*v*transpose(v)

    # First component: last eigenvector column of the relaxation's solution
    # (eigen sorts eigenvalues in ascending order for symmetric input —
    # presumably the solution is symmetric; confirm against EigenRelSparsePCA).
    loading = eigen(EigenRelSparsePCA(covMatrix, k[1])[1]).vectors[:, lastIdx]
    loadings = loading
    covMatrix = deflate(covMatrix, loading)

    # Remaining components are computed on the successively deflated matrix.
    for idx in 2:length(k)
        loading = eigen(EigenRelSparsePCA(covMatrix, k[idx])[1]).vectors[:, lastIdx]
        loadings = hcat(loadings, loading)
        covMatrix = deflate(covMatrix, loading)
    end

    # Rounding turns near-zero entries of the loadings into exact zeros.
    loadings = round.(loadings, digits=4)
    scores = data*loadings*pinv(transpose(loadings)*loadings)

    return tr(loadings*transpose(scores)*scores*transpose(loadings))/tr(data*transpose(data))
end

computeVarianceLP (generic function with 1 method)

In [5]:
# Function to curate the RDatasets tables we will use, based on how many columns
# with element type Float64 each table has.
#@param int min_number_covariates: exclusive lower bound; a table is kept only if it has
#       strictly more Float64 columns than this number.
#@param int max_number_covariates: inclusive upper bound on the number of Float64 columns.
#@returns the RDatasets listing restricted to the kept tables, with an extra column
#         num_float_vars holding the number of Float64 columns of each table.
function processDataset(min_number_covariates,max_number_covariates)
    catalog = RDatasets.datasets()
    numTables = size(catalog, 1)
    catalog[!, :num_float_vars] = zeros(numTables)

    # Load every table once and record how many of its columns are Float64.
    # Columns 1 and 2 of the listing are the two arguments expected by `dataset`.
    for row in 1:numTables
        table = dataset(catalog[row, 1], catalog[row, 2])
        catalog[row, :num_float_vars] = count(col -> eltype(col) <: Float64, eachcol(table))
    end

    counts = catalog.num_float_vars
    keep = (counts .> min_number_covariates) .& (counts .<= max_number_covariates)
    return catalog[keep, :]
end



processDataset (generic function with 1 method)

In [6]:
# Function to give the variance explained by the LP relaxation, the SDP relaxation
# and ordinary PCA on one RDatasets table, using the same sparsity pattern for both
# relaxations.
#@param string datasetName: first argument passed to RDatasets' `dataset`.
#@param string tableName: second argument passed to RDatasets' `dataset`.
#@param vector sparcityPatern: the i-th entry is the target sparsity of the i-th component;
#       its length is the number of components (also used as maxoutdim for the PCA).
#@param bool normalize: forwarded to computeVarianceLP and computeVarianceSDP.
#@returns the tuple (variance explained by the LP, variance explained by the SDP,
#         principalratio of the ordinary PCA).
function give_variances(datasetName::String,tableName::String,sparcityPatern,normalize::Bool)
    num_Pcs = length(sparcityPatern)

    # Keep only the Float64 columns and work with a plain matrix.
    dat = dataset(datasetName, tableName)
    dat = select(dat, findall(col -> eltype(col) <: Float64, eachcol(dat)))
    dat = Matrix(dat)

    # Center every column (subtract its mean).
    dat = mapslices(x -> x .- mean(x), dat, dims=1)

    # Careful: for the PCA, observations must be in columns, while for our methods
    # observations are in rows, hence the transpose. The fit function also centers
    # its input by default.
    dat_for_normal_PCA = transpose(dat)
    totalVarExplainedPCA = principalratio(fit(PCA, dat_for_normal_PCA, maxoutdim=num_Pcs))

    varianceLP = computeVarianceLP(sparcityPatern, dat, normalize)
    varianceSDP = computeVarianceSDP(sparcityPatern, dat, normalize)
    return (varianceLP, varianceSDP, totalVarExplainedPCA)
end


# Function to return the best variance explained found by sweeping the sparsity target.
#@param string datasetName: first argument passed to RDatasets' `dataset`.
#@param string tableName: second argument passed to RDatasets' `dataset`.
#@param bool normalize: forwarded to computeVarianceLP and computeVarianceSDP.
#@returns the tuple (best LP value, best SDP value, principalratio of a PCA with maxoutdim=1),
#         where "best" is the maximum over targets 1 .. floor(numVars/2)+1 (and at least 0).
function findBestKoneComponent(datasetName::String,tableName::String,normalize::Bool)
    # Keep only the Float64 columns and center each of them.
    table = dataset(datasetName, tableName)
    floatCols = findall(col -> eltype(col) <: Float64, eachcol(table))
    X = Matrix(select(table, floatCols))
    X = mapslices(col -> col .- mean(col), X, dims=1)

    # Ordinary PCA baseline; the PCA fit receives the transposed matrix.
    pcaModel = fit(PCA, transpose(X), maxoutdim=1)
    totalVarExplainedPCA = principalratio(pcaModel)

    maxTarget = div(size(X, 2), 2) + 1
    bestLP = 0
    bestSDP = 0

    # NOTE(review): both relaxations are asked for four components with the same target,
    # while the PCA baseline above uses a single component — confirm this is intended.
    for target in 1:maxTarget
        pattern = fill(target, 4)
        bestLP = max(bestLP, computeVarianceLP(pattern, X, normalize))
        bestSDP = max(bestSDP, computeVarianceSDP(pattern, X, normalize))
    end

    return (bestLP, bestSDP, totalVarExplainedPCA)
end



# Function to compute, for every table listed in dataSet, the best variance explained by
# the LP sparse PCA, the SDP sparse PCA and the ordinary PCA (via findBestKoneComponent,
# always called with normalize = false).
#@param dataSet: table whose first two columns are the two arguments expected by
#       findBestKoneComponent. We assume dataSet was selected using the processDataset function.
#@returns a matrix with one row per table and columns (LP, SDP, PCA).
function explained_variances(dataSet::Any)
    numTables, _ = size(dataSet)
    results = zeros(numTables, 3)

    for row in 1:numTables
        lp, sdp, pca = findBestKoneComponent(String(dataSet[row,1]), String(dataSet[row,2]), false)
        results[row, 1] = lp
        results[row, 2] = sdp
        results[row, 3] = pca
    end

    return results
end




explained_variances (generic function with 1 method)

<h2>1. Code for figure 5</h2>

In [7]:
# First select the datasets to use: processDataset(8,20) keeps the tables with more
# than 8 and at most 20 Float64 columns.
datasets_to_use = processDataset(8,20)


└ @ RData C:\Users\danis\.julia\packages\RData\OT7M6\src\convert.jl:118


Unnamed: 0_level_0,Package,Dataset,Title
Unnamed: 0_level_1,String15,String31,String
1,Ecdat,Forward,Exchange Rates of US Dollar Against Other Currencies
2,Ecdat,Klein,Klein's Model I
3,Ecdat,MedExp,Structure of Demand for Medical Care
4,HSAUR,pottery,Romano-British Pottery Data
5,MASS,fgl,Measurements of Forensic Glass Fragments
6,mlmRev,bdf,Language Scores of 8-Graders in The Netherlands
7,psych,Holzinger.9,Seven data sets showing a bifactor solution.
8,psych,Thurstone.33,Seven data sets showing a bifactor solution.
9,psych,Thurstone,Seven data sets showing a bifactor solution.
10,psych,Tucker,9 Cognitive variables discussed by Tucker and Lewis (1973)


In [None]:
# Compute the variances explained in each dataset.
# Each row of variances_results holds (best LP, best SDP, PCA) for one selected table.
variances_results = explained_variances(datasets_to_use)

In [None]:
# Take the name of each dataset (the Dataset column) to use as x-axis tick labels in the plot.
labelsx = datasets_to_use.Dataset

In [None]:
# Plot the results: for every dataset, the percentage gap between the SDP (column 2) and
# LP (column 1) explained variances, relative to the SDP value.
# The number of points is taken from variances_results instead of being hard-coded,
# so the cell works for any number of selected datasets.
numDatasets = size(variances_results, 1)
lpVariance = variances_results[:, 1]
sdpVariance = variances_results[:, 2]
variancesDifferencesnopca = 100 .* (sdpVariance .- lpVariance) ./ sdpVariance
plot(scatter(variancesDifferencesnopca,xticks=(1:numDatasets,labelsx),xrotation = 90,labels="Percentual_error",markershape = :diamond,markercolor = :black,legend=:topleft))
