### Topic: Clustering the results of the Comprehensive Capital supervisory stress tests

##### Writer: Elaheh Esfandi .

In [None]:
# Data entry: read the four scenario workbooks (scenarios 2 and 3, 2018-2019).
library(readxl)

# Directory holding the workbooks. It is defined once so the notebook can be
# repointed by editing a single line instead of four hard-coded paths.
data_dir <- "C:/Users/SONY/.ipynb_checkpoints"

senario_3_2019 <- read_excel(file.path(data_dir, "senario_3_2019.xlsx"))
senario_2_2019 <- read_excel(file.path(data_dir, "senario_2_2019.xlsx"))
senario_3_2018 <- read_excel(file.path(data_dir, "senario_3_2018.xlsx"))
senario_2_2018 <- read_excel(file.path(data_dir, "senario_2_2018.xlsx"))

## Data standardization
The 2018 data (senario_2_2018 and senario_3_2018) are used as the training set.

In [None]:
# Standardise every column except the first (centre and scale), convert the
# result back to a data frame, and label the rows with disclosure_legal_name.

# Scenario 2, 2018
data.stand.2 <- as.data.frame(scale(senario_2_2018[, -1]))
rownames(data.stand.2) <- senario_2_2018$disclosure_legal_name

# Scenario 3, 2018
data.stand.3 <- as.data.frame(scale(senario_3_2018[, -1]))
rownames(data.stand.3) <- senario_3_2018$disclosure_legal_name

In [None]:
library(dplyr)
library(Hmisc)

In [None]:
# Histograms of the raw scenario-2 (2018) columns, first column dropped.
Hmisc::hist.data.frame(senario_2_2018[, -1])

In [None]:
# Histograms of the raw scenario-3 (2018) columns, first column dropped.
Hmisc::hist.data.frame(senario_3_2018[, -1])

In [None]:
# Histograms of the standardised scenario-2 columns.
Hmisc::hist.data.frame(data.stand.2)

In [None]:
# Histograms of the standardised scenario-3 columns.
Hmisc::hist.data.frame(data.stand.3)

## First model: PCA

With the vegan package, MDS on Euclidean distances can be used in place of PCA, and the result can be drawn as a biplot just as in PCA.

In [None]:
# Scatter-plot matrices of the already-standardised 2018 scenarios, then PCA
# on their first eight columns.
library(psych)

# Map a continuous ratio to one of three fill colours by tercile.
# The ratio cannot be used directly as an index into a three-colour palette:
# R truncates fractional indices and returns NA for anything above 3, so the
# points would be filled arbitrarily or not at all.
#
# Args:
#   ratio: numeric vector (NAs allowed; they map to NA colours).
#   palette: three colours for the low, middle and high tercile.
# Returns:
#   Character vector of colours, one per element of `ratio`.
tercile_colours <- function(ratio, palette = c("red", "yellow", "blue")) {
  if (length(ratio) == 0L || all(is.na(ratio))) {
    return(rep(NA_character_, length(ratio)))
  }
  cuts <- unname(quantile(ratio, probs = c(1, 2) / 3, na.rm = TRUE))
  palette[findInterval(ratio, cuts) + 1L]
}

# NOTE(review): the column name is kept exactly as in the original cell;
# confirm it matches the workbook header, since tibbles do not partial-match.
pairs.panels(data.stand.2,
             gap = 0,
             bg = tercile_colours(senario_2_2018$common_equity_tier1_actual_rat),
             pch = 21)

pairs.panels(data.stand.3,
             gap = 0,
             bg = tercile_colours(senario_3_2018$common_equity_tier1_actual_rat),
             pch = 21)

# PCA for scenario 2. The inputs are already standardised, so center/scale.
# are effectively no-ops here but are kept explicit.
pca.2 <- prcomp(data.stand.2[, 1:8],
                center = TRUE,
                scale. = TRUE)
summary(pca.2)
sum((pca.2$sdev)^2)  # total variance = sum of the component variances

# PCA for scenario 3.
pca.3 <- prcomp(data.stand.3[, 1:8],
                center = TRUE,
                scale. = TRUE)
summary(pca.3)
sum((pca.3$sdev)^2)  # total variance = sum of the component variances



In [None]:
# Deciding how many principal components to retain: scree plot, scenario 2.
stats::screeplot(pca.2, type = "lines")

In [None]:
# Scree plot, scenario 3.
stats::screeplot(pca.3, type = "lines")

In [None]:
# Calculate the values (scores) of one principal component.
#
# Args:
#   variables: data frame or numeric matrix, one row per sample and one
#     column per variable.
#   loadings: numeric loadings; element j is applied to column j. Only the
#     first ncol(variables) elements are used.
#
# Returns:
#   Unnamed numeric vector with one entry per row of `variables`, each the
#   sum over columns of value * loading. Zero rows give numeric(0).
calcpc <- function(variables, loadings) {
  # Coerce once to a matrix so data frames and matrices behave the same;
  # length() would count every cell of a matrix, not its columns.
  values <- as.matrix(variables)
  weights <- as.numeric(loadings)[seq_len(ncol(values))]
  # One matrix product replaces the nested per-sample / per-variable loops.
  as.numeric(values %*% weights)
}
# First principal component of scenario 2, recomputed by hand. calcpc() needs
# the loadings (pca.2$rotation), not the scores (pca.2$x), and only the eight
# columns the PCA was fitted on.
calcpc(data.stand.2[, 1:8], pca.2$rotation[, 1])

In [None]:
# Scenario 2: scores, loadings and a labelled scatter plot of the first two
# principal components.
# Scores of the first three components (one row per observation).
pca.2$x[,1:3]

# Loadings of the first three components (one row per variable).
pca.2$rotation[,1:3]

# The default axis labels are taken from these argument expressions, so they
# are left exactly as written.
plot(pca.2$x[,1],pca.2$x[,2]) # make a scatterplot
text(pca.2$x[,1],pca.2$x[,2], senario_2_2018$disclosure_legal_name, cex=0.7, pos=4, col="blue") # add labels

In [None]:
# Scenario 3: hand-computed first component, scores, loadings and a labelled
# scatter plot.
# calcpc() needs the loadings (pca.3$rotation), not the scores (pca.3$x), and
# only the eight columns the PCA was fitted on.
calcpc(data.stand.3[, 1:8], pca.3$rotation[, 1])

# Scores of the first three components (one row per observation).
pca.3$x[, 1:3]

# Loadings of the first three components (one row per variable).
pca.3$rotation[, 1:3]

# NOTE(review): this plots component 1 against component 3, whereas the
# scenario-2 cell plots 1 against 2; confirm this is intended.
plot(pca.3$x[,1],pca.3$x[,3]) # make a scatterplot
text(pca.3$x[,1],pca.3$x[,3], senario_3_2018$disclosure_legal_name, cex=0.7, pos=4, col="blue") # add labels

In [None]:
# Mixed correlation plot for the standardised scenario-2 variables:
# circles in the upper triangle, coefficients (as percentages) in the lower.
library(corrplot)
cor.temp <- cor(data.stand.2)
corrplot2 <- corrplot.mixed(
  cor.temp,
  lower = "number",
  upper = "circle",
  tl.pos = "lt",
  tl.col = "black",
  tl.cex = 0.8,
  number.cex = 0.8,
  addCoefasPercent = TRUE
)

In [None]:
# Mixed correlation plot for the standardised scenario-3 variables:
# circles in the upper triangle, coefficients (as percentages) in the lower.
cor.temp <- cor(data.stand.3)
corrplot3 <- corrplot.mixed(
  cor.temp,
  lower = "number",
  upper = "circle",
  tl.pos = "lt",
  tl.col = "black",
  tl.cex = 0.8,
  number.cex = 0.8,
  addCoefasPercent = TRUE
)

## Second method: Classical MDS
The input is a table of N rows (objects) by p columns (variables), with each row identified by a unique row name.

In [None]:
# Scenario 2: classical (metric) MDS on the standardised data.
d <- dist(data.stand.2, method = "euclidean")  # pairwise distances between rows
fit1 <- cmdscale(d, k = 9, eig = TRUE)  # 9 dimensions, eigenvalues returned too
fit1  # view results

In [None]:
# Plot the first two classical-MDS coordinates for scenario 2.
library(ggplot2)  # attached here because this cell runs before the later library(ggplot2)

x <- fit1$points[, 1]
y <- fit1$points[, 2]

# Each `+` must end its line. A line that starts with `+` is parsed as a new
# statement, so the layer after it is never added to the plot.
ggplot() +
  geom_point(data = as.data.frame(data.stand.2), mapping = aes(x = x, y = y),
             color = "blue", alpha = 0.5) +
  labs(title = "Classical MDS configuration of institutions (scenario 2, 2018)")

In [None]:
# Scenario 2: nonmetric MDS (Kruskal) on the standardised data.
# Input layout: N rows (objects) x p columns (variables), unique row names.

library(MASS)
l <- dist(data.stand.2, method = "euclidean")  # pairwise distances between rows
fit2 <- isoMDS(d = l, k = 9)  # k is the number of dimensions
fit2  # view results

In [None]:
# Plot the first two nonmetric-MDS coordinates for scenario 2.
library(ggplot2)  # attached here because this cell runs before the later library(ggplot2)

x <- fit2$points[, 1]
y <- fit2$points[, 2]

# Each `+` must end its line. A line that starts with `+` is parsed as a new
# statement, so the layer after it is never added to the plot.
ggplot() +
  geom_point(data = as.data.frame(data.stand.2), mapping = aes(x = x, y = y),
             color = "blue", alpha = 0.5) +
  labs(title = "Nonmetric MDS configuration of institutions (scenario 2, 2018)")

In [None]:
# Scenario 3: classical (metric) MDS on the standardised data.
d3 <- dist(data.stand.3, method = "euclidean")  # pairwise distances between rows
fit3 <- cmdscale(d3, k = 9, eig = TRUE)  # 9 dimensions, eigenvalues returned too
fit3  # view results

In [None]:
# Plot the first two classical-MDS coordinates for scenario 3.
library(ggplot2)  # attached here because this cell runs before the later library(ggplot2)

x <- fit3$points[, 1]
y <- fit3$points[, 2]

# Each `+` must end its line. A line that starts with `+` is parsed as a new
# statement, so the layer after it is never added to the plot.
ggplot() +
  geom_point(data = as.data.frame(data.stand.3), mapping = aes(x = x, y = y),
             color = "blue", alpha = 0.5) +
  labs(title = "Classical MDS configuration of institutions (scenario 3, 2018)")

In [None]:
# Scenario 3: nonmetric MDS (Kruskal) on the standardised data.
# Input layout: N rows (objects) x p columns (variables), unique row names.

library(MASS)
l3 <- dist(data.stand.3, method = "euclidean")  # pairwise distances between rows
fit4 <- isoMDS(d = l3, k = 9)  # k is the number of dimensions
fit4  # view results

In [None]:
# Plot the first two nonmetric-MDS coordinates for scenario 3.
library(ggplot2)  # attached here because this cell runs before the later library(ggplot2)

x <- fit4$points[, 1]
y <- fit4$points[, 2]

# Each `+` must end its line. A line that starts with `+` is parsed as a new
# statement, so the layer after it is never added to the plot.
ggplot() +
  geom_point(data = as.data.frame(data.stand.3), mapping = aes(x = x, y = y),
             color = "blue", alpha = 0.5) +
  labs(title = "Nonmetric MDS configuration of institutions (scenario 3, 2018)")

In [None]:
# Scenario-3 MDS plot with each point labelled by a three-character prefix of
# the first column of senario_3_2018.
library(ggplot2)  # attached here because this cell runs before the later library(ggplot2)

# `[[1]]` extracts the first column as a vector. as.character() on a
# one-column data frame deparses the whole column into a single string.
perf <- substr(x = as.character(senario_3_2018[[1]]), start = 1, stop = 3)

# x and y are the coordinates assigned by the preceding plotting cell; they
# are in the same row order as senario_3_2018.
mds.labelled <- data.frame(x = x, y = y, perf = perf)

# NOTE(review): the original labelled points with data.stand.3[, 1] (a
# numeric column) and never used `perf`; labelling with `perf` is presumed to
# be the intent. Confirm.
ggplot(data = mds.labelled, mapping = aes(x = x, y = y)) +
  geom_point(color = "blue", alpha = 0.5) +
  ggrepel::geom_text_repel(mapping = aes(label = perf))


In [None]:
# Professor's approach: unconstrained ordination with vegan::capscale, plus a
# PCA biplot for comparison.

# library() stops with an error if vegan is missing; require() would only
# return FALSE and let the cell fail later at capscale().
library(vegan)

# Scenario 2: drop rows with missing values, then ordinate.
data.stand.2.a <- na.omit(data.stand.2)
# NOTE(review): the name `mode` shadows base::mode(); it is kept because other
# cells may refer to it.
mode <- capscale(data.stand.2.a ~ 1)
summary(mode)
plot(mode)

mod <- prcomp(data.stand.2.a)
biplot(mod)

# Scenario 3: drop rows with missing values, then ordinate.
data.stand.3.a <- na.omit(data.stand.3)
mo3 <- capscale(data.stand.3.a ~ 1)
summary(mo3)
plot(mo3)

In [None]:
# PCA biplot for scenario 3 (rows with missing values already removed).
mod3 <- stats::prcomp(data.stand.3.a)
stats::biplot(mod3)

## Third method: Canonical correlation

In [None]:
#1 way: professor way 
library("CCA")
library(heplots)
library(ggplot2)
library(GGally)
library(CCP)

In [None]:
# Canonical correlation between the first eight standardised variables of
# scenario 2 (x) and scenario 3 (y).
x <- as.matrix(data.stand.2[, 1:8])
y <- as.matrix(data.stand.3[, 1:8])

# Optional overview of the joint correlation matrix, drawn only when corrplot
# can be loaded.
if (require(corrplot)) {
  M <- cor(cbind(x, y))
  corrplot(M,
           method = "ellipse",
           order = "hclust",
           addrect = 2,
           addcoef.col = "black")
}

cc <- cancor(x, y)
cc



In [None]:
# Tests of the canonical dimensions.
rho <- cancor(x, y)$cor
N <- nrow(x)  # number of observations
p <- ncol(x)  # number of variables in the first set
q <- ncol(y)  # number of variables in the second set

## p-values from the F-approximations of four test statistics:
p.asym(rho, N, p, q, tstat = "Wilks")
p.asym(rho, N, p, q, tstat = "Hotelling")
p.asym(rho, N, p, q, tstat = "Pillai")
p.asym(rho, N, p, q, tstat = "Roy")

## Result with the default statistic, kept for the plt.asym() plots below:
res1 <- p.asym(rho, N, p, q)

In [None]:
# Plot the test stored in res1, starting from canonical correlation 1.
CCP::plt.asym(res1, rhostart = 1)

In [None]:
# Starting from canonical correlation 2.
CCP::plt.asym(res1, rhostart = 2)

In [None]:
# Starting from canonical correlation 3.
CCP::plt.asym(res1, rhostart = 3)

In [None]:
# Starting from canonical correlation 4.
CCP::plt.asym(res1, rhostart = 4)

In [None]:
# Starting from canonical correlation 5.
CCP::plt.asym(res1, rhostart = 5)

In [None]:
# Starting from canonical correlation 6.
CCP::plt.asym(res1, rhostart = 6)

In [None]:
# Standardised canonical coefficients for the scenario-2 variables: the x
# coefficients pre-multiplied by a diagonal matrix of the variables' standard
# deviations.
s1 <- diag(
  sqrt(
    diag(cov(data.stand.2[, 1:8]))
  )
)
s1 %*% cc$xcoef

In [None]:
# Standardised canonical coefficients for the scenario-3 variables: the y
# coefficients pre-multiplied by a diagonal matrix of the variables' standard
# deviations.
s2 <- diag(
  sqrt(
    diag(cov(data.stand.3[, 1:8]))
  )
)
s2 %*% cc$ycoef

In [None]:
# Correlation matrices within and between the two standardised scenarios,
# shown as an image.
correl <- CCA::matcor(X = data.stand.2, Y = data.stand.3)
CCA::img.matcor(correl, type = 2)

In [None]:
## Second way: canonical correlation with cc() from the 'CCA' package.
# The call is namespace-qualified because `cc` is also the name of the
# cancor() result created in an earlier cell.
cc2 <- CCA::cc(X = data.stand.2, Y = data.stand.3)
cc2[["cor"]]  # canonical correlations

In [None]:
# Coefficients for the first data set, from the 'CCA' package fit.
cc2[["xcoef"]]

In [None]:
# Coefficients for the second data set.
cc2[["ycoef"]]

In [None]:
# Plot the variables and the individuals on the canonical variates.
par(mfrow = c(1, 2))

# No object called `data` is created in this notebook, so `data[, 1]` tried
# to subset the base function data() and failed. The row names of the
# standardised data (set from disclosure_legal_name) label the individuals.
plt.cc(cc2, var.label = TRUE, ind.names = rownames(data.stand.2))