In [None]:
# Report the directory the session starts in.
getwd()
# Directory that holds the input data files; adjust it if the data live elsewhere.
# "." stands for the current directory. On Windows, write paths with forward slashes (/).
workingDir <- "/home/jovyan/work/phd/datasets/tcga//oversampling_10tissue"
setwd(workingDir)
# Attach the WGCNA package.
library(WGCNA)
# Keep strings as character vectors instead of factors (marked as a required setting).
options(stringsAsFactors = FALSE)

In [None]:
# Read the expression table and transpose it. After the transpose the row names are stored
# as `samples` and the column names as `genes`.
datExpr <- t(read.csv("mainTable_all.csv", check.names = FALSE, row.names = ""))
genes <- colnames(datExpr)
samples <- rownames(datExpr)
# Coerce every column to numeric. as.numeric() discards the names, so the sample names
# are put back as row names afterwards.
datExpr <- as.data.frame(apply(datExpr, MARGIN = 2, FUN = as.numeric))
rownames(datExpr) <- samples
# Per-sample metadata table.
allTraits <- read.csv("files.dat")

In [None]:
# Natural logarithm of (x + 1), computed element-wise.
#
# x: numeric vector (or other numeric object accepted by log1p()).
# Returns an object of the same shape as x.
#
# log1p() is used instead of log(x + 1) so that values very close to zero are not
# rounded away when 1 is added to them.
log1 <- function(x) {
  log1p(x)
}
# Replace every expression value by log1() of itself, one column at a time.
datExpr <- as.data.frame(apply(datExpr, MARGIN = 2, FUN = log1))
# Make sure the sample identifiers are the row names of the transformed table.
rownames(datExpr) <- samples

In [None]:
# Show the top-left corner of the expression table: rows 1-10, columns 1-5.
datExpr[seq_len(10), seq_len(5)]

In [None]:
# Show the first six rows of the metadata table.
head(allTraits, n = 6L)

In [None]:
# Metadata columns that are used as traits.
labels <- c("primary_site", "malignacy", "disease_type", "stage")
# Alternative trait set (disabled):
# labels <- c("primary_site", "secondary_site")
datTraits <- allTraits[, labels]
# Key the trait table by file name, then reorder it so its rows follow the rows of datExpr.
row.names(datTraits) <- allTraits$file_name
datTraits <- datTraits[row.names(datExpr), ]
head(datTraits)

In [None]:
# Sanity check: tabulate how many samples line up between the expression data and the traits.
# datTraits holds only the trait columns (there is no file_name column in it), so the sample
# identifiers of both tables are compared through their row names.
table(rownames(datExpr) == rownames(datTraits))

In [None]:
# Primary-site value of every sample, in the row order of datTraits.
y <- datTraits[["primary_site"]]

In [None]:
# Mean expression of each sample (row), ignoring missing values.
meanExpressionBySample <- apply(datExpr, MARGIN = 1, FUN = mean, na.rm = TRUE)
# Number of missing values in each sample (row).
NumberMissingBySample <- apply(is.na(data.frame(datExpr)), MARGIN = 1, FUN = sum)

## 1

In [None]:
# Run the WGCNA quality check on samples and genes.
gsg <- goodSamplesGenes(datExpr, verbose = 3)
# Overall verdict of the check. Nothing in this cell filters datExpr based on the result.
gsg$allOK

In [None]:
# Average-linkage hierarchical clustering of the samples on their pairwise distances.
sampleTree <- hclust(d = dist(datExpr), method = "average")
# Request a 12 x 9 inch graphics window; change the size if it does not suit the display.
sizeGrWindow(12, 9)
# The dendrogram itself is drawn into an SVG file.
svg("samplehier.svg")
par(cex = 0.6)
par(mar = c(0, 4, 2, 0))
plot(
  sampleTree,
  main = "Sample clustering to detect outliers",
  sub = "",
  xlab = "",
  cex.lab = 1.5,
  cex.axis = 1.5,
  cex.main = 2
)
dev.off()

In [None]:
# Cut the sample dendrogram at a fixed height; only branches with at least 10 samples are
# requested as clusters (minSize).
# NOTE(review): the expression values were log-transformed earlier in this file; confirm that a
# cut height of 3000000 is still on the scale of this dendrogram.
clust <- cutreeStatic(sampleTree, cutHeight = 3000000, minSize = 10)
# Number of samples per cluster label.
table(clust)

In [None]:
# Samples labelled as cluster 1 are retained; every other sample is dropped.
keepSamples <- clust == 1
datExpr <- datExpr[keepSamples, ]
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)

# Refresh the gene and sample name vectors, force all columns to numeric again, and
# restore the sample names that as.numeric() discards.
genes <- colnames(datExpr)
samples <- rownames(datExpr)
datExpr <- as.data.frame(apply(datExpr, MARGIN = 2, FUN = as.numeric))
rownames(datExpr) <- samples

In [None]:
# Show the first six rows of the filtered expression table.
head(datExpr, n = 6L)

In [None]:
# Rebuild the trait table for the samples that survived outlier removal.
# keepSamples is a positional mask over the rows of the expression data, so applying it to
# allTraits is only correct when both tables share the same row order and length. Selecting
# by sample name removes that assumption: each row of datTraits is the metadata record whose
# file_name equals the corresponding row name of datExpr.
rownames(allTraits) <- allTraits$file_name
datTraits <- allTraits[rownames(datExpr), labels]
head(datTraits)

In [None]:
# Call the WGCNA garbage-collection helper before the next steps.
collectGarbage()

In [None]:
# Cluster the retained samples again.
sampleTree2 <- hclust(d = dist(datExpr), method = "average")
# Not referenced anywhere else in this cell.
fontsize <- 0.5
# Turn every trait value into a color label.
traitColors <- labels2colors(datTraits)
# Draw the dendrogram, with one color row per trait underneath, into a PDF.
pdf("samplehiercut.pdf")
plotDendroAndColors(
  sampleTree2,
  traitColors,
  groupLabels = labels,
  rowText = datTraits,
  # NOTE(review): this passes the second trait name as a text value to ignore - confirm intent.
  rowTextIgnore = labels[[2]],
  dendroLabels = FALSE,
  main = "Sample dendrogram and trait heatmap"
)
dev.off()

## 2

In [None]:
# Candidate soft-thresholding powers: 1 through 10, then the even values 12 through 20.
powers <- c(1:10, seq(from = 12, to = 20, by = 2))
# Network topology analysis over the candidate powers.
sft <- pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# Both diagnostic panels are written side by side into one PDF page.
sizeGrWindow(9, 5)
pdf("scaletopology.pdf")
par(mfrow = c(1, 2))
cex1 <- 0.9
# Left panel: scale-free topology fit index against the soft-thresholding power.
# The points are drawn as the power values themselves, so the plot starts empty (type = "n").
plot(
  sft$fitIndices[, 1],
  -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
  type = "n",
  xlab = "Soft Threshold (power)",
  ylab = "Scale Free Topology Model Fit,signed R^2",
  main = "Scale independence"
)
text(
  sft$fitIndices[, 1],
  -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
  labels = powers,
  cex = cex1,
  col = "red"
)
# Reference line at an R^2 cut-off of 0.90.
abline(h = 0.90, col = "red")
# Right panel: mean connectivity against the soft-thresholding power.
plot(
  sft$fitIndices[, 1],
  sft$fitIndices[, 5],
  type = "n",
  xlab = "Soft Threshold (power)",
  ylab = "Mean Connectivity",
  main = "Mean connectivity"
)
text(sft$fitIndices[, 1], sft$fitIndices[, 5], labels = powers, cex = cex1, col = "red")
dev.off()

In [None]:
# Build the network and detect modules in a single call.
# The soft-thresholding power is fixed at 6 here; it is not read from `sft`.
# The topological overlap matrices are saved to disk under the "wgcna" file prefix.
net <- blockwiseModules(
  datExpr,
  power = 6,
  TOMType = "unsigned",
  minModuleSize = 30,
  reassignThreshold = 0,
  mergeCutHeight = 0.15,
  numericLabels = TRUE,
  pamRespectsDendro = FALSE,
  saveTOMs = TRUE,
  saveTOMFileBase = "wgcna",
  verbose = 3
)

In [None]:
# Request a 12 x 9 inch graphics window, then direct the plot into a PDF.
sizeGrWindow(12, 9)
pdf("netdendograms.pdf")
# Module labels expressed as colors.
mergedColors <- labels2colors(net$colors)
# Gene dendrogram of the first block with the module colors of that block's genes underneath.
plotDendroAndColors(
  net$dendrograms[[1]],
  mergedColors[net$blockGenes[[1]]],
  groupLabels = "Module colors",
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05
)
dev.off()

In [None]:
# Keep the parts of the module-detection result that later steps use.
moduleLabels <- net$colors
# The same labels, expressed as colors.
moduleColors <- labels2colors(moduleLabels)
MEs <- net$MEs
# Gene dendrogram of the first block only.
geneTree <- net$dendrograms[[1]]

## 3

In [None]:
# Recode each trait as integer codes, numbered by the first appearance of each distinct value.
# (secondary_site is not encoded; it is not part of the active trait set.)
# NOTE(review): the codes carry no inherent order - confirm that using them as numeric values
# in the correlations further down is intended.
encodedTraits <- c("primary_site", "malignacy", "disease_type", "stage")
datTraits[encodedTraits] <- lapply(
  datTraits[encodedTraits],
  function(values) match(values, unique(values))
)

In [None]:
# Dimensions of the expression data.
nGenes <- ncol(datExpr)
nSamples <- nrow(datExpr)
# Module eigengenes recomputed from the color labels, then put through orderMEs().
MEs0 <- moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs <- orderMEs(MEs0)
# Correlation of every module eigengene with every encoded trait, and its p-value.
moduleTraitCor <- cor(MEs, datTraits, use = "p")
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples)

In [None]:
# Heatmap of the module-trait correlations, written to a 9 x 18 inch PDF.
pdf("moduleTrait.pdf", width = 9, height = 18)
# Cell text: the correlation with its p-value in parentheses on a second line.
textMatrix <- paste0(
  signif(moduleTraitCor, 2), "\n(",
  signif(moduleTraitPvalue, 1), ")"
)
# paste0() returns a flat vector; give it the shape of the correlation matrix again.
dim(textMatrix) <- dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
labeledHeatmap(
  Matrix = moduleTraitCor,
  xLabels = labels,
  yLabels = names(MEs),
  ySymbols = names(MEs),
  colorLabels = FALSE,
  colors = blueWhiteRed(50),
  textMatrix = textMatrix,
  setStdMargins = FALSE,
  cex.text = 0.5,
  zlim = c(-1, 1),
  main = "Module-trait relationships"
)
dev.off()

In [None]:
# `weight` is a one-column data frame holding the encoded stage trait; the column is named "weight".
weight <- as.data.frame(datTraits$stage)
names(weight) <- "weight"
# Module names: the eigengene column names without their first two characters.
modNames <- substring(names(MEs), 3)

# Module membership: correlation of each gene with each module eigengene, plus p-values.
geneModuleMembership <- as.data.frame(cor(datExpr, MEs, use = "p"))
MMPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
names(geneModuleMembership) <- paste0("MM", modNames)
names(MMPvalue) <- paste0("p.MM", modNames)

# Gene significance: correlation of each gene with the trait in `weight`, plus p-values.
geneTraitSignificance <- as.data.frame(cor(datExpr, weight, use = "p"))
GSPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) <- paste0("GS.", names(weight))
names(GSPvalue) <- paste0("p.GS.", names(weight))

In [None]:
# Scatterplot of module membership against gene significance for the genes of one module,
# written to modulemembership.pdf.
module <- "greenyellow"
column <- match(module, modNames)
# Fail early, before a graphics device is opened, when the requested module does not exist.
if (is.na(column)) {
  stop("Module '", module, "' is not among the detected modules.", call. = FALSE)
}
moduleGenes <- moduleColors == module
sizeGrWindow(7, 7)
pdf("modulemembership.pdf")
par(mfrow = c(1, 1))
verboseScatterplot(
  abs(geneModuleMembership[moduleGenes, column]),
  abs(geneTraitSignificance[moduleGenes, 1]),
  xlab = paste("Module Membership in", module, "module"),
  # Gene significance in this file is computed against the stage trait, so the axis says so.
  ylab = "Gene significance for stage",
  main = paste("Module membership vs. gene significance\n"),
  cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module
)
dev.off()

In [None]:
# Assemble a per-gene summary table and write it to geneInfo.csv.
# Columns: module color, gene significance and its p-value, then for every module a
# membership column (MM.<module>) followed by its p-value column (p.MM.<module>).
geneInfo0 <- data.frame(
  moduleColor = moduleColors,
  geneTraitSignificance,
  GSPvalue
)
# Rank the modules by the absolute correlation of their eigengene with the trait in `weight`.
modOrder <- order(-abs(cor(MEs, weight, use = "p")))
# Membership and p-value columns, reordered by that ranking and renamed.
# They are assembled in one step rather than growing the data frame once per module.
nMods <- length(modOrder)
mmOrdered <- geneModuleMembership[, modOrder, drop = FALSE]
pOrdered <- MMPvalue[, modOrder, drop = FALSE]
names(mmOrdered) <- paste0("MM.", modNames[modOrder])
names(pOrdered) <- paste0("p.MM.", modNames[modOrder])
# Interleave the two sets: MM of module 1, p-value of module 1, MM of module 2, ...
interleaved <- as.vector(rbind(seq_len(nMods), nMods + seq_len(nMods)))
geneInfo0 <- data.frame(
  geneInfo0,
  cbind(mmOrdered, pOrdered)[, interleaved, drop = FALSE],
  check.names = FALSE
)
# Sort genes by module color first, then by decreasing absolute gene significance.
geneOrder <- order(geneInfo0$moduleColor, -abs(geneInfo0$GS.weight))
geneInfo <- geneInfo0[geneOrder, ]
write.csv(geneInfo, file = "geneInfo.csv")

In [None]:
# Read back the table stored in geneInfo.csv.
annot <- read.csv(file = "geneInfo.csv")
# The identifiers are the column names of the expression data.
probes <- names(datExpr)
allLLIDs <- probes
# Modules whose gene lists are exported.
intModules <- c("greenyellow", "darkorange", "darkgreen")
for (module in intModules) {
  # Identifiers of the genes assigned to this module.
  modGenes <- moduleColors == module
  modLLIDs <- probes[modGenes]
  # One identifier per line: no header, no row names, no quoting.
  fileName <- paste0("LocusLinkIDs-", module, ".txt")
  write.table(
    as.data.frame(modLLIDs),
    file = fileName,
    quote = FALSE,
    row.names = FALSE,
    col.names = FALSE
  )
}
# Background list for an enrichment analysis: every identifier in the data set.
fileName <- "LocusLinkIDs-all.txt"
write.table(
  as.data.frame(allLLIDs),
  file = fileName,
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

In [None]:
# Annotate the gene identifiers with Ensembl gene and transcript information through biomaRt.
# library() stops right away when biomaRt is not installed, whereas require() would only
# return FALSE and let the next call fail with a less helpful error.
library(biomaRt)
mart <- useMart("ENSEMBL_MART_ENSEMBL")
mart <- useDataset("hsapiens_gene_ensembl", mart)

ens <- allLLIDs
# Strip a trailing ".<digits>" suffix so the lookup uses the bare identifier.
ensLookup <- gsub("\\.[0-9]*$", "", ens)

# Attributes requested from Ensembl; reused below as the column names of the result.
annotAttributes <- c(
  "ensembl_transcript_id",
  "ensembl_gene_id",
  "gene_biotype",
  "external_gene_name"
)

# `filters` is spelled out in full rather than relying on partial argument matching.
annotLookup <- getBM(
  mart = mart,
  attributes = annotAttributes,
  filters = "ensembl_gene_id",
  values = ensLookup,
  uniqueRows = TRUE
)

# Prepend the original identifier that each returned gene ID was derived from.
annotLookup <- data.frame(
  ens[match(annotLookup$ensembl_gene_id, ensLookup)],
  annotLookup
)

colnames(annotLookup) <- c("original_id", annotAttributes)

annotLookup

## 5 

In [None]:
# Cluster the samples on their module eigengenes.
sampleTree3 <- hclust(dist(MEs0), method = "average")
# Turn every trait value into a color label.
traitColors <- labels2colors(datTraits)
# Trait values taken from allTraits for the text under the color rows. They are looked up by
# sample name so that the rows follow datExpr (and therefore the eigengenes computed from it)
# regardless of how allTraits is ordered; a positional mask would only be correct when both
# tables share the same row order.
traitText <- allTraits[match(rownames(datExpr), allTraits$file_name), labels]
# Draw the dendrogram with the trait colors underneath into an SVG file.
svg("nethier.svg", width = 10)
plotDendroAndColors(
  sampleTree3,
  traitColors,
  groupLabels = labels,
  main = "Network dendrogram and trait heatmap",
  dendroLabels = FALSE,
  rowText = traitText,
  rowTextIgnore = labels[[2]]
)
dev.off()

In [None]:
# Cut the eigengene-based sample tree at height 0.4 with a minimum cluster size of 10.
# This reuses, and therefore overwrites, the name `clust` from the outlier-detection step.
clust <- cutreeStatic(sampleTree3, cutHeight = 0.4, minSize = 10)
# Number of samples per cluster label.
table(clust)

In [None]:
# Restrict to the samples labelled as cluster 1 by the cut above and cluster them again.
inMainCluster <- clust == 1
sampleTree4 <- hclust(dist(MEs0[inMainCluster, ]), method = "average")
# Color labels for the traits of those same samples.
traitColors <- labels2colors(datTraits[inMainCluster, ])
# Draw the dendrogram with the trait colors underneath into an SVG file.
svg("nethiercut.svg")
plotDendroAndColors(
  sampleTree4,
  traitColors,
  groupLabels = labels,
  dendroLabels = FALSE,
  main = "Network dendrogram and trait heatmap"
)
dev.off()

In [None]:
# Network heatmap over all genes.
# The topological overlap is recomputed here from the expression data with power 6; reusing
# the TOM saved during module detection would avoid the recomputation.
dissTOM <- 1 - TOMsimilarityFromExpr(datExpr, power = 6)
# Raise the dissimilarity to the 7th power so moderately strong connections stand out more.
plotTOM <- dissTOM^7
# Blank the diagonal for a cleaner plot.
diag(plotTOM) <- NA
sizeGrWindow(9, 9)
# This plot gets its own file name: the selected-genes heatmap is written to "networkmap.svg"
# and would otherwise replace this one.
svg("networkmap_all.svg")
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
dev.off()

In [None]:
# Work on a random 70% subset of the genes to make the heatmap faster to draw.
nSelect <- nGenes * 0.7
# Fixed random seed so the same genes are drawn on every run.
set.seed(10)
select <- sample(nGenes, size = nSelect)
selectTOM <- dissTOM[select, select]
# A clustering tree cannot simply be restricted to a subset of genes, so the subset is
# clustered afresh.
selectTree <- hclust(as.dist(selectTOM), method = "average")
selectColors <- moduleColors[select]
# Request a graphics window, then direct the plot into an SVG file.
sizeGrWindow(9, 9)
svg("networkmap.svg")
# Raising the dissimilarity to the 7th power effectively changes the color palette and makes
# the plot more informative; blanking the diagonal also improves its clarity.
plotDiss <- selectTOM^7
diag(plotDiss) <- NA
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
dev.off()

In [None]:
# Module eigengenes recomputed from the color labels.
MEs <- moduleEigengenes(datExpr, moduleColors)$eigengenes
# The encoded primary_site trait as a one-column data frame. It reuses the name `weight`.
weight <- as.data.frame(datTraits$primary_site)
names(weight) <- "primary_site"
# Eigengenes and the trait side by side, put through orderMEs().
MET <- orderMEs(cbind(MEs, weight))
# Dendrogram and heatmap of the relationships among the eigengenes and the trait.
sizeGrWindow(5, 7.5)
svg("eigenHeat.svg")
par(cex = 0.9)
plotEigengeneNetworks(
  MET,
  "",
  marDendro = c(0, 4, 1, 2),
  marHeatmap = c(3, 4, 1, 2),
  cex.lab = 0.8,
  xLabelsAngle = 90
)
dev.off()

In [None]:
# Save the eigengene dendrogram and the eigengene adjacency heatmap as two separate SVG files.
# Drawing both plots onto one svg() device with a fixed file name leaves only the last plot in
# the file, so each plot gets its own device and file.
sizeGrWindow(6, 6)

# Dendrogram only.
svg("eigendendogram.svg")
par(cex = 1.0)
plotEigengeneNetworks(
  MET,
  "Eigengene dendrogram",
  marDendro = c(0, 4, 2, 0),
  plotHeatmaps = FALSE
)
dev.off()

# Heatmap only.
svg("eigenadjacencyheatmap.svg")
par(cex = 1.0)
plotEigengeneNetworks(
  MET,
  "Eigengene adjacency heatmap",
  marHeatmap = c(3, 4, 2, 2),
  plotDendrograms = FALSE,
  xLabelsAngle = 90
)
dev.off()

In [None]:
# Cut the eigengene-based sample tree into 6 groups and save the group of every sample as CSV.
write.csv(
  cutree(sampleTree3, k = 6),
  file = "wgcna_level_2_labels.csv"
)