Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 78b56b6
Showing
18 changed files
with
1,052 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
Package: COUSCOus | ||
Type: Package | ||
Title: A Residue-Residue Contact Detecting Method | ||
Version: 1.0.0 | ||
Date: 2016-02-17 | ||
Author: Reda Rawi [aut,cre], Matyas A. Sustik [aut], Ben Calderhead [aut] | ||
Maintainer: Reda Rawi <rrawi@qf.org.qa> | ||
Description: Contact prediction using shrinked covariance (COUSCOus). COUSCOus is a residue-residue contact detecting method approaching the contact inference using the glassofast implementation of Matyas and Sustik (2012, The University of Texas at Austin UTCS Technical Report 2012:1-3. TR-12-29.) that solves the L_1 regularised Gaussian maximum likelihood estimation of the inverse of a covariance matrix. Prior to the inverse covariance matrix estimation we utilise a covariance matrix shrinkage approach, the empirical Bayes covariance estimator, which has been shown by Haff (1980) <DOI:10.1214/aos/1176345010> to be the best estimator in a Bayesian framework, especially dominating estimators of the form aS, such as the smoothed covariance estimator applied in a related contact inference technique PSICOV. | ||
Repository: CRAN | ||
License: GPL (>= 3) | ||
Imports: bio3d (>= 2.2-2), matrixcalc (>= 1.0-3), utils (>= 3.2.2) | ||
Depends: R (>= 3.2.2) | ||
NeedsCompilation: yes | ||
Packaged: 2016-02-28 13:00:30 UTC; reda | ||
Date/Publication: 2016-02-28 16:26:52 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
9d859178e2da0036c7a37e6c6cd087b8 *DESCRIPTION | ||
83467b3dd14b1093ec8b0fbd5f030141 *NAMESPACE | ||
e032f7efde9755a16ede37bcc01ec88c *R/COUSCOus.R | ||
b87ec2f0cf15d20daedf954a5258fce1 *R/helper_precision.R | ||
0fc57b47a097d6402fa47a204a4e8f80 *R/helper_prediction.R | ||
c55d70ac61c6c70b74b4e67ca08a37d4 *R/helper_preprocess.R | ||
a903d57ea86a6734d17ac528913c4f36 *R/helper_shrinkage.R | ||
1652cc1d25db76e10a646a2cb765d7e7 *inst/examples/1oaiA0.fa | ||
ddcf71ef6c3ef5b55e9a4d7fdb0d2e52 *man/COUSCOus-internal.Rd | ||
ead3312208b2559bfd5cdccf017e41a5 *man/COUSCOus.Rd | ||
d14c3df549b301bb80d6caf1ccb5bb22 *src/aa2num.c | ||
e9b16a0eab395a18f689de6f1259f615 *src/calculate_pair_site_frequencies.c | ||
0db0659a516d4398256bc5b7e10ac5da *src/calculate_sequence_weights.c | ||
7fc25371ef6225458a8ed74b1edd55b6 *src/calculate_single_site_frequencies.c | ||
aa6df00a6912978180ab8354750bd936 *src/form_covarience_matrix.c | ||
1cc01dbbe27f2eff9a41b62b6d636689 *src/glassofast.f90 | ||
710e5a462164809453e43dfe1af3ca0e *src/guess_rho_matrix.c |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
useDynLib(COUSCOus) | ||
exportPattern("^[[:alpha:]]+") | ||
import("bio3d","matrixcalc","utils") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
#================================================== | ||
#================================================== | ||
#================================================== | ||
# Main Function | ||
|
||
COUSCOus <- function( fasta.file, | ||
verbose = TRUE ) | ||
{ | ||
#================================================== | ||
#================================================== | ||
# Step 1: Load alignment and set required variables | ||
start <- proc.time() | ||
if( verbose ) | ||
{ | ||
print( paste('Step 1: Loading fasta alignment file:', fasta.file ) ) | ||
} | ||
|
||
data.aln <- read.fasta( fasta.file ) | ||
aln.seqs <- data.aln$ali | ||
n.aa <- dim( aln.seqs )[ 2 ] | ||
n <- dim( aln.seqs )[ 1 ] | ||
p <- n.aa | ||
|
||
#================================================== | ||
#================================================== | ||
# Step 2: Preprocess data | ||
start <- proc.time() | ||
if( verbose ) | ||
{ | ||
print( 'Step 2: Preprocess data' ) | ||
} | ||
|
||
list.preprocessing <- preprocessing( aln.seqs ) | ||
S <- list.preprocessing$S | ||
wtsum <- list.preprocessing$wtsum | ||
pa.vec <- list.preprocessing$pa | ||
|
||
#================================================== | ||
#================================================== | ||
# Step 3: Shrink sample covariance matrix S | ||
start <- proc.time() | ||
if( verbose ) | ||
{ | ||
print( 'Step 3: Shrink sample covariance matrix S' ) | ||
} | ||
|
||
### WHY ARE WE MULTIPLYING n and p ALSO BY 21 FOR SHRINKAGE! ### | ||
### TRY BOTH: (i) REGULAR n AND p FROM THE DATA ### | ||
### (ii) n AND p TIMES 21 ### | ||
S.shrinked <- shrink.S( S, | ||
n, | ||
p ) | ||
|
||
#================================================== | ||
#================================================== | ||
# Step 4: Generate (non-negative) regularisation matrix rho (similar to PSICOV) | ||
start <- proc.time() | ||
if( verbose ) | ||
{ | ||
print( 'Step 4: Generate regularisation matrix rho' ) | ||
} | ||
|
||
rho <- generate.rho( wtsum, | ||
pa.vec, | ||
n.aa ) | ||
|
||
#================================================== | ||
#================================================== | ||
# Step 5: Calculate precision matrix | ||
start <- proc.time() | ||
if( verbose ) | ||
{ | ||
print( 'Step 5: Calculate precision matrix' ) | ||
} | ||
|
||
P <- precision( S.shrinked, | ||
rho ) | ||
|
||
#================================================== | ||
# Step 6: Generate prediction data frame | ||
start <- proc.time() | ||
if( verbose ) | ||
{ | ||
print( 'Step 6: Generate prediction data frame' ) | ||
} | ||
|
||
predictions <- prediction( P, | ||
n.aa ) | ||
|
||
#================================================== | ||
# Return predictions | ||
return( predictions ) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#================================================== | ||
#================================================== | ||
# Generate rho matrix | ||
generate.rho <- function( wtsum, | ||
pa.vec, | ||
p, | ||
rhodefault = -1, | ||
maxgapf = 0.9 ) | ||
{ | ||
#================================================== | ||
# Choose rho according to wtsum | ||
if( rhodefault < 0 ) | ||
{ | ||
trialrho <- max( 0.001, 1.0 / wtsum ) | ||
} else | ||
{ | ||
trialrho <- rhodefault | ||
} | ||
|
||
#================================================== | ||
# Chesk if rho is suitable | ||
if( trialrho <= 0 | trialrho >= 1 ) | ||
{ | ||
stop( 'Sorry - failed to find suitable value for rho (0 < rho < 1)!' ) | ||
} | ||
|
||
#================================================== | ||
# Set rho matrix | ||
rho <- numeric( length = p * 21 * p* 21 ) | ||
rho.raw <- .C( 'guess_rho_matrix', | ||
as.double( rho ), | ||
as.double( pa.vec ), | ||
as.double( p ), | ||
as.double( maxgapf ), | ||
as.double( trialrho ) ) | ||
rho.vec <- unlist( rho.raw[ 1 ] ) | ||
rho.mat <- matrix( rho.vec, p * 21, p * 21, byrow = TRUE ) | ||
sum( rho.mat ) | ||
|
||
#================================================== | ||
# Return rho matrix | ||
return( rho.mat ) | ||
} | ||
|
||
|
||
#================================================== | ||
#================================================== | ||
# Run precision matrix calculation | ||
precision <- function( S.shrinked, | ||
rho ) | ||
{ | ||
#================================================== | ||
# Invert sample covariance matrix (shrinked one) | ||
p <- nrow( S.shrinked ) | ||
X <- matrix( 0, p, p ) | ||
W <- matrix( 0, p, p ) | ||
Wd <- rep(0,p) | ||
Wdj <- rep(0,p) | ||
info <- 0 | ||
P <- matrix( .Fortran( 'glassofast', | ||
as.integer( nrow( S.shrinked ) ), | ||
as.double( S.shrinked ), | ||
as.double( rho ), | ||
as.double( 1e-4 ), | ||
as.integer( 1000 ), | ||
as.integer( 0 ), | ||
as.integer( 0 ), | ||
as.double( X ), | ||
as.double( W ), | ||
as.double( Wd ), | ||
as.double( Wdj ), | ||
as.integer( info ) )[[ 8 ]], p, p ) | ||
|
||
#================================================== | ||
# Return precision matrix | ||
return( P ) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#================================================== | ||
#================================================== | ||
# Generate predictions | ||
prediction <- function( P, | ||
n.aa ) | ||
{ | ||
#================================================== | ||
# Calculate contact matrix | ||
P.contact <- P.contact <- matrix( 0, n.aa, n.aa ) | ||
for( i in 1:(n.aa-1) ) | ||
{ | ||
i.start <- ( i - 1 ) * 21 + 1 | ||
i.end <- i * 21 - 1 | ||
for( j in (i+1):n.aa ) | ||
{ | ||
P.contact[ i, j ] <- sum( abs( P[ i.start:i.end, ( ( j - 1 ) * 21 + 1 ):( j * 21 - 1 ) ] ) ) | ||
P.contact[ j, i ] <- P.contact[ i, j ] | ||
} | ||
} | ||
|
||
#================================================== | ||
# Perform APC | ||
if( sum( P.contact != 0 ) == 0 ) | ||
{ | ||
PC <- P.contact | ||
} else | ||
{ | ||
APC <- matrix( NA, n.aa, n.aa ) | ||
mean.P.contact <- sum( P.contact[ upper.tri( P.contact ) ] ) / ( n.aa * ( n.aa - 1 ) * 0.5 ) | ||
for( i in 1:n.aa ) | ||
{ | ||
for( j in 1:n.aa ) | ||
{ | ||
# APC style like in PSICOV: I don't know why they add ( ( n.aa - 1 ) ^ 2 ) in the correction? | ||
APC[ i, j ] <- ( sum( P.contact[ -i, i ] ) * sum( P.contact[ -j, j ] ) ) / ( ( n.aa - 1 ) ^ 2 ) / mean.P.contact | ||
} | ||
} | ||
PC <- P.contact - APC | ||
} | ||
|
||
#================================================== | ||
# Generate dataframe listing all possible contact strengths | ||
predictions.all <- data.frame( cbind( t( combn( n.aa, 2 ) ), PC[ lower.tri( PC ) ] ) ) | ||
colnames( predictions.all ) <- c( 'i', 'j', 'pCorr' ) | ||
predictions.all <- predictions.all[ order( predictions.all$pCorr, decreasing = TRUE ), ] | ||
rownames( predictions.all ) <- 1:nrow( predictions.all ) | ||
|
||
#================================================== | ||
# Return predictions data frame | ||
return( predictions.all ) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
#================================================== | ||
#================================================== | ||
# Preprocessing data | ||
|
||
# !!! CHANGE PATHS !!! | ||
preprocessing <- function( aln.seqs, | ||
pseudoc = 1 ) | ||
{ | ||
#================================================== | ||
# Dimension of the alignment | ||
n <- nrow( aln.seqs ) | ||
p <- ncol( aln.seqs ) | ||
|
||
#================================================== | ||
# Convert AA letters to numeric code | ||
aln.seqs.vec <- as.vector( t( aln.seqs ) ) | ||
aln.num.vec <- numeric( length = length( aln.seqs.vec ) ) | ||
aln.num.raw <- .C( 'aa2num', | ||
as.character( aln.seqs.vec ), | ||
as.double( length( aln.seqs.vec ) ), | ||
as.double( aln.num.vec ) ) | ||
aln.num.vec <- unlist( aln.num.raw[ 3 ] ) | ||
|
||
#================================================== | ||
# Calculate sequence weights | ||
mean.frac.id <- c( 0 ) | ||
id.tresh <- c( 0 ) | ||
wtcount <- numeric( length = n ) | ||
weight <- numeric( length = n ) | ||
wtsum <- c( 0 ) | ||
calc.seq.weights <- .C( 'calculate_sequence_weights', | ||
as.double( aln.num.vec ), | ||
as.double( n ), | ||
as.double( p ), | ||
as.double( mean.frac.id ), | ||
as.double( id.tresh ), | ||
as.double( wtcount ), | ||
as.double( weight ), | ||
as.double( wtsum ) ) | ||
weight <- unlist( calc.seq.weights[ 7 ] ) | ||
wtsum <- unlist( calc.seq.weights[ 8 ] ) | ||
|
||
#================================================== | ||
# Get single site frequencies | ||
pa.vec <- numeric( length = p * 21 ) | ||
pseudocount <- c( 1 ) | ||
pa.freq <- .C( 'calculate_single_site_frequencies', | ||
as.double( pa.vec ), | ||
as.double( aln.num.vec ), | ||
as.double( n ), | ||
as.double( p ), | ||
as.double( pseudocount ), | ||
as.double( weight ), | ||
as.double( wtsum ) ) | ||
pa.vec <- unlist( pa.freq[ 1 ] ) | ||
|
||
#================================================== | ||
# Get pair site frequencies | ||
pab.vec <- numeric( length = p * p * 21 * 21 ) | ||
pab.freq <- .C( 'calculate_pair_site_frequencies', | ||
as.double( pab.vec ), | ||
as.double( pa.vec ), | ||
as.double( aln.num.vec ), | ||
as.double( n ), | ||
as.double( p ), | ||
as.double( pseudocount ), | ||
as.double( weight ), | ||
as.double( wtsum ) ) | ||
pab.vec <- unlist( pab.freq[ 1 ] ) | ||
|
||
#================================================== | ||
# Form the sample covariance matrix S | ||
S.vec <- numeric( length = p * p * 21 * 21 ) | ||
S.raw <- .C( 'form_covarience_matrix', | ||
as.double( S.vec ), | ||
as.double( pab.vec ), | ||
as.double( pa.vec ), | ||
as.double( p ) ) | ||
S.vec <- unlist( S.raw[ 1 ] ) | ||
S.mat <- matrix( S.vec, p * 21, p * 21, byrow = TRUE ) | ||
|
||
#================================================== | ||
# Return matrices S and rho | ||
return( list( S = S.mat, | ||
wtsum = wtsum, | ||
pa = pa.vec ) ) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#================================================== | ||
#================================================== | ||
# Shrink sample covariance matrix S | ||
# (Using a maximum entropy bayes covariance estimator) | ||
|
||
shrink.S <- function( S, | ||
n, | ||
p ) | ||
{ | ||
#================================================== | ||
# Test if we need to do shrinkage | ||
boolean <- tryCatch( chol( S ), error = function( e ){ return( FALSE ) } ) | ||
if( boolean ) | ||
{ | ||
print( 'No need for shrinkage!!!' ) | ||
return( S ) | ||
} | ||
|
||
boolean <- FALSE | ||
S.prime <- S + ( ( p - 1 ) / ( n * matrix.trace( S ) ) ) * diag( nrow( S ) ) | ||
|
||
while( boolean == FALSE ) | ||
{ | ||
boolean <- tryCatch( chol( S.prime ), error = function( e ){ return( FALSE ) } ) | ||
if( length( boolean ) > 1 ) | ||
{ | ||
break | ||
} else | ||
{ | ||
S.prime <- S.prime + ( ( p - 1 ) / ( n * matrix.trace( S.prime ) ) ) * diag( nrow( S.prime ) ) | ||
} | ||
} | ||
|
||
#================================================== | ||
# Return shrinked covariance matrix | ||
return( S.prime ) | ||
} |
Oops, something went wrong.