Commit 78b56b6 (initial commit): version 1.0.0
rrawi authored and cran-robot committed Feb 28, 2016
Showing 18 changed files with 1,052 additions and 0 deletions.
15 changes: 15 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,15 @@
Package: COUSCOus
Type: Package
Title: A Residue-Residue Contact Detecting Method
Version: 1.0.0
Date: 2016-02-17
Author: Reda Rawi [aut,cre], Matyas A. Sustik [aut], Ben Calderhead [aut]
Maintainer: Reda Rawi <rrawi@qf.org.qa>
Description: Contact prediction using shrinked covariance (COUSCOus). COUSCOus is a residue-residue contact detecting method that approaches contact inference using the glassofast implementation of Sustik and Calderhead (2012, The University of Texas at Austin UTCS Technical Report TR-12-29), which solves the L_1 regularised Gaussian maximum likelihood estimation of the inverse of a covariance matrix. Prior to the inverse covariance matrix estimation we utilise a covariance matrix shrinkage approach, the empirical Bayes covariance estimator, which was shown by Haff (1980) <DOI:10.1214/aos/1176345010> to be the best estimator in a Bayesian framework, in particular dominating estimators of the form aS, such as the smoothed covariance estimator applied in the related contact inference technique PSICOV.
Repository: CRAN
License: GPL (>= 3)
Imports: bio3d (>= 2.2-2), matrixcalc (>= 1.0-3), utils (>= 3.2.2)
Depends: R (>= 3.2.2)
NeedsCompilation: yes
Packaged: 2016-02-28 13:00:30 UTC; reda
Date/Publication: 2016-02-28 16:26:52
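
For orientation, a minimal usage sketch of the exported entry point COUSCOus() (defined in R/COUSCOus.R below), assuming the package is installed so that the bundled alignment inst/examples/1oaiA0.fa resolves via system.file():

library( COUSCOus )
# Locate the example alignment shipped with the package
fasta.file <- system.file( 'examples', '1oaiA0.fa', package = 'COUSCOus' )
# Run the full pipeline; returns a data frame with columns i, j and pCorr,
# ordered by decreasing predicted contact strength
predictions <- COUSCOus( fasta.file )
head( predictions )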
17 changes: 17 additions & 0 deletions MD5
@@ -0,0 +1,17 @@
9d859178e2da0036c7a37e6c6cd087b8 *DESCRIPTION
83467b3dd14b1093ec8b0fbd5f030141 *NAMESPACE
e032f7efde9755a16ede37bcc01ec88c *R/COUSCOus.R
b87ec2f0cf15d20daedf954a5258fce1 *R/helper_precision.R
0fc57b47a097d6402fa47a204a4e8f80 *R/helper_prediction.R
c55d70ac61c6c70b74b4e67ca08a37d4 *R/helper_preprocess.R
a903d57ea86a6734d17ac528913c4f36 *R/helper_shrinkage.R
1652cc1d25db76e10a646a2cb765d7e7 *inst/examples/1oaiA0.fa
ddcf71ef6c3ef5b55e9a4d7fdb0d2e52 *man/COUSCOus-internal.Rd
ead3312208b2559bfd5cdccf017e41a5 *man/COUSCOus.Rd
d14c3df549b301bb80d6caf1ccb5bb22 *src/aa2num.c
e9b16a0eab395a18f689de6f1259f615 *src/calculate_pair_site_frequencies.c
0db0659a516d4398256bc5b7e10ac5da *src/calculate_sequence_weights.c
7fc25371ef6225458a8ed74b1edd55b6 *src/calculate_single_site_frequencies.c
aa6df00a6912978180ab8354750bd936 *src/form_covarience_matrix.c
1cc01dbbe27f2eff9a41b62b6d636689 *src/glassofast.f90
710e5a462164809453e43dfe1af3ca0e *src/guess_rho_matrix.c
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -0,0 +1,3 @@
useDynLib(COUSCOus)
exportPattern("^[[:alpha:]]+")
import("bio3d","matrixcalc","utils")
93 changes: 93 additions & 0 deletions R/COUSCOus.R
@@ -0,0 +1,93 @@
#==================================================
#==================================================
#==================================================
# Main Function

COUSCOus <- function( fasta.file,
                      verbose = TRUE )
{
  #==================================================
  #==================================================
  # Step 1: Load alignment and set required variables
  start <- proc.time()
  if( verbose )
  {
    print( paste( 'Step 1: Loading fasta alignment file:', fasta.file ) )
  }

  data.aln <- read.fasta( fasta.file )
  aln.seqs <- data.aln$ali
  n.aa <- dim( aln.seqs )[ 2 ]
  n <- dim( aln.seqs )[ 1 ]
  p <- n.aa

  #==================================================
  #==================================================
  # Step 2: Preprocess data
  start <- proc.time()
  if( verbose )
  {
    print( 'Step 2: Preprocess data' )
  }

  list.preprocessing <- preprocessing( aln.seqs )
  S <- list.preprocessing$S
  wtsum <- list.preprocessing$wtsum
  pa.vec <- list.preprocessing$pa

  #==================================================
  #==================================================
  # Step 3: Shrink sample covariance matrix S
  start <- proc.time()
  if( verbose )
  {
    print( 'Step 3: Shrink sample covariance matrix S' )
  }

  ### WHY ARE WE MULTIPLYING n and p ALSO BY 21 FOR SHRINKAGE! ###
  ### TRY BOTH: (i)  REGULAR n AND p FROM THE DATA             ###
  ###           (ii) n AND p TIMES 21                          ###
  S.shrinked <- shrink.S( S,
                          n,
                          p )

  #==================================================
  #==================================================
  # Step 4: Generate (non-negative) regularisation matrix rho (similar to PSICOV)
  start <- proc.time()
  if( verbose )
  {
    print( 'Step 4: Generate regularisation matrix rho' )
  }

  rho <- generate.rho( wtsum,
                       pa.vec,
                       n.aa )

  #==================================================
  #==================================================
  # Step 5: Calculate precision matrix
  start <- proc.time()
  if( verbose )
  {
    print( 'Step 5: Calculate precision matrix' )
  }

  P <- precision( S.shrinked,
                  rho )

  #==================================================
  # Step 6: Generate prediction data frame
  start <- proc.time()
  if( verbose )
  {
    print( 'Step 6: Generate prediction data frame' )
  }

  predictions <- prediction( P,
                             n.aa )

  #==================================================
  # Return predictions
  return( predictions )
}
77 changes: 77 additions & 0 deletions R/helper_precision.R
@@ -0,0 +1,77 @@
#==================================================
#==================================================
# Generate rho matrix
generate.rho <- function( wtsum,
                          pa.vec,
                          p,
                          rhodefault = -1,
                          maxgapf = 0.9 )
{
  #==================================================
  # Choose rho according to wtsum
  if( rhodefault < 0 )
  {
    trialrho <- max( 0.001, 1.0 / wtsum )
  } else
  {
    trialrho <- rhodefault
  }

  #==================================================
  # Check if rho is suitable
  if( trialrho <= 0 | trialrho >= 1 )
  {
    stop( 'Sorry - failed to find suitable value for rho (0 < rho < 1)!' )
  }

  #==================================================
  # Set rho matrix
  rho <- numeric( length = p * 21 * p * 21 )
  rho.raw <- .C( 'guess_rho_matrix',
                 as.double( rho ),
                 as.double( pa.vec ),
                 as.double( p ),
                 as.double( maxgapf ),
                 as.double( trialrho ) )
  rho.vec <- unlist( rho.raw[ 1 ] )
  rho.mat <- matrix( rho.vec, p * 21, p * 21, byrow = TRUE )

  #==================================================
  # Return rho matrix
  return( rho.mat )
}
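
A quick worked example of the default rho choice (hypothetical wtsum, not from the package): with an effective sequence weight sum of 500, trialrho = max( 0.001, 1.0 / 500 ) = 0.002, which passes the (0, 1) range check above.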


#==================================================
#==================================================
# Run precision matrix calculation
precision <- function( S.shrinked,
                       rho )
{
  #==================================================
  # Invert the shrunken sample covariance matrix
  p <- nrow( S.shrinked )
  X <- matrix( 0, p, p )
  W <- matrix( 0, p, p )
  Wd <- rep( 0, p )
  Wdj <- rep( 0, p )
  info <- 0
  P <- matrix( .Fortran( 'glassofast',
                         as.integer( nrow( S.shrinked ) ),
                         as.double( S.shrinked ),
                         as.double( rho ),
                         as.double( 1e-4 ),
                         as.integer( 1000 ),
                         as.integer( 0 ),
                         as.integer( 0 ),
                         as.double( X ),
                         as.double( W ),
                         as.double( Wd ),
                         as.double( Wdj ),
                         as.integer( info ) )[[ 8 ]], p, p )

  #==================================================
  # Return precision matrix
  return( P )
}
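
For reference, the call above asks glassofast for the standard graphical lasso estimate; in the notation used here (S the shrunken covariance, rho the penalty matrix), this is the textbook objective as stated in the DESCRIPTION, not something read out of glassofast.f90:

\hat{P} = \operatorname*{arg\,max}_{\Theta \succ 0} \left[ \log \det \Theta - \operatorname{tr}( S \Theta ) - \sum_{i,j} \rho_{ij} \, \lvert \Theta_{ij} \rvert \right]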
51 changes: 51 additions & 0 deletions R/helper_prediction.R
@@ -0,0 +1,51 @@
#==================================================
#==================================================
# Generate predictions
prediction <- function( P,
                        n.aa )
{
  #==================================================
  # Calculate contact matrix
  P.contact <- matrix( 0, n.aa, n.aa )
  for( i in 1:( n.aa - 1 ) )
  {
    i.start <- ( i - 1 ) * 21 + 1
    i.end <- i * 21 - 1
    for( j in ( i + 1 ):n.aa )
    {
      P.contact[ i, j ] <- sum( abs( P[ i.start:i.end, ( ( j - 1 ) * 21 + 1 ):( j * 21 - 1 ) ] ) )
      P.contact[ j, i ] <- P.contact[ i, j ]
    }
  }

  #==================================================
  # Perform APC
  if( sum( P.contact != 0 ) == 0 )
  {
    PC <- P.contact
  } else
  {
    APC <- matrix( NA, n.aa, n.aa )
    mean.P.contact <- sum( P.contact[ upper.tri( P.contact ) ] ) / ( n.aa * ( n.aa - 1 ) * 0.5 )
    for( i in 1:n.aa )
    {
      for( j in 1:n.aa )
      {
        # APC style as in PSICOV: it is unclear why ( ( n.aa - 1 ) ^ 2 ) appears in the correction
        APC[ i, j ] <- ( sum( P.contact[ -i, i ] ) * sum( P.contact[ -j, j ] ) ) / ( ( n.aa - 1 ) ^ 2 ) / mean.P.contact
      }
    }
    PC <- P.contact - APC
  }

  #==================================================
  # Generate data frame listing all possible contact strengths
  predictions.all <- data.frame( cbind( t( combn( n.aa, 2 ) ), PC[ lower.tri( PC ) ] ) )
  colnames( predictions.all ) <- c( 'i', 'j', 'pCorr' )
  predictions.all <- predictions.all[ order( predictions.all$pCorr, decreasing = TRUE ), ]
  rownames( predictions.all ) <- 1:nrow( predictions.all )

  #==================================================
  # Return predictions data frame
  return( predictions.all )
}
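
The APC double loop admits a vectorised equivalent; a sketch that relies on the diagonal of P.contact staying zero (as initialised above), so that sum( P.contact[ -i, i ] ) equals colSums( P.contact )[ i ]:

# Vectorised average product correction, equivalent to the nested loop
cs <- colSums( P.contact )
APC <- outer( cs, cs ) / ( n.aa - 1 ) ^ 2 / mean.P.contact
PC <- P.contact - APC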
87 changes: 87 additions & 0 deletions R/helper_preprocess.R
@@ -0,0 +1,87 @@
#==================================================
#==================================================
# Preprocessing data

# !!! CHANGE PATHS !!!
preprocessing <- function( aln.seqs,
                           pseudoc = 1 )
{
  #==================================================
  # Dimension of the alignment
  n <- nrow( aln.seqs )
  p <- ncol( aln.seqs )

  #==================================================
  # Convert AA letters to numeric code
  aln.seqs.vec <- as.vector( t( aln.seqs ) )
  aln.num.vec <- numeric( length = length( aln.seqs.vec ) )
  aln.num.raw <- .C( 'aa2num',
                     as.character( aln.seqs.vec ),
                     as.double( length( aln.seqs.vec ) ),
                     as.double( aln.num.vec ) )
  aln.num.vec <- unlist( aln.num.raw[ 3 ] )

  #==================================================
  # Calculate sequence weights
  mean.frac.id <- c( 0 )
  id.tresh <- c( 0 )
  wtcount <- numeric( length = n )
  weight <- numeric( length = n )
  wtsum <- c( 0 )
  calc.seq.weights <- .C( 'calculate_sequence_weights',
                          as.double( aln.num.vec ),
                          as.double( n ),
                          as.double( p ),
                          as.double( mean.frac.id ),
                          as.double( id.tresh ),
                          as.double( wtcount ),
                          as.double( weight ),
                          as.double( wtsum ) )
  weight <- unlist( calc.seq.weights[ 7 ] )
  wtsum <- unlist( calc.seq.weights[ 8 ] )

  #==================================================
  # Get single site frequencies
  pa.vec <- numeric( length = p * 21 )
  pseudocount <- c( pseudoc )
  pa.freq <- .C( 'calculate_single_site_frequencies',
                 as.double( pa.vec ),
                 as.double( aln.num.vec ),
                 as.double( n ),
                 as.double( p ),
                 as.double( pseudocount ),
                 as.double( weight ),
                 as.double( wtsum ) )
  pa.vec <- unlist( pa.freq[ 1 ] )

  #==================================================
  # Get pair site frequencies
  pab.vec <- numeric( length = p * p * 21 * 21 )
  pab.freq <- .C( 'calculate_pair_site_frequencies',
                  as.double( pab.vec ),
                  as.double( pa.vec ),
                  as.double( aln.num.vec ),
                  as.double( n ),
                  as.double( p ),
                  as.double( pseudocount ),
                  as.double( weight ),
                  as.double( wtsum ) )
  pab.vec <- unlist( pab.freq[ 1 ] )

  #==================================================
  # Form the sample covariance matrix S
  S.vec <- numeric( length = p * p * 21 * 21 )
  S.raw <- .C( 'form_covarience_matrix',
               as.double( S.vec ),
               as.double( pab.vec ),
               as.double( pa.vec ),
               as.double( p ) )
  S.vec <- unlist( S.raw[ 1 ] )
  S.mat <- matrix( S.vec, p * 21, p * 21, byrow = TRUE )

  #==================================================
  # Return S, the total sequence weight and the single site frequencies
  return( list( S = S.mat,
                wtsum = wtsum,
                pa = pa.vec ) )
}
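
Throughout, the factor 21 is the per-column alphabet size, presumably the 20 amino acids plus the gap state (an assumption based on the PSICOV lineage, not verified against aa2num.c). Toy dimension bookkeeping under that assumption:

# Hypothetical alignment with p = 100 columns and 21 states per column
p <- 100
p * 21                 # 2100: length of pa.vec
c( p * 21, p * 21 )    # 2100 x 2100: dimensions of S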
37 changes: 37 additions & 0 deletions R/helper_shrinkage.R
@@ -0,0 +1,37 @@
#==================================================
#==================================================
# Shrink sample covariance matrix S
# (using the empirical Bayes covariance estimator of Haff, 1980)

shrink.S <- function( S,
                      n,
                      p )
{
  #==================================================
  # Test if we need to do shrinkage at all
  boolean <- tryCatch( chol( S ), error = function( e ){ return( FALSE ) } )
  if( is.matrix( boolean ) )
  {
    print( 'No need for shrinkage!!!' )
    return( S )
  }

  boolean <- FALSE
  S.prime <- S + ( ( p - 1 ) / ( n * matrix.trace( S ) ) ) * diag( nrow( S ) )

  while( boolean == FALSE )
  {
    boolean <- tryCatch( chol( S.prime ), error = function( e ){ return( FALSE ) } )
    if( length( boolean ) > 1 )
    {
      break
    } else
    {
      S.prime <- S.prime + ( ( p - 1 ) / ( n * matrix.trace( S.prime ) ) ) * diag( nrow( S.prime ) )
    }
  }

  #==================================================
  # Return shrunken covariance matrix
  return( S.prime )
}
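
A tiny self-contained check of the loop above, on a deliberately rank-deficient matrix (toy numbers, not from the package):

library( matrixcalc )                        # provides matrix.trace(), imported by the package
S.test <- matrix( 1, 4, 4 )                  # rank one, so chol() fails
S.pd <- shrink.S( S.test, n = 10, p = 4 )
# A single step adds ( ( p - 1 ) / ( n * trace ) ) * I = ( 3 / 40 ) * I,
# which already makes the matrix positive definite
chol( S.pd )                                 # now succeeds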
