version 1.0.0

cran · Feb 28, 2016 · 78b56b6 · 78b56b6
commit 78b56b6
Show file tree

Hide file tree

Showing 18 changed files with 1,052 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,15 @@
+Package: COUSCOus
+Type: Package
+Title: A Residue-Residue Contact Detecting Method
+Version: 1.0.0
+Date: 2016-02-17
+Author: Reda Rawi [aut,cre], Matyas A. Sustik [aut], Ben Calderhead [aut]
+Maintainer: Reda Rawi <rrawi@qf.org.qa>
+Description: Contact prediction using shrinked covariance (COUSCOus). COUSCOus is a residue-residue contact detecting method approaching the contact inference using the glassofast implementation of Matyas and Sustik (2012, The University of Texas at Austin UTCS Technical Report 2012:1-3. TR-12-29.) that solves the L_1 regularised Gaussian maximum likelihood estimation of the inverse of a covariance matrix. Prior to the inverse covariance matrix estimation we utilise a covariance matrix shrinkage approach, the empirical Bayes covariance estimator, which has been shown by Haff (1980) <DOI:10.1214/aos/1176345010> to be the best estimator in a Bayesian framework, especially dominating estimators of the form aS, such as the smoothed covariance estimator applied in a related contact inference technique PSICOV.
+Repository: CRAN
+License: GPL (>= 3)
+Imports: bio3d (>= 2.2-2), matrixcalc (>= 1.0-3), utils (>= 3.2.2)
+Depends: R (>= 3.2.2)
+NeedsCompilation: yes
+Packaged: 2016-02-28 13:00:30 UTC; reda
+Date/Publication: 2016-02-28 16:26:52
diff --git a/MD5 b/MD5
@@ -0,0 +1,17 @@
+9d859178e2da0036c7a37e6c6cd087b8 *DESCRIPTION
+83467b3dd14b1093ec8b0fbd5f030141 *NAMESPACE
+e032f7efde9755a16ede37bcc01ec88c *R/COUSCOus.R
+b87ec2f0cf15d20daedf954a5258fce1 *R/helper_precision.R
+0fc57b47a097d6402fa47a204a4e8f80 *R/helper_prediction.R
+c55d70ac61c6c70b74b4e67ca08a37d4 *R/helper_preprocess.R
+a903d57ea86a6734d17ac528913c4f36 *R/helper_shrinkage.R
+1652cc1d25db76e10a646a2cb765d7e7 *inst/examples/1oaiA0.fa
+ddcf71ef6c3ef5b55e9a4d7fdb0d2e52 *man/COUSCOus-internal.Rd
+ead3312208b2559bfd5cdccf017e41a5 *man/COUSCOus.Rd
+d14c3df549b301bb80d6caf1ccb5bb22 *src/aa2num.c
+e9b16a0eab395a18f689de6f1259f615 *src/calculate_pair_site_frequencies.c
+0db0659a516d4398256bc5b7e10ac5da *src/calculate_sequence_weights.c
+7fc25371ef6225458a8ed74b1edd55b6 *src/calculate_single_site_frequencies.c
+aa6df00a6912978180ab8354750bd936 *src/form_covarience_matrix.c
+1cc01dbbe27f2eff9a41b62b6d636689 *src/glassofast.f90
+710e5a462164809453e43dfe1af3ca0e *src/guess_rho_matrix.c
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,3 @@
+useDynLib(COUSCOus)
+exportPattern("^[[:alpha:]]+")
+import("bio3d","matrixcalc","utils")
diff --git a/R/COUSCOus.R b/R/COUSCOus.R
@@ -0,0 +1,93 @@
+#==================================================
+#==================================================
+#==================================================
+# Main Function
+
+COUSCOus <- function( fasta.file,
+                      verbose = TRUE )
+{
+  #==================================================
+  #==================================================
+  # Step 1: Load alignment and set required variables
+  start <- proc.time()
+  if( verbose )
+  {
+    print( paste('Step 1: Loading fasta alignment file:', fasta.file ) )
+  }
+
+  data.aln <- read.fasta( fasta.file )
+  aln.seqs <- data.aln$ali
+  n.aa <- dim( aln.seqs )[ 2 ]
+  n <- dim( aln.seqs )[ 1 ]
+  p <- n.aa
+
+  #==================================================
+  #==================================================
+  # Step 2: Preprocess data
+  start <- proc.time()
+  if( verbose )
+  {
+    print( 'Step 2: Preprocess data' )
+  }
+
+  list.preprocessing <- preprocessing( aln.seqs )
+  S <- list.preprocessing$S
+  wtsum <- list.preprocessing$wtsum
+  pa.vec <- list.preprocessing$pa
+
+  #==================================================
+  #==================================================
+  # Step 3: Shrink sample covariance matrix S
+  start <- proc.time()
+  if( verbose )
+  {
+    print( 'Step 3: Shrink sample covariance matrix S' )
+  }
+
+  ### WHY ARE WE MULTIPLYING n and p ALSO BY 21 FOR SHRINKAGE!  ###
+  ### TRY BOTH: (i)   REGULAR n AND p FROM THE DATA             ###
+  ###           (ii)  n AND p TIMES 21                          ###
+  S.shrinked <- shrink.S( S,
+                          n,
+                          p )
+
+  #==================================================
+  #==================================================
+  # Step 4: Generate (non-negative) regularisation matrix rho (similar to PSICOV)
+  start <- proc.time()
+  if( verbose )
+  {
+    print( 'Step 4: Generate regularisation matrix rho' )
+  }
+
+  rho <- generate.rho( wtsum,
+                       pa.vec,
+                       n.aa )
+
+  #==================================================
+  #==================================================
+  # Step 5: Calculate precision matrix
+  start <- proc.time()
+  if( verbose )
+  {
+    print( 'Step 5: Calculate precision matrix' )
+  }
+
+  P <- precision( S.shrinked,
+                  rho )
+
+  #==================================================
+  # Step 6: Generate prediction data frame
+  start <- proc.time()
+  if( verbose )
+  {
+    print( 'Step 6: Generate prediction data frame' )
+  }
+
+  predictions <- prediction( P,
+                             n.aa )
+
+  #==================================================
+  # Return predictions
+  return( predictions )
+}
diff --git a/R/helper_precision.R b/R/helper_precision.R
@@ -0,0 +1,77 @@
+#==================================================
+#==================================================
+# Generate rho matrix
+generate.rho <- function( wtsum,
+                          pa.vec,
+                          p,
+                          rhodefault = -1,
+                          maxgapf = 0.9 )
+{
+  #==================================================
+  # Choose rho according to wtsum
+  if( rhodefault < 0 )
+  {
+    trialrho <- max( 0.001, 1.0 / wtsum )
+  } else
+  {
+    trialrho <- rhodefault
+  }
+
+  #==================================================
+  # Chesk if rho is suitable
+  if( trialrho <= 0 | trialrho >= 1 )
+  {
+    stop( 'Sorry - failed to find suitable value for rho (0 < rho < 1)!' )
+  }
+
+  #==================================================
+  # Set rho matrix
+  rho <- numeric( length = p * 21 * p* 21 )
+  rho.raw <- .C( 'guess_rho_matrix',
+                 as.double( rho ),
+                 as.double( pa.vec ),
+                 as.double( p ),
+                 as.double( maxgapf ),
+                 as.double( trialrho ) )
+  rho.vec <- unlist( rho.raw[ 1 ] )
+  rho.mat <- matrix( rho.vec, p * 21, p * 21, byrow = TRUE )
+  sum( rho.mat )
+
+  #==================================================
+  # Return rho matrix
+  return( rho.mat )
+}
+
+
+#==================================================
+#==================================================
+# Run precision matrix calculation
+precision <- function( S.shrinked,
+                       rho )
+{
+  #==================================================
+  # Invert sample covariance matrix (shrinked one)
+  p <- nrow( S.shrinked )
+  X <- matrix( 0, p, p )
+  W <- matrix( 0, p, p )
+  Wd <- rep(0,p)
+  Wdj <- rep(0,p)
+  info <- 0
+  P <- matrix( .Fortran( 'glassofast',
+                         as.integer( nrow( S.shrinked ) ),
+                         as.double( S.shrinked ),
+                         as.double( rho ),
+                         as.double( 1e-4 ),
+                         as.integer( 1000 ),
+                         as.integer( 0 ),
+                         as.integer( 0 ),
+                         as.double( X ),
+                         as.double( W ),
+                         as.double( Wd ),
+                         as.double( Wdj ),
+                         as.integer( info ) )[[ 8 ]], p, p )
+
+  #==================================================
+  # Return precision matrix
+  return( P )
+}
diff --git a/R/helper_prediction.R b/R/helper_prediction.R
@@ -0,0 +1,51 @@
+#==================================================
+#==================================================
+# Generate predictions
+prediction <- function( P,
+                        n.aa )
+{
+  #==================================================
+  # Calculate contact matrix
+  P.contact <- P.contact <- matrix( 0, n.aa, n.aa )
+  for( i in 1:(n.aa-1) )
+  {
+    i.start <- ( i - 1 )  * 21 + 1
+    i.end <- i * 21 - 1
+    for( j in (i+1):n.aa )
+    {
+      P.contact[ i, j ] <- sum( abs( P[ i.start:i.end, ( ( j - 1 ) * 21 + 1 ):( j * 21 - 1 ) ] ) )
+      P.contact[ j, i ] <- P.contact[ i, j ]
+    }
+  }
+
+  #==================================================
+  # Perform APC
+  if( sum( P.contact != 0 ) == 0 )
+  {
+    PC <- P.contact
+  } else
+  {
+    APC <- matrix( NA, n.aa, n.aa )
+    mean.P.contact <- sum( P.contact[ upper.tri( P.contact ) ] ) / ( n.aa * ( n.aa - 1 ) * 0.5 )
+    for( i in 1:n.aa )
+    {
+      for( j in 1:n.aa )
+      {
+        # APC style like in PSICOV: I don't know why they add ( ( n.aa - 1 ) ^ 2 ) in the correction?
+        APC[ i, j ] <- ( sum( P.contact[ -i, i ] ) * sum( P.contact[ -j, j ] ) ) / ( ( n.aa - 1 ) ^ 2 ) / mean.P.contact
+      }
+    }
+    PC <- P.contact - APC
+  }
+
+  #==================================================
+  # Generate dataframe listing all possible contact strengths
+  predictions.all <- data.frame( cbind( t( combn( n.aa, 2 ) ), PC[ lower.tri( PC ) ] ) )
+  colnames( predictions.all ) <- c( 'i', 'j', 'pCorr' )
+  predictions.all <- predictions.all[ order( predictions.all$pCorr, decreasing = TRUE ), ]
+  rownames( predictions.all ) <- 1:nrow( predictions.all )
+
+  #==================================================
+  # Return predictions data frame
+  return( predictions.all )
+}
diff --git a/R/helper_preprocess.R b/R/helper_preprocess.R
@@ -0,0 +1,87 @@
+#==================================================
+#==================================================
+# Preprocessing data
+
+# !!! CHANGE PATHS !!!
+preprocessing <- function( aln.seqs,
+                           pseudoc = 1 )
+{
+  #==================================================
+  # Dimension of the alignment
+  n <- nrow( aln.seqs )
+  p <- ncol( aln.seqs )
+
+  #==================================================
+  # Convert AA letters to numeric code
+  aln.seqs.vec <- as.vector( t( aln.seqs ) )
+  aln.num.vec <- numeric( length = length( aln.seqs.vec ) )
+  aln.num.raw <- .C( 'aa2num',
+                     as.character( aln.seqs.vec ),
+                     as.double( length( aln.seqs.vec ) ),
+                     as.double( aln.num.vec ) )
+  aln.num.vec <- unlist( aln.num.raw[ 3 ] )
+
+  #==================================================
+  # Calculate sequence weights
+  mean.frac.id <- c( 0 )
+  id.tresh <- c( 0 )
+  wtcount <- numeric( length = n )
+  weight <- numeric( length = n )
+  wtsum <- c( 0 )
+  calc.seq.weights <- .C( 'calculate_sequence_weights',
+                          as.double( aln.num.vec ),
+                          as.double( n ),
+                          as.double( p ),
+                          as.double( mean.frac.id ),
+                          as.double( id.tresh ),
+                          as.double( wtcount ),
+                          as.double( weight ),
+                          as.double( wtsum ) )
+  weight <- unlist( calc.seq.weights[ 7 ] )
+  wtsum <- unlist( calc.seq.weights[ 8 ] )
+
+  #==================================================
+  # Get single site frequencies
+  pa.vec <- numeric( length = p * 21 )
+  pseudocount <- c( 1 )
+  pa.freq <- .C( 'calculate_single_site_frequencies',
+                 as.double( pa.vec ),
+                 as.double( aln.num.vec ),
+                 as.double( n ),
+                 as.double( p ),
+                 as.double( pseudocount ),
+                 as.double( weight ),
+                 as.double( wtsum ) )
+  pa.vec <- unlist( pa.freq[ 1 ] )
+
+  #==================================================
+  # Get pair site frequencies
+  pab.vec <- numeric( length = p * p * 21 * 21 )
+  pab.freq <- .C( 'calculate_pair_site_frequencies',
+                  as.double( pab.vec ),
+                  as.double( pa.vec ),
+                  as.double( aln.num.vec ),
+                  as.double( n ),
+                  as.double( p ),
+                  as.double( pseudocount ),
+                  as.double( weight ),
+                  as.double( wtsum ) )
+  pab.vec <- unlist( pab.freq[ 1 ] )
+
+  #==================================================
+  # Form the sample covariance matrix S
+  S.vec <- numeric( length = p * p * 21 * 21 )
+  S.raw <- .C( 'form_covarience_matrix',
+               as.double( S.vec ),
+               as.double( pab.vec ),
+               as.double( pa.vec ),
+               as.double( p ) )
+  S.vec <- unlist( S.raw[ 1 ] )
+  S.mat <- matrix( S.vec, p * 21, p * 21, byrow = TRUE )
+
+  #==================================================
+  # Return matrices S and rho
+  return( list( S = S.mat,
+                wtsum = wtsum,
+                pa = pa.vec ) )
+}
diff --git a/R/helper_shrinkage.R b/R/helper_shrinkage.R
@@ -0,0 +1,37 @@
+#==================================================
+#==================================================
+# Shrink sample covariance matrix S
+# (Using a maximum entropy bayes covariance estimator)
+
+shrink.S <- function( S,
+                      n,
+                      p )
+{
+  #==================================================
+  # Test if we need to do shrinkage
+  boolean <- tryCatch( chol( S ), error = function( e ){ return( FALSE ) } )
+  if( boolean )
+  {
+    print( 'No need for shrinkage!!!' )
+    return( S )
+  }
+
+  boolean <- FALSE
+  S.prime <- S + ( ( p - 1 ) / ( n * matrix.trace( S ) ) ) * diag( nrow( S ) )
+
+  while( boolean == FALSE )
+  {
+    boolean <- tryCatch( chol( S.prime ), error = function( e ){ return( FALSE ) } )
+    if( length( boolean ) > 1 )
+    {
+      break
+    } else
+    {
+      S.prime <- S.prime + ( ( p - 1 ) / ( n * matrix.trace( S.prime ) ) ) * diag( nrow( S.prime ) )
+    }
+  }
+
+  #==================================================
+  # Return shrinked covariance matrix
+  return( S.prime )
+}