diff --git a/colims-core/src/main/java/com/compomics/colims/core/io/fasta/FastaDbParser.java b/colims-core/src/main/java/com/compomics/colims/core/io/fasta/FastaDbParser.java index a614668c..0b24ba9e 100644 --- a/colims-core/src/main/java/com/compomics/colims/core/io/fasta/FastaDbParser.java +++ b/colims-core/src/main/java/com/compomics/colims/core/io/fasta/FastaDbParser.java @@ -1,6 +1,7 @@ package com.compomics.colims.core.io.fasta; import com.compomics.colims.model.FastaDb; +import com.compomics.colims.model.Protein; import com.compomics.colims.model.enums.SearchEngineType; import com.compomics.util.protein.Header; import org.slf4j.Logger; @@ -42,8 +43,8 @@ public class FastaDbParser { * @return the protein sequences map (key: protein accession; value: protein sequence) * @throws IOException thrown in case of an input/output related problem */ - public Map parse(LinkedHashMap fastaDbs) throws IOException { - Map proteinSequences = new HashMap<>(); + public Map parse(LinkedHashMap fastaDbs) throws IOException { + Map proteinSequences = new HashMap<>(); try { for (Map.Entry entry : fastaDbs.entrySet()) { FastaDb fastaDb = entry.getKey(); @@ -153,7 +154,7 @@ public LinkedHashMap testParseRule(Path fastaPath, String parseR * @param fastaPath the FASTA path * @throws IOException in case of file reading related problem */ - private void parseWithRule(Map proteinSequences, FastaDb fastaDb, Path fastaPath) throws IOException { + private void parseWithRule(Map proteinSequences, FastaDb fastaDb, Path fastaPath) throws IOException { try (BufferedReader bufferedReader = Files.newBufferedReader(fastaPath)) { //compile the pattern Pattern pattern; @@ -170,11 +171,23 @@ private void parseWithRule(Map proteinSequences, FastaDb fastaDb if (line.startsWith(BLOCK_SEPARATOR)) { //add limiting check for protein store to avoid growing if (sequenceBuilder.length() > 0) { + Protein protein = new Protein(); + protein.setSequence(sequenceBuilder.toString().trim()); + String regex = "(? 2){ + if(fastaHeader.substring(1).split(regex)[2].split(SPLITTER).length >1){ + String description =String.join(SPLITTER, Arrays.copyOfRange(fastaHeader.substring(1).split(regex)[2].split(SPLITTER), 1, fastaHeader.substring(1).split(regex)[2].split(SPLITTER).length)); + if(description.contains(" OS=")){ + protein.setDescription(description.substring(0, description.indexOf(" OS="))); + } + } + + } Matcher matcher = pattern.matcher(fastaHeader.substring(1)); if (matcher.find()) { - proteinSequences.putIfAbsent(matcher.group(1), sequenceBuilder.toString().trim()); + proteinSequences.putIfAbsent(matcher.group(1), protein); } else { - proteinSequences.putIfAbsent(fastaHeader.substring(1).split(SPLITTER)[0], sequenceBuilder.toString().trim()); + proteinSequences.putIfAbsent(fastaHeader.substring(1).split(SPLITTER)[0], protein); } sequenceBuilder.setLength(0); } @@ -185,11 +198,19 @@ private void parseWithRule(Map proteinSequences, FastaDb fastaDb } //last line if (sequenceBuilder.length() > 0) { + Protein protein = new Protein(); + protein.setSequence(sequenceBuilder.toString().trim()); + if(fastaHeader.substring(1).split("|").length > 2 && fastaHeader.substring(1).split("|")[2].split(SPLITTER).length >1){ + String description = fastaHeader.substring(1).split("|")[2].split(SPLITTER)[1]; + if(description.contains(" OS=")){ + protein.setDescription(description.substring(0, description.indexOf("OS="))); + } + } Matcher matcher = pattern.matcher(fastaHeader.substring(1).split(SPLITTER)[0]); if (matcher.find()) { - proteinSequences.putIfAbsent(matcher.group(1), sequenceBuilder.toString().trim()); + proteinSequences.putIfAbsent(matcher.group(1), protein); } else { - proteinSequences.putIfAbsent(fastaHeader.substring(1).split(SPLITTER)[0], sequenceBuilder.toString().trim()); + proteinSequences.putIfAbsent(fastaHeader.substring(1).split(SPLITTER)[0], protein); } sequenceBuilder.setLength(0); } @@ -203,7 +224,7 @@ private void parseWithRule(Map proteinSequences, FastaDb fastaDb * @param fastaPath the FASTA path * @throws IOException in case of file reading related problem */ - private void parseWithoutRule(Map proteinSequences, Path fastaPath) throws IOException { + private void parseWithoutRule(Map proteinSequences, Path fastaPath) throws IOException { try (BufferedReader bufferedReader = Files.newBufferedReader(fastaPath)) { //start reading the file final StringBuilder sequenceBuilder = new StringBuilder(); @@ -213,7 +234,10 @@ private void parseWithoutRule(Map proteinSequences, Path fastaPa if (line.startsWith(BLOCK_SEPARATOR)) { //add limiting check for protein store to avoid growing if (sequenceBuilder.length() > 0) { - proteinSequences.putIfAbsent(fastaHeader.substring(1).split(SPLITTER)[0], sequenceBuilder.toString().trim()); + Protein protein = new Protein(); + protein.setSequence(sequenceBuilder.toString().trim()); + // protein.setDescription(fastaHeader.substring(1).split(SPLITTER)[1]); + proteinSequences.putIfAbsent(fastaHeader.substring(1).split(SPLITTER)[0], protein); sequenceBuilder.setLength(0); } fastaHeader = line; @@ -223,7 +247,9 @@ private void parseWithoutRule(Map proteinSequences, Path fastaPa } //last line if (sequenceBuilder.length() > 0) { - proteinSequences.putIfAbsent(fastaHeader.substring(1).split(SPLITTER)[0], sequenceBuilder.toString().trim()); + Protein protein = new Protein(); + protein.setSequence(sequenceBuilder.toString().trim()); + proteinSequences.putIfAbsent(fastaHeader.substring(1).split(SPLITTER)[0], protein); sequenceBuilder.setLength(0); } } diff --git a/colims-core/src/main/java/com/compomics/colims/core/playground/Playground.java b/colims-core/src/main/java/com/compomics/colims/core/playground/Playground.java index 1190cba5..ee0956d5 100644 --- a/colims-core/src/main/java/com/compomics/colims/core/playground/Playground.java +++ b/colims-core/src/main/java/com/compomics/colims/core/playground/Playground.java @@ -57,7 +57,7 @@ public static void main(final String[] args) throws IOException, ClassNotFoundEx testFastaDb.setDatabaseName("test db"); LinkedHashMap fastaDbs = new LinkedHashMap<>(); fastaDbs.put(testFastaDb, Paths.get(testFastaDb.getFilePath())); - Map parse = fastaDbParser.parse(fastaDbs); + // Map parse = fastaDbParser.parse(fastaDbs); System.out.println("=---------------"); } diff --git a/colims-core/src/main/java/com/compomics/colims/core/service/ProteinService.java b/colims-core/src/main/java/com/compomics/colims/core/service/ProteinService.java index 1922efe2..12b4ff71 100644 --- a/colims-core/src/main/java/com/compomics/colims/core/service/ProteinService.java +++ b/colims-core/src/main/java/com/compomics/colims/core/service/ProteinService.java @@ -22,9 +22,10 @@ public interface ProteinService extends GenericService { * looks for the protein in the cache first before querying the database. * * @param sequence the protein sequence + * @param description the protein description * @return the found Protein instance */ - Protein getProtein(String sequence); + Protein getProtein(String sequence, String description); /** * Clear the resources used by this resource. diff --git a/colims-core/src/main/java/com/compomics/colims/core/service/impl/ProteinServiceImpl.java b/colims-core/src/main/java/com/compomics/colims/core/service/impl/ProteinServiceImpl.java index cf7b4af5..cc9e9ddc 100644 --- a/colims-core/src/main/java/com/compomics/colims/core/service/impl/ProteinServiceImpl.java +++ b/colims-core/src/main/java/com/compomics/colims/core/service/impl/ProteinServiceImpl.java @@ -67,12 +67,13 @@ public void remove(Protein entity) { } @Override - public Protein getProtein(String sequence) { + public Protein getProtein(String sequence, String description) { Protein targetProtein; //first, look in the newly added proteins map //@todo configure hibernate cache and check performance targetProtein = cachedProteins.get(sequence); + if (targetProtein == null) { //check if the protein is found in the db targetProtein = findBySequence(sequence); @@ -82,8 +83,13 @@ public Protein getProtein(String sequence) { targetProtein = new Protein(sequence); } + if(targetProtein.getDescription() == null && description != null){ + targetProtein.setDescription(description); + } //add to cached proteins cachedProteins.put(sequence, targetProtein); + }else if(targetProtein.getDescription() == null && description != null){ + targetProtein.setDescription(description); } return targetProtein; } diff --git a/colims-core/src/test/java/com/compomics/colims/core/io/fasta/FastaDbParserTest.java b/colims-core/src/test/java/com/compomics/colims/core/io/fasta/FastaDbParserTest.java index 2eb2f73a..d4195d62 100644 --- a/colims-core/src/test/java/com/compomics/colims/core/io/fasta/FastaDbParserTest.java +++ b/colims-core/src/test/java/com/compomics/colims/core/io/fasta/FastaDbParserTest.java @@ -1,6 +1,7 @@ package com.compomics.colims.core.io.fasta; import com.compomics.colims.model.FastaDb; +import com.compomics.colims.model.Protein; import com.compomics.colims.model.enums.SearchEngineType; import org.junit.Assert; import org.junit.Test; @@ -52,7 +53,7 @@ public void testParse() throws IOException { fastaDbs.put(testFastaDb, Paths.get(testFastaDb.getFilePath())); fastaDbs.put(contaminantsFastaDb, Paths.get(contaminantsFastaDb.getFilePath())); - Map parsedFastas = fastaDbParser.parse(fastaDbs); + Map parsedFastas = fastaDbParser.parse(fastaDbs); Assert.assertFalse(parsedFastas.containsKey(null)); Assert.assertEquals(20381, parsedFastas.size()); @@ -64,7 +65,8 @@ public void testParse() throws IOException { Assert.assertTrue(parsedFastas.containsKey("Q2KJ03")); Assert.assertEquals("MEEKTTQSVEGLKQYCLVPEREMKHIERHIHQTGKAGEFKNKPFRQVLQPPNETKLP" + "KIMPEGHGIQNAQRRKQVNEREQMQTKDHQERMIRGRELAEQRLKERILRRSQSQLLTYEKHERVKEIK" + - "EFERVIAYLLFQPCSRSRIKVSILMDKSQNGEKVNTIVKPYQRKFLAMPPFLRSQIGKIRD", parsedFastas.get("Q2KJ03")); + "EFERVIAYLLFQPCSRSRIKVSILMDKSQNGEKVNTIVKPYQRKFLAMPPFLRSQIGKIRD", parsedFastas.get("Q2KJ03").getSequence()); + Assert.assertEquals("Putative uncharacterized protein ZNRD1-AS1", parsedFastas.get("Q2KJ03").getDescription()); //look for 3 contaminants proteins Assert.assertTrue(parsedFastas.containsKey("P00761")); Assert.assertTrue(parsedFastas.containsKey("Q9TRI1")); @@ -81,7 +83,9 @@ public void testParse() throws IOException { "DLDSIIAEVRAQYEEIAQRSKAEAEALYQTKVQQLQISVDQHGDNLKNTKSEIAELNRMIQRLRAEIE" + "NIKKQCQTLQVSVADAEQRGENALKDAHSKRVELEAALQQAKEELARMLREYQELMSVKLALDIEIAT" + "YRKLLEGEEYRMSGECQSAVSISVVSGSTSTGGISGGLGSGSGFGLSSGFGSGSGSGFGFGGSVSGSS" + - "SSKIISTTTLNKRR", parsedFastas.get("P19013")); + "SSKIISTTTLNKRR", parsedFastas.get("P19013").getSequence()); + Assert.assertEquals("Keratin, type II cytoskeletal 4", parsedFastas.get("P19013").getDescription()); + } @Test diff --git a/colims-distributed/src/main/java/com/compomics/colims/distributed/io/maxquant/parsers/MaxQuantProteinGroupsParser.java b/colims-distributed/src/main/java/com/compomics/colims/distributed/io/maxquant/parsers/MaxQuantProteinGroupsParser.java index 541d4b08..4e749a03 100644 --- a/colims-distributed/src/main/java/com/compomics/colims/distributed/io/maxquant/parsers/MaxQuantProteinGroupsParser.java +++ b/colims-distributed/src/main/java/com/compomics/colims/distributed/io/maxquant/parsers/MaxQuantProteinGroupsParser.java @@ -52,7 +52,7 @@ public class MaxQuantProteinGroupsParser { * The map of parsed protein sequences (key: protein accession; value: * protein sequence). */ - private Map proteinSequences = new HashMap<>(); + private Map proteinSequences = new HashMap<>(); /** * The quantification method. */ @@ -165,8 +165,8 @@ private ProteinGroup parseProteinGroup(Map proteinGroupsEntry, b strippedAccession = org.apache.commons.lang3.StringUtils.substringAfter(strippedAccession, ProteinGroupHasProtein.CONTAMINANT_PREFIX); } //get the protein sequence by it's accession - String sequence = getProteinSequence(strippedAccession); - proteinGroup.getProteinGroupHasProteins().add(createProteinGroupHasProtein(sequence, accession, isMainGroup, proteinGroup)); + Protein protein = getProteinSequence(strippedAccession); + proteinGroup.getProteinGroupHasProteins().add(createProteinGroupHasProtein(protein, accession, isMainGroup, proteinGroup)); if (isMainGroup) { isMainGroup = false; @@ -208,7 +208,7 @@ private ProteinGroup parseProteinGroup(Map proteinGroupsEntry, b * @return sequence the found sequence * @throws IllegalArgumentException if the accession key is not found */ - private String getProteinSequence(String accession) { + private Protein getProteinSequence(String accession) { if (proteinSequences.containsKey(accession)) { return proteinSequences.get(accession); } else { @@ -224,12 +224,13 @@ private String getProteinSequence(String accession) { * @param mainGroup whether this is the main protein of the group * @return a ProteinGroupHasProtein object */ - private ProteinGroupHasProtein createProteinGroupHasProtein(String sequence, String accession, boolean mainGroup, ProteinGroup proteinGroup) { + private ProteinGroupHasProtein createProteinGroupHasProtein(Protein pr, String accession, boolean mainGroup, ProteinGroup proteinGroup) { ProteinGroupHasProtein proteinGroupHasProtein = new ProteinGroupHasProtein(); proteinGroupHasProtein.setIsMainGroupProtein(mainGroup); //get protein - Protein protein = proteinService.getProtein(sequence); + Protein protein = proteinService.getProtein(pr.getSequence(), pr.getDescription()); + protein.setDescription(pr.getDescription()); //set protein accession proteinGroupHasProtein.setProteinAccession(accession); @@ -262,15 +263,15 @@ private void createProteinGroupQuant(ProteinGroup proteinGroup, AnalyticalRun an //set the analytical run proteinGroupQuant.setAnalyticalRun(analyticalRun); //set the intensity - if (intensity != null) { + if (intensity != null && !Double.isNaN(Double.parseDouble(intensity))) { proteinGroupQuant.setIntensity(Double.parseDouble(intensity)); } //set the LFQ intensity - if (lfqIntensity != null) { + if (lfqIntensity != null && !Double.isNaN(Double.parseDouble(lfqIntensity))) { proteinGroupQuant.setLfqIntensity(Double.parseDouble(lfqIntensity)); } //set the iBAQ - if (ibaq != null) { + if (ibaq != null && !Double.isNaN(Double.parseDouble(ibaq))) { proteinGroupQuant.setIbaq(Double.parseDouble(ibaq)); } //set the MSMS Count diff --git a/colims-distributed/src/main/java/com/compomics/colims/distributed/io/utilities_to_colims/UtilitiesProteinGroupMapper.java b/colims-distributed/src/main/java/com/compomics/colims/distributed/io/utilities_to_colims/UtilitiesProteinGroupMapper.java index 9178f4c4..459e2e1a 100644 --- a/colims-distributed/src/main/java/com/compomics/colims/distributed/io/utilities_to_colims/UtilitiesProteinGroupMapper.java +++ b/colims-distributed/src/main/java/com/compomics/colims/distributed/io/utilities_to_colims/UtilitiesProteinGroupMapper.java @@ -67,7 +67,7 @@ public void map(final ProteinMatch proteinMatch, final PSParameter proteinGroupS //get the utilities Protein from SequenceFactory com.compomics.util.experiment.biology.Protein sourceProtein = SequenceFactory.getInstance().getProtein(proteinAccession); //get protein - Protein matchedProtein = proteinService.getProtein(sourceProtein.getSequence()); + Protein matchedProtein = proteinService.getProtein(sourceProtein.getSequence(), null); if (proteinAccession.equals(proteinMatch.getMainMatch())) { //set the is main protein group flag to true diff --git a/colims-model/src/main/java/com/compomics/colims/model/Protein.java b/colims-model/src/main/java/com/compomics/colims/model/Protein.java index afee47f6..06f37cad 100644 --- a/colims-model/src/main/java/com/compomics/colims/model/Protein.java +++ b/colims-model/src/main/java/com/compomics/colims/model/Protein.java @@ -28,13 +28,17 @@ public class Protein extends DatabaseEntity { @Column(name = "protein_sequence", nullable = false) // @Field(index=Index.YES, analyze=Analyze.NO, store=Store.NO) private String sequence; + + @Basic(optional = true) + @Column(name = "description", nullable = true) + private String description; /** - * The ProteinGroupHasProtein instances from the join table between the protein group and protein tables. + * The ProteinGroupHasProtein instances from the join table between the + * protein group and protein tables. */ @OneToMany(mappedBy = "protein") private List proteinGroupHasProteins = new ArrayList<>(); - /** * No-arg constructor. */ @@ -58,6 +62,14 @@ public void setSequence(String sequence) { this.sequence = sequence; } + public String getDescription() { + return description; + } + + public void setDescription(String description) { + this.description = description; + } + public List getProteinGroupHasProteins() { return proteinGroupHasProteins; } @@ -66,11 +78,14 @@ public void setProteinGroupHasProteins(List proteinGroup this.proteinGroupHasProteins = proteinGroupHasProteins; } - @Override public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } Protein protein = (Protein) o; diff --git a/colims-repository/src/main/resources/setup/colims_db_setup.sql b/colims-repository/src/main/resources/setup/colims_db_setup.sql index 6904bb86..4b4e0dba 100644 --- a/colims-repository/src/main/resources/setup/colims_db_setup.sql +++ b/colims-repository/src/main/resources/setup/colims_db_setup.sql @@ -243,6 +243,7 @@ create table project_has_user ( create table protein ( id bigint not null auto_increment, protein_sequence longtext not null, + description varchar(200), primary key (id) ); diff --git a/colims-repository/src/main/resources/setup/colims_db_setup_dev.sql b/colims-repository/src/main/resources/setup/colims_db_setup_dev.sql index 212d6374..b4fb73a8 100644 --- a/colims-repository/src/main/resources/setup/colims_db_setup_dev.sql +++ b/colims-repository/src/main/resources/setup/colims_db_setup_dev.sql @@ -246,6 +246,7 @@ create table project_has_user ( create table protein ( id bigint not null auto_increment, protein_sequence longtext not null, + description varchar(200), primary key (id) );