In [2]:
import numpy as np
import pandas as pd

from datetime import datetime
from Questions import Questions
from LLMapi import LLMapi
from DBConnector import DBConnector

from ParsePubMed import ParsePubMed,ExceptionHandler
from PubMedIdTranslator import PubMedIdTranslator

from ipywidgets import IntProgress,Text
from IPython.display import display

In [6]:
llm = LLMapi(URL='http://10.4.24.103',verbose=True)

Connection ok


In [3]:
dbc = DBConnector(database_name='eDNAqua',collection_name='Articles',verbose=True)

Connecting to database mongodb://127.0.0.1 on port 27017...Client connected
Connecting to database "eDNAqua"
Connected
Selecting collection "Articles"
OK


# Read the data

### Questions

In [7]:
questions = Questions()
for i,q in enumerate(questions._questions):
    print(f'Q{i+1}: {q}')

Q1: Are the data in this paper environmental? Only "yes" or "no".
Q2: Are the data available in paper or supplement? Only "paper" or "suplement".
Q3: What is the sample collection method?
Q4: What is the DNA extraction method?
Q5: What is the source of the protocol in protocols.io?
Q6: What is the overall sequencing strategy used in experiment?
Q7: What is the sequence analysis workflow?
Q8: Where is the data stored?
Q9: What is the marker name used in experiment?
Q10: What is the reference database used for taxonomical identification?


### Responses

In [4]:
resp = list(dbc.collection.find())

In [8]:
with open('LLM_result_database.json','w') as json:
    json.write(str(resp))

### Propositions of summaries

In [29]:
summarizes = {
    'Q1':None,
    'Q2':None,
    'Q3':'Basing on provided list write overall sample collection method.',
    'Q4':'Basing on provided list write overall DNA extraction method.',
    'Q5':None,
    'Q6':'Basing on provided list write overall sequencing strategy used in experiments.',
    'Q7':'Basing on provided list write overall sequence analysis workflow.',
    'Q8':None,
    'Q9':None,
    'Q10':None
}

# Question 2

In [19]:
questions.Q2

'Are the data available in paper or supplement? Only "paper" or "suplement".'

In [10]:
resp_q2 = ['both' if 'both' in r[f'Q2'] else ('paper' if 'paper' in r['Q2'] else 'supplement') for r in resp if f'Q{i}' in r.keys()]

In [18]:
print('|Where the data are?|Number of papers|\n|-:|-:|')
for val,count in np.transpose(np.unique(resp_q2,return_counts=True)):
    print(f'|{val}|{count}|')

|Where the data are?|Number of papers|
|-:|-:|
|both|66|
|paper|459|
|supplement|915|


|Where the data are?|Number of papers|
|-:|-:|
|both|66|
|paper|459|
|supplement|915|

In [391]:
66+459+915

1440

# Question 3

In [20]:
questions.Q3

'What is the sample collection method?'

In [36]:
resp_q3 = '\n\n'.join([r['Q3'] for r in resp if 'Q3' in r.keys() and len(r['Q3']) > 150])
resp_q3 = '\n\n'.join([r for r in resp_q3.split('\n\n') if len(r) > 150])

#### Liczba artykułów, na których można bazować:

In [39]:
len(resp_q3.split('\n\n'))

1182

In [40]:
answer_q3 = llm.ask(summarizes['Q3'],resp_q3)['Answer']

In [42]:
print(answer_q3)

Based on the provided list, the overall sample collection method is not explicitly stated, but it can be inferred that the samples were collected through various methods, including:

1. Surveys and interviews
2. Online postings and updates
3. Filtration devices (VigiBOAT, SPYGEN)
4. Microscopic examination
5. Kick seining
6. Fishing using a boat-mounted electrofishing unit
7. Collaboration with ongoing national and international monitoring programs of fish and seals
8. Flowthrough columns packed with sieved sediment
9. Windshield splatter analysis

The specific sample collection method used in each study is not explicitly mentioned in the text, but it can be inferred based on the context and the information provided.


# Question 4

In [43]:
questions.Q4

'What is the DNA extraction method?'

In [52]:
resp_q4 = '\n---\n'.join([r['Q4'] for r in resp if 'Q4' in r.keys() and len(r['Q4']) > 150])
# resp_q3 = '\n\n'.join([r for r in resp_q3.split('\n\n') if len(r) > 150])

#### Liczba artykułów, na których można bazować:

In [54]:
len(resp_q4.split('\n---\n'))

946

In [55]:
answer_q4 = llm.ask(summarizes['Q4'],resp_q4)['Answer']

In [57]:
print(answer_q4)

Based on the provided list, the overall DNA extraction method can be described as follows:

The DNA extraction method used in the study involves the following steps:

1. Sample Preparation: The sample is prepared by homogenizing or grinding the tissue using a mortar and pestle, blender, or other similar tools.
2. Lysis: The sample is then treated with a lysis buffer containing detergents and/or enzymes to break down the cell membranes and release the DNA.
3. DNA Purification: The released DNA is then purified using a variety of techniques, such as centrifugation, filtration, or precipitation with ethanol or isopropanol.
4. Quantification: The quantity of DNA is measured using a spectrophotometer or fluorescence-based assay to determine the amount of DNA available for analysis.
5. PCR Setup: The purified DNA is then ready for PCR amplification, which involves the use of primers specific to the target gene regions to amplify the DNA sequences of interest.

It's worth noting that differen

# Question 5

In [61]:
questions.Q5

'What is the source of the protocol in protocols.io?'

In [71]:
protocols = list()
for l in [[ref for ref in r['references'].split(',') if 'protocols' in ref] for r in resp if 'references' in r.keys()]:
    protocols += l

In [73]:
protocols = [[ref for ref in r['references'].split(',') if 'protocols' in ref] for r in resp if 'references' in r.keys()]

In [76]:
# protocols[np.argwhere(np.array([len(p) for p in protocols])>0).flatten()]
np.sum(np.array([len(p) for p in protocols])>0)

0

Do głębszego zastanowienia. Być może trzeba będzie jeszcze raz przetworzyć dane crawlerem pobierając pełne referencje dla wyodrębnienia doi z protocols.io.

In [396]:
dois = [r['DOI'] for r in resp]

In [400]:
dois[0]

'10.1371/journal.pone.0238557'

In [None]:
references = list()
dp_text = Text()
dp_bar = IntProgress(min=0,max=len(dois))
display(dp_text)
display(dp_bar)
for doi in dois:
    dp_bar.value += 1
    dp_text.value = f'{dp_bar.value}/{len(dois)} ({dp_bar.value/len(dois)*100:.2f} %)'
    try:
        pmid = PubMedIdTranslator.DOItoPubMed(doi)
        if pmid is None or pmid == 0:
            raise ValueError('')
        parser = ParsePubMed(pmid,False)
    except ValueError:
        print(f'{doi}: Cannot process given ID')

    references += [doi for doi in [PubMedIdTranslator.PubMedtoDOI(pmid) for pmid in parser.References] if doi is not None]


Text(value='')

IntProgress(value=0, max=1607)

10.1098/rsbl.2008.0118: Cannot process given ID
10.1093/nar/24.16.3189: Cannot process given ID
10.1098/rstb.2003.1447: Cannot process given ID
10.1128/AEM.01240-10: Cannot process given ID
10.1098/rstb.2004.1573: Cannot process given ID
10.1128/AEM.01298-08: Cannot process given ID
10.1128/AEM.02720-06: Cannot process given ID
10.1073/pnas.0605127103: Cannot process given ID
10.1101/gr.112730.110: Cannot process given ID
10.1073/pnas.0503123102: Cannot process given ID
10.1073/pnas.1013332108: Cannot process given ID
10.1073/pnas.0706905105: Cannot process given ID
10.1128/AEM.07192-11: Cannot process given ID
10.1093/molbev/msr121: Cannot process given ID
10.1101/gr.849004: Cannot process given ID
10.1098/rspb.2020.2424: Cannot process given ID
10.1073/pnas.0707157105: Cannot process given ID
10.1093/nar/gkh340: Cannot process given ID
10.1128/AEM.71.12.8228-8235.2005: Cannot process given ID
10.1093/bioinformatics/btt434: Cannot process given ID
10.1093/bioinformatics/btu044: Cannot

In [418]:
pd.Series(np.unique(references)).apply(lambda x:'protocols' in x).sum()

0

In [420]:
content = [r['content'] for r in resp]

In [424]:
content = pd.Series(content)

In [425]:
protocols = content[content.apply(lambda x: 'protocols.io' in x)]

In [427]:
for c in protocols:
    index = c.find('protocols.io') + 12
    start = index
    while c[start:start+2] != '. ' and start > 0:
        start -= 1
    start += 2
    stop1 = c.find('. ',index)
    stop2 = c.find('.\n',index)
    stop = min(stop1,stop2)
    print(c[start:stop])

The complete protocol is published on protocols.io (https://dx.doi.org/10.17504/protocols.io.n2udgew)
Our aquatic eDNA methods are posted online at protocols.io site (https://dx.doi.org/10.17504/protocols.io.p9gdr3w)
All data generation methods are detailed in protocols.io (dx.doi.org/10.17504/protocols.io.u6zezf6)
The full protocol used to create eukaryotic SSU rRNA metabarcoding libraries can be found online at protocols.io: dx.doi.org/10.17504/protocols.io.hdmb246
The two-step-PCR strategy for COI amplicon library preparation results in double-uniquely indexed libraries obtained using broad-spectrum BF3-BR2 primers with variable-length inserts (phased), reducing cross-contamination through index hopping and increasing signal complexity within the sequencing lane, thus translating to higher quality of results.
Materials and methods
The protocol described in this article is published on protocols.io https://www.protocols.io/private/C609E2107CD8B7CFF46EFF1461DBE4C3 and is included for 

# Question 6

In [78]:
questions.Q6

'What is the overall sequencing strategy used in experiment?'

In [79]:
resp_q6 = '\n---\n'.join([r['Q6'] for r in resp if 'Q6' in r.keys() and len(r['Q6']) > 150])
# resp_q3 = '\n\n'.join([r for r in resp_q3.split('\n\n') if len(r) > 150])

In [81]:
print(resp_q6)

Based on the information provided in the text, the overall sequencing strategy used in the experiment is as follows:

1. DNA extraction from Sterivex filters using the DNAeasy Tissue and Blood Kit (Qiagen Inc.) with modifications.
2. PCR amplification of the extracted eDNA using the MiFish Universal Teleost 12S primer (Miya et al., 2015) with Nextera modifications.
3
---
Based on the provided document, the overall sequencing strategy used in the experiment is Massively Parallel Sequencing (MPS) using the MiSeq platform. The experiment involves two PCR amplifications, followed by indexing and pooling of the libraries for sequencing. The first PCR amplifies the target region using primers specific to the primer binding sites, while the second PCR amplifies the same region using primers specific to the dual-index sequences (40 unique indices in
---
Based on the provided context, the overall sequencing strategy used in the experiment is:

1. DNA extraction from aquarium water samples using

#### Liczba artykułów, na których można bazować:

In [82]:
len(resp_q6.split('\n---\n'))

1425

In [83]:
answer_q6 = llm.ask(summarizes['Q6'],resp_q6)['Answer']

In [85]:
answer_q6_a = llm.ask('Extract some stats on the steps made in experiments from provided list',resp_q6)['Answer']

In [88]:
answer_q6_b = llm.ask('List the DNA extraction methods mentioned on the provided list.',resp_q6)['Answer']

In [94]:
answer_q6_c = llm.ask('List the sequencing devices mentioned on the provided list.',resp_q6)['Answer']

In [None]:
answer_q6_d = llm.ask('Extract all the tools and devices used in experiments from provided list. List must include the counts of use of this things.',resp_q6)['Answer']

In [84]:
print(answer_q6)

Based on the provided list, the overall sequencing strategy used in the experiments is not explicitly mentioned. However, we can infer that the experiments involve the analysis of high-throughput sequencing data, as the text mentions "reads" and "fastq files." Additionally, the text mentions "paired-end" reads, which suggests that the sequencing data was generated using a paired-end sequencing protocol. Therefore, the overall sequencing strategy used in the experiments is likely a combination of high-throughput sequencing and paired-end sequencing.


In [86]:
print(answer_q6_a)

1. DNA extraction: 3
                    2. PCR amplification: 3
                    3. Sequencing: 6
                    4. Data processing: 4
                    5. Downsampling: 1
                    6. Reference labels: 1
                    7. Taxon assignment: 1
                    8. Data analysis: 3
                    
                    Note: The numbers in parentheses represent the number of times each step appears in the list of experiments.


In [89]:
print(answer_q6_b)

Based on the provided list, the following DNA extraction methods are mentioned:

1. Cross-flow filtration capsule and CL1 conservation buffer.
2. NucleoSpin® Soil (MACHEREY-NAGEL GmbH & Co., Düren, Germany).
3. Modified Bligh and Dyer method.
4. Initial 'preamplification' to increase the concentration of the target loci relative to other DNA.
5. PCR amplification using specific primers.


In [95]:
print(answer_q6_c)

Based on the provided list, the following sequencing devices are mentioned:

1. Illumina MiSeq
2. PacBio
3. Illumina HiSeq
4. Pyrosequencing
5. Next-generation sequencing technologies

Please note that the list does not explicitly mention any specific models or versions of these sequencing devices.


In [103]:
print(answer_q6_d)

Based on the provided list of documents, the following tools and devices were used in the experiments:

1. DNA extraction kits:
	* DNeasy PowerWater Kit (Qiagen): 3 documents
	* DNeasy Blood & Tissue Extraction Kit (Qiagen): 1 document
	Total count: 4 documents
2. PCR amplification:
	* Primers for 16S rDNA V3 hypervariable region: 2 documents
	* Primers for 18S rDNA V9 hypervariable region: 1 document
	Total count: 3 documents
3. Sequencing technologies:
	* Illumina MiSeq: 2 documents
	* PacBio: 1 document
	* Ion Torrent Personal Genome Machine (PGM): 1 document
	Total count: 4 documents
4. DNA barcoding techniques:
	* DNA barcode container: 1 document
	* Barcode ITS2 or ITS1: 1 document
	Total count: 2 documents
5. High-throughput sequencing (HTS) records:
	* HTS experimental records: 1 document
	Total count: 1 document
6. Library preparation:
	* PCR amplification of a approximately 250-base pair fragment of 12S using specific primers: 1 document
	Total count: 1 document
7. Sampling i

# Question 7

In [104]:
questions.Q7

'What is the sequence analysis workflow?'

In [105]:
resp_q7 = '\n---\n'.join([r['Q7'] for r in resp if 'Q7' in r.keys() and len(r['Q7']) > 150])
# resp_q3 = '\n\n'.join([r for r in resp_q3.split('\n\n') if len(r) > 150])

In [106]:
print(resp_q7)

Based on the provided text, the sequence analysis workflow includes the following steps:

1. Quality control and trimming of raw reads using the Anacapa Toolkit.
2. Amplicon sequence variant (ASV) parsing and taxonomic assignment using the Anacapa Toolkit and custom reference databases.
3. Assignment of taxonomy using the FishCARD California fish specific reference database and the CRUX-generated 12S reference database supplement
---
The sequence analysis workflow involves several steps:

1. Data pre-processing: The raw sequencing data is cleaned and filtered to remove low-quality reads and primer sequences.
2. BLAST search: The cleaned reads are compared to a database of known fish mitochondrial DNA sequences using BLAST to identify the species of origin.
3. Species assignment: The BLAST search results are used to assign a species label to each read.
4.
---
Based on the provided document, the sequence analysis workflow includes the following steps:

1. DNA extraction from seawater sam

#### Liczba artykułów, na których można bazować:

In [107]:
len(resp_q7.split('\n---\n'))

1438

In [108]:
answer_q7 = llm.ask(summarizes['Q7'],resp_q7)['Answer']

In [110]:
print(answer_q7)

The overall sequence analysis workflow can be inferred as follows:

1. Preprocessing: The raw sequencing data is preprocessed to remove low-quality reads, filter out primer sequences and adapter contamination, and extract the relevant features.
2. Feature extraction: The remaining high-quality reads are then converted into feature matrices, where each sample is represented as a vector of features.
3. Similarity calculation: The similarity between each pair of sequences is calculated based on their features, using techniques such as Euclidean distance or cosine similarity.
4. Clustering: The sequences are then grouped into clusters based on their similarities, using techniques such as k-means or hierarchical clustering.
5. Visualization: The results of the clustering are then visualized using techniques such as scatter plots or dendrograms to identify patterns and trends in the data.
6. Interpretation: The results of the analysis are then interpreted and communicated to stakeholders, fo

In [118]:
answer_q7_a = llm.ask('What bioinformatic tool were used in these experiments and in which step of analysis were they used. List them.',resp_q7)['Answer']

In [119]:
print(answer_q7_a)

Based on the provided documents, the following bioinformatic tools were used in the experiments and at which step of analysis:

1. Document 1:
	* Step 1: Sample Preparation
		+ Bioinformatic tool: Geneious software
		+ Function: Quality filtering, trimming, and adapter removal
	* Step 3: Library Preparation
		+ Bioinformatic tool: Illumina sequencing platform
		+ Function: Sequencing the prepared libraries
	* Step 5: Data Analysis
		+ Bioinformatic tool: DESeq2
		+ Function: Quantifying gene expression and identifying differentially expressed genes
2. Document 2:
	* Step 2: Feature Extraction
		+ Bioinformatic tool: RNA-seq by read count
		+ Function: Identifying which genes are expressed, and at what levels
	* Step 4: Pathway Analysis
		+ Bioinformatic tool: DAVID or Reactome
		+ Function: Inferring biological pathways and networks that are active in the sample being studied
3. Document 3:
	* Step 1: Preprocessing
		+ Bioinformatic tool: Coulter Counter LS 100™ Particle Size Analyser


# Question 8

In [120]:
questions.Q8

'Where is the data stored?'

In [149]:
resp_q8 = '\n---\n'.join([r['Q8'] for r in resp if 'Q8' in r.keys() and (len(r['Q8']) > 150 or len(r['Q8']) < 10)])
# resp_q3 = '\n\n'.join([r for r in resp_q3.split('\n\n') if len(r) > 150])

In [150]:
len(resp_q8.split('\n---\n'))

952

In [122]:
print(resp_q8)

The data is stored in the following locations:
                        - Document(page_content='...We dried Sterivex filters using a 3 mL syringe and then capped and stored the filters at -20˚C for DNA laboratory work back at UCLA...')
                        - Document(page_content='...All PCRs included a negative control where molecular grade water replaced the DNA extraction...')
                        - Document(page_
---
Based on the text, the data is stored in a custom-made database that was created by downloading whole and partial fish mitogenome sequences from MitoFish and whole mitogenome sequences from tetrapods from NCBI Organelle Genome Resources. Additionally, the database was supplemented by assembling new sequences in M.M.'s laboratory. As of October 4th, 2014, the database covers approximately 4230 fish species distributed
---
Based on the content of the text, it appears that the data is stored in a document or a set of documents, possibly in a digital format such as a

In [151]:
answer_q8 = llm.ask('List all the databases where researchers stored the sequencing data from experiment basing on provided list. Get also the accession numbers for them if it is possible.',resp_q8)['Answer']

In [152]:
print(answer_q8)

Based on the provided list, the researchers stored the sequencing data from the experiment in the following databases:

1. GenBank (accession numbers: KM282400, KM282461, KM434930, KM435002, KM523268)
2. Barcode of Life Database (BOLD: accession numbers: KM273814, KM282406, KM282467, KM4349)
3. MEGAN (Huson et al.,)
4. CAMERA
5. Genome-to-Genome Distance Calculator
6. Delaware Bay Operational Forecast System
7. Sequence Read Archive study ERP004168
8. CDD database
9. Metacyc database
10. vegan package
11. Phylosift

Note that some of the accession numbers are not available in the provided list, as they are only mentioned in the text as "documents" or "files" without providing any specific accession numbers.


In [153]:
answer_q8_a = llm.ask('List all the databases where researchers stored the sequencing data from experiment basing on provided list. Count number of use of these databases',resp_q8)['Answer']

In [154]:
print(answer_q8_a)

Based on the provided list, the researchers stored the sequencing data from experiment in the following databases:

1. GenBank (mentioned twice)
2. JGI gene object ID (mentioned once)
3. BOLD (mentioned once)
4. MEGAN (mentioned once)

Therefore, there are 4 databases where the researchers stored the sequencing data from experiment.


In [156]:
len([r for r in resp_q8.split('\n---\n') if 'genbank' in r.lower() or 'gen bank' in r.lower()])

81

In [157]:
len([r for r in resp_q8.split('\n---\n') if 'bold' in r.lower() or 'barcode of life' in r.lower()])

37

In [162]:
[r for r in resp_q8.split('\n---\n') if 'uploaded' in r]

['The raw sequence data was uploaded to the NCBI Sequence Read Archive (SRA) as BioProject ID PRJNA704795, BioSample IDs SAMN18055833 –41 and accession numbers SRR13781971–SRR13782030.',
 'The raw sequence reads generated in this study have been uploaded to GenBank NCBI Sequence Read Archive under BioProject PRJNA673533 (SRR15093454-SRR15093473).',
 'The raw sequence data (Riaz and Teleo amplicons) was uploaded to the Sequence Read Archive (SRA) of NCBI under BioProject no. PRJNA616325. The demultiplexing script as well as a shell script used for all analyses described above are available at Zenodo (https://doi.org/10.5281/zenodo.3731310). The',
 "Based on the text, the data is stored in the following locations:\n\n1. Document(page_content='Insights into Microbial Ecology 2 (QIIME2) pipeline v.2019.7 (Bolyen et al.,\\xa0).')\n2. Document(page_content='BARCODE OF LIFE DATABASE (BOLD)')\n3. Database of TRNL sequences created from sequences uploaded to the NCBI database.\n4. Reference dat

In [172]:
q8_stored = [i for i in range(len(content)) if 'stored' in content[i]]

In [253]:
phrase = 'uploaded to'
for k,i in enumerate([i for i in range(len(content)) if phrase in content[i]]):
    c = content[i]
    word_index = c.find(phrase)
    start = word_index
    while c[start:start + 2] != '. ' and start > 0:
        start -= 1
    start = start + 2 if start > 0 else start
    stop = c.find('.',word_index)
    print(k,c[start:stop],'\n---')

0 Sequence files were sorted into separate files, by MID and primer pair, allowing 0 mismatches in the MID and up to 2 in each primer.
Sequences from pyrosequencing are uploaded to NCBI SRA: ERP001563 
---
1 The LotusS2 pipeline offers the advantage of retaining some discarded sequence data when possible with read backmapping and seed extension steps, since using an excessively strict read filter can decrease sensitivity for low-abundance amplicons in the air column by artificially reducing sequencing depth.
Bioinformatics pipeline for plant ITS2 amplicons
Demultiplexed paired end FASTQ files were uploaded to the Multiplex Barcode Research And Visualization Environment (mBRAVE) platform () for QC trimming, filtering, paired end merging, and OTU bin assignment using the following parameters to maximize information content: trim front: 50 bp, trim end: 50 bp, trim length: 550 bp, min QV: 0, min length: 100 bp, max bases with low QV (<20): 75 
---
2 Raw sequence data were uploaded to the 

In [234]:
phrase = 'Unite'
len([i for i in range(len(content)) if phrase in content[i] and 'UNITE' not in content[i]])

312

In [248]:
phrase = 'UNITE'
for k,i in enumerate([i for i in range(len(content)) if phrase in content[i]]):
    c = content[i]
    word_index = c.find(phrase)
    start = word_index
    while c[start:start + 2] != '. ' and start > 0:
        start -= 1
    start = start + 2 if start > 0 else start
    stop = c.find('.',word_index)
    sentence = c[start:stop]
    if 'identifi' not in sentence.lower() and 'reference' not in sentence.lower() and 'classified' not in sentence.lower() and 'taxonomic' not in sentence.lower() and 'against the' not in sentence.lower():
        print(k,sentence,'\n---')

21 The Naïve Bayesian Classifier combined with the UNITE INSD mostly provided a high bootstrap support for these assignments, albeit rarely at the subgeneric levels (Supplemental Table S1) 
---
22 The “assignTaxonomy” function in the DADA2 pipeline was used for assigning amplicon sequence variants (ASVs) to specific sequences in the UNITE fungal database v 
---
25 The ‘dataGeneralizations’ field was used to indicate the identity of OTUs towards the UNITE species hypothesis concept, Sampling sites were included as WKT polygons in the ‘footprintWKT’ field and sampling site names were included in the ‘eventID’ field 
---
27 The taxonomy of ASV was assigned at a 99% sequence identity based on the UNITE v7 database 
---
28 OTUs were picked using the BLAST method and the UNITE dynamic database released on February 2, 2014 (http://unite 
---
30 For fungi and bacteria, we used DADA2 assignTaxonomy() and the publicly available databases UNITE general fasta release 9 
---
31 To characterize the 

In [271]:
phrase = 'Figshare'
for k,i in enumerate([i for i in range(len(content)) if phrase in content[i]]):
    c = content[i]
    word_index = c.find(phrase)
    start = word_index
    while c[start:start + 2] != '. ' and c[start:start + 2] != '.\n' and start > 0:
        start -= 1
    start = start + 2 if start > 0 else start
    stop = c.find('.',word_index)
    sentence = c[start:stop]
    if 'upload' in sentence.lower() or 'store' in sentence.lower() or 'deposit' in sentence.lower():
        print(k,sentence,'\n---')

2 The OTU table, ITS sequence of each OTU, and corresponding metadata are deposited in Figshare 
---
3 The pesticide concentrations, sedimentation rates, and inferred dates can be consulted in the file Stechlin_organohalogene.csv, deposited in Figshare (https://figshare 
---
5 Reference databases for RDP classifier and UTAX were built from the rbcL sequences using the method described in for training of the ITS2 databases; these were then deposited on Figshare, along with the FASTA file described above (https://dx 
---
6 All alignments have been uploaded to Figshare (https://doi 
---
8 Demultiplexed, trimmed, and merged reads (QIIME ready) have also been deposited on Figshare (https://doi 
---


In [None]:
dbs = ['GenBank','BOLD','mBRAVE','Atlas of Living Australia','Mendeley','NCBI','SRA',
        'GitHub','Sequence Read Archive','Figshare','UNITE','Fig share']

In [272]:
711+7+1+1+2+3+5+1

731

|Database|Number of papers|
|:-|:-|
|GenBank|711
|BOLD|7
|mBRAVE|1
|Atlas of Living Australia|1
|Mendeley cloud|2
|GitHub|3
|Figshare|5
|UNITE|1
|**Sum**|731


# Question 9

In [273]:
questions.Q9

'What is the marker name used in experiment?'

In [281]:
resp_q9 = '\n---\n'.join([r['Q9'] for r in resp if 'Q9' in r.keys()])
# resp_q3 = '\n\n'.join([r for r in resp_q3.split('\n\n') if len(r) > 150])

In [282]:
len(resp_q9.split('\n---\n'))

1440

In [276]:
print(resp_q9)

The marker name used in the experiment is not explicitly mentioned in the text. However, based on the description of the experiment, it appears that the researchers used a set of primers specifically designed for metabarcoding eDNA from natural environments with unknown fish composition and abundances in an open ecosystem. These primers were used to target the V4 region of the 16S rRNA gene, which is a commonly used marker for environmental DNA analysis.
---
Based on the text, the marker names used in the experiment are:
                        - trnL (UAA) intron
                        - 12S gene
                        - 16S gene
---
Based on the text, the marker name used in the experiment is "MiFish Universal Teleost 12S primer set" and "MiFish Universal Elasmobranch 12S primer set".
---
Based on the text, the marker name used in the experiment is "mlCOIintF" and "jgHCO2198". These are universal primers modified with a PGM sequencing adaptor, barcodes, and a "GAT" spacer.
---
Base

In [283]:
answer_q9 = llm.ask('List all barcoding markers mentioned on provided list',resp_q9)['Answer']

In [284]:
print(answer_q9)

Based on the provided list, the following barcoding markers are mentioned:
                        - COI
                        - 12S
                        - 16S
                        - ITS1
                        - ITS2
                        - rbcL
                        - ssu
                        - sucrose
                        - pHNBS
                        - ALT
                        - 18S
                        - V8
                        - 50 mL clarified V8 juice
                        - 12S-rDNA
                        - 16S-rDNA
                        - 18S-rDNA
                        - rsf
                        - R0100
                        - C0100
                        - C081.42
                        - C083.26
                        - C079.90
                        - C086.95
                        - C021.84
                        - C022.04
                        - C023.55
                        - C019.85
                        - pan trapp

In [285]:
markers = '''16S
trnL
12S
COI
CO2
rbcL
18S
cytB
ITS1
ITS2
28S
P1
rpoC1
rpoB
matK
trnH-psbA
atpF-atpH
psbK-psbI
ITS4
MiFish-U
MiFish-E
Riaz
TW13
ITS
PDMPO
LSU
MCM7
RPB2
TEF1
23S
32P-dCTP
SSU
ITS5
pHNBS
ALT'''.split()

In [298]:
markers_c = pd.Series()
for marker in markers:
    rest = list(set(markers).difference(set([marker,])))
    markers_c[marker] = len([c for c in resp_q9.split('\n---\n') if marker in c])

  markers_c = pd.Series()


In [299]:
markers_c

16S          147
trnL          34
12S           46
COI          165
CO2            6
rbcL          38
18S          122
cytB           1
ITS1          28
ITS2          71
28S            5
P1             2
rpoC1          1
rpoB           1
matK           5
trnH-psbA      2
atpF-atpH      1
psbK-psbI      1
ITS4          14
MiFish-U       5
MiFish-E       2
Riaz           2
TW13           1
ITS          126
PDMPO          1
LSU            7
MCM7           1
RPB2           1
TEF1           2
23S            3
32P-dCTP       1
SSU           21
ITS5           2
pHNBS          1
ALT            1
dtype: int64

# Question 10

In [300]:
questions.Q10

'What is the reference database used for taxonomical identification?'

In [301]:
resp_q10 = '\n---\n'.join([r['Q10'] for r in resp if 'Q10' in r.keys()])
# resp_q3 = '\n\n'.join([r for r in resp_q3.split('\n\n') if len(r) > 150])

In [304]:
len(resp_q10.split('\n---\n'))

1440

In [302]:
print(resp_q10)

According to the text, the reference database used for taxonomical identification is the "CRUX-generated 12S reference database supplemented with FishCARD reference sequences."
---
The reference database used for taxonomical identification is a custom-made database created by downloading whole and partial fish mitogenome sequences from MitoFish and whole mitogenome sequences from tetrapods from NCBI Organelle Genome Resources. Additionally, the database was supplemented by assembling new sequences in the author's laboratory. As of 4 October 2014, the database covers approximately 4230 fish species distributed across 457
---
Based on the text, the reference database used for taxonomical identification is the GenBank Nucleotide database.
---
Based on the content of the text, the reference database used for taxonomical identification is GenBank.
---
Based on the text, the reference database used for taxonomical identification is "DNA barcodes that enable species identiﬁcation".

Note: DNA

In [313]:
answer_q10 = llm.ask('From every position ont the list get the name of the database.',resp_q10)['Answer']

In [314]:
print(answer_q10)

Sure! Here are the names of the databases mentioned in the list:

1. FishBase
2. GenBank
3. SILVA release v132 references alignment
4. NCBI Taxonomy
5. Greengenes database
6. complete NCBI nucleotide database
7. local plant collection
8. public sequence database
9. custom COI fish database built with fish COI sequences mined from GenBank and Bold
10. Kelpie in silico polymerase chain reaction output fastq files
11. GenBank data set


In [317]:
resp_q10_split = resp_q10.split('\n---\n')

In [358]:
for i,r in enumerate(resp_q10_split):
    if i not in indices and i not in indices_no and i not in indices2:
        print(r,'\n---')

The reference database used for taxonomical identification varies for each tool. Some tools use a custom database, while others use publicly available databases such as the NCBI taxonomy or the SILVA ribosomal RNA database. The specific reference database used for each tool is as follows:

* MEGAN: RefSeq database ver. 66
* MetaPhlAn: Marker set based on clade-specific sequences
* MetaPhyler: 
---
12S rRNA sequences of chordates downloaded from GenBank and supplemented with 12S rRNA sequences of New Zealand native fishes. 
---
Based on the provided context, the reference database used for taxonomical identification in SLIM is the Greengenes database. 
---
Comprehensive and well-curated reference databases are critical for reliable identifications in the context of DNA sequencing and taxonomical identification. However, many such databases remain incomplete and can be difficult to curate. Therefore, new bioinformatic approaches make use of reference-free identification algorithms, emplo

In [375]:
text_rdb = list()
for i,r in enumerate(resp_q10_split):
    if i not in indices and i not in indices_no and i not in indices2:
        text_rdb.append(r)

In [376]:
hand_rdb = '''NCBI RefSeq
GreenGene
BOLD
RDP
GenBank
custom
EMBL
RDP
MIDORI
NemaBase
SILVA
UNITE
NCBI nt
DDBJ
μgreen
aquaDNA
CIBIO-IBI
FishCARD
MitoFish
INSC
Fishbase
BIOCODE
European Nucleotide Archive
PDB
BioAir
MiFish
Diat.barcode
MBIJ
PR2
IUCN
AlgaeBase
DDBJ
MidoFish
MG-RAST
GBIF
MMETSP
PhytoREF
Claident
Protax
WRB
EukRibo
UniProt
HADB
DS-POTAM
Integrated Microbial Ecology
GENCODE
Countryside Survey vegetation
CBD
ITSone
TAIR
NatureServe
NADED
PLANTiTS
db-COI_MBPK
UniPlant
UniProtKB
ISI'''.split('\n')
hand_rdb = np.unique(hand_rdb)

In [347]:
len('reference database used for taxonomic identification is')

55

In [352]:
indices_no = [i for i,r in enumerate(resp_q10_split) if 'there is no' in r.lower()]

In [356]:
len(indices_no) + len(indices) + len(indices2)

1410

In [357]:
len(indices_no), len(indices), len(indices2)

(47, 1344, 19)

In [339]:
indices = [i for i,r in enumerate(resp_q10_split) if 'reference database used for taxonomical identification is' in r]
ref_dbs = list()
for index in indices:
    r = resp_q10_split[index]
    start = r.find('reference database used for taxonomical identification is') + 58
    start = r.find(' ',start) + 1
    ref_dbs.append(r[start:])

In [348]:
indices2 = [i for i,r in enumerate(resp_q10_split) if 'reference database used for taxonomic identification is' in r]
ref_dbs2 = list()
for index in indices2:
    r = resp_q10_split[index]
    start = r.find('reference database used for taxonomic identification is') + 56
    start = r.find(' ',start) + 1
    ref_dbs2.append(r[start:])

In [360]:
ref_dbs

['"CRUX-generated 12S reference database supplemented with FishCARD reference sequences."',
 "custom-made database created by downloading whole and partial fish mitogenome sequences from MitoFish and whole mitogenome sequences from tetrapods from NCBI Organelle Genome Resources. Additionally, the database was supplemented by assembling new sequences in the author's laboratory. As of 4 October 2014, the database covers approximately 4230 fish species distributed across 457",
 'GenBank Nucleotide database.',
 'Based on the content of the text, the reference database used for taxonomical identification is GenBank.',
 'barcodes that enable species identiﬁcation".\n\nNote: DNA barcoding is a technique used to identify species based on a short DNA sequence, called a barcode, that is unique to each species. This approach allows for rapid and accurate identification of species, which is particularly useful for studying biodiversity and monitoring the presence of species in different environmen

In [377]:
all_dbs = pd.Series(['BOLD'])
for rd in ref_dbs + ref_dbs2 + text_rdb:
    for hrd in hand_rdb:
        if hrd.lower() in rd.lower():
            all_dbs[all_dbs.size] = hrd


In [387]:
all_dbs_df = pd.DataFrame(np.unique(all_dbs,return_counts=True)).T
all_dbs_df.index = all_dbs_df[0]
all_dbs_df.drop([0],inplace=True,axis=1)
all_dbs_df.columns = ['counts',]

In [389]:
all_dbs_df.to_excel('rdb_counts.xlsx')

In [390]:
len(resp)

1607