# Introduction to Dochap-Tool
## Abstract:
- Simple tool created for comparing exons and domains between transcripts

## Requirements:
- Python 3.6

## Installation
**Manually from source**
```
git clone https://github.com/nitzanel/dochap_tool.git
cd dochap_tool
python setup.py install --user
```
**Using pip**
```
pip install dochap_tool
```

# Downloading species data

## Downloading from UCSC

In [None]:
from dochap_tool.ucsc_utils import downloader as ucsc_downloader
# Download the UCSC tables of Mus musculus into the folder 'data'.
# The first argument is the destination folder, the second is the species name.
ucsc_downloader.download_specie_from_ucsc('data', 'Mus_musculus')

## Downloading from NCBI

In [None]:
from dochap_tool.ncbi_utils import downloader as ncbi_downloader
# Download the NCBI GenBank data of Mus musculus into the folder 'data'.
# The first argument is the destination folder, the second is the species name.
ncbi_downloader.download_specie_from_ncbi('data', 'Mus_musculus')

### Updating data from NCBI

In [None]:
from dochap_tool.ncbi_utils import updater
# Check for updates of the NCBI GenBank data stored in the folder 'data'.
updater.check_for_updates('data')

# Creating the database

### Generating all the tables for a given species

In [None]:
# Create the Mus_musculus database.
from dochap_tool.db_utils import create_db
# create_db needs a folder for storing the db files and a species to create a db for.
# Here the Mus musculus database is created in the folder 'data',
# from the files found in 'data'.
create_db.create_db('data', 'Mus_musculus')

### Generating specific tables

In [None]:
# Create only the Mus_musculus known_gene table in the database.
from dochap_tool.db_utils import create_db
# `tables` is passed as a list and must contain strings from:
# 'known_gene', 'genbank', 'alias'
# Here we generate only the known_gene table.
create_db.create_db('data', 'Mus_musculus', tables=['known_gene'])

# Querying the database

## Getting aliases of gene symbols

In [None]:
from dochap_tool.common_utils import utils
from dochap_tool.compare_utils import compare_exons

# Gene symbol whose aliases are looked up.
symbol = 'nfkb2'

# Get a connection object for the Mus_musculus data in the folder 'data'
# and run the alias query inside the connection's `with` block.
conn = utils.get_connection_object('data', 'Mus_musculus')
with conn:
    symbols_by_ids = compare_exons.get_gene_aliases_of_gene_symbol(conn, symbol)
    formatted_aliases = utils.format_and_color(symbols_by_ids)
    print(formatted_aliases)

## Getting transcript ids of gene symbols

In [None]:
from dochap_tool.compare_utils import compare_exons
from dochap_tool.common_utils import utils

conn = utils.get_connection_object('data', 'Mus_musculus')
with conn:
    # Short local alias for the query function used three times below.
    find_ids = compare_exons.get_transcript_ids_of_gene_symbol

    # Two existing gene symbols, followed by a symbol that is not a real gene.
    t_ids1 = find_ids(conn, 'Xkr5')
    t_ids2 = find_ids(conn, 'Xkr4')
    t_ids3 = find_ids(conn, 'noactualgene')

    # Print each result next to the name of the variable that holds it.
    labelled_results = (
        ('t_ids1:', t_ids1),
        ('t_ids2:', t_ids2),
        ('t_ids3:', t_ids3),
    )
    for label, ids in labelled_results:
        print(label, utils.format_and_color(ids))

## Getting exons from the database

### By transcript id

In [None]:
from dochap_tool.compare_utils import compare_exons
from dochap_tool.common_utils import utils
# Fetch the exons of transcript 'uc009kzx.3' for Mus_musculus,
# using the data stored in the folder 'data'.
exons = compare_exons.get_exons_from_transcript_id('data','Mus_musculus','uc009kzx.3')
# Format the result for display and print it.
print(utils.format_and_color(exons))

### By gene symbol

In [None]:
from dochap_tool.compare_utils import compare_exons
from dochap_tool.common_utils import utils
# Fetch the exons associated with the gene symbol 'NFKB2' for Mus_musculus,
# using the data stored in the folder 'data'.
exons = compare_exons.get_exons_from_gene_symbol('data','Mus_musculus','NFKB2')
# Format the result for display and print it.
print(utils.format_and_color(exons))

## Getting domains from the database

In [None]:
from dochap_tool.compare_utils import compare_exons
from dochap_tool.common_utils import utils
# Fetch the domains associated with the gene symbol 'NFKB2' for Mus_musculus,
# using the data stored in the folder 'data'.
domains = compare_exons.get_domains_of_gene_symbol('data', 'Mus_musculus', 'NFKB2')
# Format the result for display and print it.
print(utils.format_and_color(domains))

# Parsing gtf files

### Loading from file

In [None]:
from dochap_tool.gtf_utils import parser as gtf_parser
from dochap_tool.common_utils import utils
# Parse the GTF file; the result is used below as a mapping keyed by transcript id.
transcripts = gtf_parser.parse_gtf_file('data/gtf_files/transcripts.gtf')
items = list(transcripts.items())
# Show only the first 2 transcripts to keep the output short.
sub_items = dict(items[:2])
print(utils.format_and_color(sub_items))
# Show one specific transcript, looked up by its id.
interesting_item = transcripts['CUFF.36899.4']
print(utils.format_and_color(interesting_item))

### Getting transcripts of the same genes

In [None]:
from dochap_tool.gtf_utils import parser as gtf_parser
from dochap_tool.common_utils import utils
# Parse the GTF file that holds the user's transcripts.
transcripts = gtf_parser.parse_gtf_file('data/gtf_files/transcripts.gtf')
# Transcript ids used to select the matching transcripts from the GTF data.
some_transcripts_ids = ['uc008hst.2']
# Pass the data parsed above; this cell defines no other GTF variable.
user_transcripts = gtf_parser.get_transcripts_like_ids(transcripts, some_transcripts_ids)
# Print the ids of the selected transcripts first, then their full content.
print(utils.format_and_color(list(user_transcripts.keys())))
print(utils.format_and_color(user_transcripts))

## Scoring transcripts similarity

In [None]:
from dochap_tool.common_utils import utils
from dochap_tool.compare_utils import compare_exons
from dochap_tool.gtf_utils import parser as gtf_parser

gene_symbol = 'NFKB2'

# Database side: domains and exons recorded for the gene symbol.
# `domains` is fetched here but not used further in this cell.
domains = compare_exons.get_domains_of_gene_symbol('data', 'Mus_musculus', gene_symbol)
db_transcripts = compare_exons.get_exons_from_gene_symbol('data', 'Mus_musculus', gene_symbol)

# User side: parse the GTF file and pick one transcript by its id.
user_transcripts = gtf_parser.parse_gtf_file('data/gtf_files/transcripts.gtf')
user_transcript = user_transcripts['CUFF.36899.4']

# Score the user transcript against the database transcripts, then ask for
# the best match.
matches_score = compare_exons.score_matches(user_transcript, db_transcripts)
best_match = compare_exons.get_best_match(user_transcript, db_transcripts)

# Print the scores first, then the best match.
for result in (matches_score, best_match):
    print(utils.format_and_color(result))

## Getting intersections between exons and domains

In [None]:
from dochap_tool.compare_utils import compare_exons
from dochap_tool.common_utils import utils
from dochap_tool.gtf_utils import parser as gtf_parser
# Single source of truth for the gene symbol used by every query in this cell.
gene_symbol = 'NFKB2'
domains = compare_exons.get_domains_of_gene_symbol('data', 'Mus_musculus', gene_symbol)
# Parse the user's GTF file and pick one transcript by its id.
user_transcripts = gtf_parser.parse_gtf_file('data/gtf_files/transcripts.gtf')
user_transcript = user_transcripts['CUFF.36899.4']
db_transcripts = compare_exons.get_exons_from_gene_symbol('data', 'Mus_musculus', gene_symbol)
# The first element of the best match is used as the key into db_transcripts.
db_best_match_id = compare_exons.get_best_match(user_transcript, db_transcripts)[0]
db_transcript = db_transcripts[db_best_match_id]
# Intersect the best-matching database transcript, the user transcript and the domains.
intersections = compare_exons.get_intersections_result(db_transcript, user_transcript, domains)
print(utils.format_and_color(intersections))

# Displaying data and comparing

## Drawing exons and domains

In [None]:
from dochap_tool.draw_utils import draw_tool
from dochap_tool.common_utils import utils
from dochap_tool.gtf_utils import parser as gtf_parser
from dochap_tool.compare_utils import compare_exons
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import SVG, display
gene_symbol = 'nfkb2'
# Exons of the gene taken from the database; the result is used as a mapping
# whose keys are passed on to select the user's transcripts.
exons_variants = compare_exons.get_exons_from_gene_symbol('data', 'Mus_musculus', gene_symbol)
gtf_data = gtf_parser.parse_gtf_file('data/gtf_files/transcripts.gtf')
# Select the user transcripts that correspond to the database transcript ids.
user_transcripts = gtf_parser.get_transcripts_like_ids(gtf_data, exons_variants.keys())

#domains_variants = compare_exons.get_domains_of_gene_symbol('data','Mus_musculus',gene_symbol)
# Draw the user transcripts in blue and the database exons in purple,
# then render the resulting SVG inline.
svg = draw_tool.draw_combination(gene_symbol, user_transcripts, 'blue', exons_variants, 'purple')
display(SVG(data=svg))