<a href="https://colab.research.google.com/github/evolu-tion/Comparative-genomics/blob/main/Genome_conversion_using_comparative_genomic_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tutorial for genome conversion using comparative genomic approach

The tutorial consists of the following steps:
1. Download and install packages/tools, and prepare the genomic information
2. Find orthologous proteins using BLASTp
3. Process the data based on double best hit (DBH) BLASTp

# 1. Download and install packages/tools and genomic information preparation

In [1]:
# Install required python packages
!pip install pandas
!pip install seaborn
!pip install plotly==5.3.1

Collecting plotly==5.3.1
  Downloading plotly-5.3.1-py2.py3-none-any.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 14 kB/s 
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
  Attempting uninstall: plotly
    Found existing installation: plotly 4.4.1
    Uninstalling plotly-4.4.1:
      Successfully uninstalled plotly-4.4.1
Successfully installed plotly-5.3.1 tenacity-8.0.1


In [2]:
# Install NCBI-BLAST packages
!mkdir -p required_package
!wget --quiet https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.12.0+-x64-linux.tar.gz -O required_package/ncbi-blast-2.12.0+-x64-linux.tar.gz
!tar xzf required_package/ncbi-blast-2.12.0+-x64-linux.tar.gz --directory required_package
!mv required_package/ncbi-blast-2.12.0+/bin .

!bin/blastp -version

blastp: 2.12.0+
 Package: blast 2.12.0, build Jun  4 2021 03:22:54


Download genomics data including whole protein sequence and annotation of species A and B

In [3]:
# Attach Google Drive (including shared drives) under /content/drive
# so the genome files can be copied from it in the next step.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [4]:
# Download species/genome version A and B
!mkdir -p input
!cp "/content/drive/Shareddrives/BML/Data-resource/For-research/Omics/Genome/Manihot esculenta/AM560_v8/Genome_annotation/Database/Phytozome/Mesculenta_671_v8.1.protein.fa.gz" input
!cp "/content/drive/Shareddrives/BML/Data-resource/For-research/Omics/Genome/Manihot esculenta/AM560_v7/Databases/Phytozome/Mesculenta_520_v7.1.protein.fa.gz" input
!cp "/content/drive/Shareddrives/BML/Data-resource/For-research/Omics/Genome/Manihot esculenta/AM560_v8/Genome_annotation/Database/Phytozome/Mesculenta_671_v8.1.annotation_info.txt" input
!cp "/content/drive/Shareddrives/BML/Data-resource/For-research/Omics/Genome/Manihot esculenta/AM560_v7/Databases/Phytozome/Mesculenta_520_v7.1.annotation_info.txt" input

!gunzip input/Mesculenta_671_v8.1.protein.fa.gz
!gunzip input/Mesculenta_520_v7.1.protein.fa.gz

In [5]:
# ---- Settings shared by every BLAST step below ----

# Protein FASTA files: "old" = v7.1 annotation, "new" = v8.1 annotation
input_file_old = 'input/Mesculenta_520_v7.1.protein.fa'
input_file_new = 'input/Mesculenta_671_v8.1.protein.fa'

# Annotation tables of the same two genome versions
input_file_annotation_old = 'input/Mesculenta_520_v7.1.annotation_info.txt'
input_file_annotation_new = 'input/Mesculenta_671_v8.1.annotation_info.txt'

# BLAST executable, database type passed to makeblastdb, and thread count
blast = 'bin/blastp'
blast_type = 'prot'
num_core = 2

# 2. First comparison using BLAST
## 2.1 Run 1st BLAST

In [None]:
# Create database index sequence
!makeblastdb -in $input_file_new -dbtype $blast_type -out input/new -title new

# Runing BLAST
!$blast -db input/new \
        -query $input_file_old \
        -evalue 1e-10 \
        -max_target_seqs 1  \
        -num_threads $num_core  \
        -outfmt '6 std qcovs' \
        -out out_blastp_db_new__query_old.txt

In [6]:
# The result from 1st and 2nd BLAST 

!cp "/content/drive/Shareddrives/BML/Data-resource/For-research/Omics/Genome/Manihot esculenta/Link/MesV7_MesV8/out_blastp_db_new__query_old.txt" .
!cp "/content/drive/Shareddrives/BML/Data-resource/For-research/Omics/Genome/Manihot esculenta/Link/MesV7_MesV8/out_blastp_db_old__query_new.txt" .

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px

# Column labels for the headerless tabular BLAST output.
# NOTE(review): the BLAST commands in this notebook request `qcovs` in -outfmt,
# while the last column is labelled 'qcovhsp' here -- confirm which coverage metric is meant.
blast_columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch',
                 'gapopen', 'qstart', 'qend', 'sstart', 'send',
                 'evalue', 'bitscore', 'qcovhsp']

first_BLASTp = pd.read_table('out_blastp_db_new__query_old.txt',
                             header=None,
                             names=blast_columns)

# Dense-rank the hits of every query by bitscore (highest bitscore -> rank 1)
first_BLASTp['rank'] = (
    first_BLASTp
    .groupby('qseqid')['bitscore']
    .rank(method='dense', ascending=False)
)

# Keep only the top-ranked hit(s) of each query; ties are all retained
first_BLASTp = first_BLASTp.loc[first_BLASTp['rank'].eq(1)]

num_query = len(first_BLASTp.qseqid.unique())
num_subject = len(first_BLASTp.sseqid.unique())
print('First BLAST result\nNumber of query sequences (old): %d\nNumber of subject sequences (new): %d' %
      (num_query, num_subject))

first_BLASTp

First BLAST result
Number of query sequences (old): 65166
Number of subject sequences (new): 35865


Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,qcovhsp,rank
0,Manes.S022644.1.p,Manes.13G065500.1.p,100.000,110,0,0,1,110,347,456,1.410000e-72,223.0,100,1.0
1,Manes.S022744.1.p,Manes.13G065600.1.p,95.798,119,5,0,1,119,1,119,2.900000e-66,209.0,98,1.0
2,Manes.S002400.1.p,Manes.17G025599.1.p,100.000,512,0,0,1,512,1,512,0.000000e+00,1053.0,100,1.0
3,Manes.S003302.1.p,Manes.17G025997.1.p,100.000,510,0,0,1,510,1,510,0.000000e+00,1046.0,100,1.0
4,Manes.S003202.1.p,Manes.02G029101.1.p,32.476,311,121,6,8,250,178,467,2.970000e-40,147.0,96,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66826,Manes.S022480.1.p,Manes.S022415.1.p,100.000,394,0,0,1,394,1,394,0.000000e+00,809.0,100,1.0
66827,Manes.S022180.1.p,Manes.S022415.1.p,100.000,394,0,0,1,394,1,394,0.000000e+00,809.0,100,1.0
66828,Manes.S022780.1.p,Manes.S022415.1.p,100.000,394,0,0,1,394,1,394,0.000000e+00,809.0,100,1.0
66829,Manes.S022080.1.p,Manes.S022415.1.p,100.000,394,0,0,1,394,1,394,0.000000e+00,809.0,100,1.0


## 2.2 Filtering of 1st BLAST

In [8]:
# Interactive scatter of query coverage vs. percent identity for the best hits of
# the 1st BLAST, with a marginal histogram on each axis.
# Static alternative: sns.jointplot(data=first_BLASTp, x='qcovhsp', y='pident')

# Show both sequence IDs when hovering over a point
hover_columns = dict(qseqid=True, sseqid=True)

fig = px.scatter(first_BLASTp,
                 x='qcovhsp', y='pident',
                 marginal_x='histogram', marginal_y='histogram',
                 hover_data=hover_columns,
                 width=500, height=500)
fig.show()

In [9]:
# For every (identity, coverage) cut-off pair on a 5 % grid, count the best hits
# whose pident AND qcovhsp are both >= the cut-offs.
# The records are collected in a list and turned into a DataFrame once:
# growing a frame with DataFrame.append inside a loop is quadratic, and
# DataFrame.append no longer exists in pandas >= 2.0.
cutoffs = range(0, 101, 5)
first_BLASTp_filter_criteria = pd.DataFrame(
    [
        {
            'pident': pident,
            'qcovhsp': qcovhsp,
            'num_genes': int(((first_BLASTp.pident >= pident) & (first_BLASTp.qcovhsp >= qcovhsp)).sum()),
        }
        for pident in cutoffs
        for qcovhsp in cutoffs
    ],
    columns=['pident', 'qcovhsp', 'num_genes'],
)

# Rows = identity cut-off, columns = coverage cut-off, cells = number of surviving hits
df = first_BLASTp_filter_criteria.pivot(index='pident', columns='qcovhsp', values='num_genes')
fig = px.imshow(df,
               labels=dict(x='> % Query coverage grather than', y='> % Identity grather than', color='Number of genes'),
               width=500, height=500)
fig.show()

In [10]:
# Keep only the best hits with query coverage > 80 and identity > 80 (both strict).
first_BLASTp = first_BLASTp[(first_BLASTp.qcovhsp > 80) & (first_BLASTp.pident > 80)]
# In the 1st BLAST the queries are the OLD proteins and the subjects the NEW ones,
# so the labels are (old)/(new) -- they were previously printed the wrong way round.
print('First BLAST result\nNumber of query sequences (old): %d\nNumber of subject sequences (new): %d' % (len(first_BLASTp.qseqid.unique()), len(first_BLASTp.sseqid.unique())))

First BLAST result
Number of query sequences (new): 63162
Number of subject sequences (old): 35320


# 3. Second comparison using BLAST

## 3.1 Running BLAST

In [11]:
# Create database index sequence
!makeblastdb -in $input_file_old -dbtype $blast_type -out input/old -title old

# Runing BLAST
!$blast -db input/old \
        -query $input_file_new \
        -evalue 1e-10 \
        -max_target_seqs 1 \
        -num_threads $num_core  \
        -outfmt '6 std qcovs' \
        -out out_blastn_db_old__query_new.txt

/bin/bash: makeblastdb: command not found
BLAST Database error: No alias or index file found for protein database [input/old] in search path [/content::]


In [12]:
# Load the tabular result of the 2nd BLAST (query = new proteins, db = old proteins).
# The file has no header row, so the column labels are supplied here.
second_blast_columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
                        'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qcovhsp']
second_BLASTp = pd.read_table('out_blastp_db_old__query_new.txt',
                              header=None,
                              names=second_blast_columns)

# Dense-rank the hits of every query by bitscore (highest bitscore -> rank 1)
second_BLASTp['rank'] = (
    second_BLASTp
    .groupby('qseqid')['bitscore']
    .rank(method='dense', ascending=False)
)

# Keep only the top-ranked hit(s) of each query; ties are all retained
second_BLASTp = second_BLASTp.loc[second_BLASTp['rank'].eq(1)]

num_query = len(second_BLASTp.qseqid.unique())
num_subject = len(second_BLASTp.sseqid.unique())
print('Second BLAST result\nNumber of query sequences (new): %d\nNumber of subject sequences (old): %d' % (num_query, num_subject))

second_BLASTp

Second BLAST result
Number of query sequences (new): 57878
Number of subject sequences (old): 35657


Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,qcovhsp,rank
0,Manes.14G015000.1.p,Manes.14G015000.1.p,100.000,794,0,0,1,794,1,794,0.000000e+00,1650.0,100,1.0
1,Manes.14G124300.1.p,Manes.14G124300.1.p,100.000,309,0,0,1,309,1,309,0.000000e+00,637.0,100,1.0
2,Manes.14G124300.3.p,Manes.14G124300.1.p,99.676,309,0,1,1,308,1,309,0.000000e+00,630.0,100,1.0
3,Manes.14G124300.4.p,Manes.14G124300.1.p,100.000,298,0,0,1,298,12,309,0.000000e+00,614.0,100,1.0
4,Manes.14G124300.2.p,Manes.14G124300.2.p,100.000,294,0,0,1,294,1,294,0.000000e+00,603.0,100,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58939,Manes.15G151200.1.p,Manes.15G151200.1.p,100.000,414,0,0,1,414,1,414,0.000000e+00,857.0,100,1.0
58940,Manes.15G133600.1.p,Manes.15G133600.1.p,100.000,176,0,0,1,176,1,176,3.240000e-130,362.0,100,1.0
58941,Manes.15G189800.1.p,Manes.15G189800.1.p,100.000,691,0,0,1,691,1,691,0.000000e+00,1424.0,100,1.0
58942,Manes.15G064500.1.p,Manes.15G064500.1.p,100.000,528,0,0,1,528,1,528,0.000000e+00,1103.0,100,1.0


## 3.2 Filtering BLAST result

In [13]:
# Interactive scatter of query coverage vs. percent identity for the best hits of
# the 2nd BLAST, with a marginal histogram on each axis.
# Static alternative: sns.jointplot(data=second_BLASTp, x='qcovhsp', y='pident')

# Show both sequence IDs when hovering over a point
hover_columns = dict(qseqid=True, sseqid=True)

fig = px.scatter(second_BLASTp,
                 x='qcovhsp', y='pident',
                 marginal_x='histogram', marginal_y='histogram',
                 hover_data=hover_columns,
                 width=500, height=500)
fig.show()

In [14]:
# For every (identity, coverage) cut-off pair on a 5 % grid, count the best hits
# whose pident AND qcovhsp are both >= the cut-offs.
# The records are collected in a list and turned into a DataFrame once:
# growing a frame with DataFrame.append inside a loop is quadratic, and
# DataFrame.append no longer exists in pandas >= 2.0.
cutoffs = range(0, 101, 5)
second_BLASTp_filter_criteria = pd.DataFrame(
    [
        {
            'pident': pident,
            'qcovhsp': qcovhsp,
            'num_genes': int(((second_BLASTp.pident >= pident) & (second_BLASTp.qcovhsp >= qcovhsp)).sum()),
        }
        for pident in cutoffs
        for qcovhsp in cutoffs
    ],
    columns=['pident', 'qcovhsp', 'num_genes'],
)

# Rows = identity cut-off, columns = coverage cut-off, cells = number of surviving hits
df = second_BLASTp_filter_criteria.pivot(index='pident', columns='qcovhsp', values='num_genes')
fig = px.imshow(df,
               labels=dict(x='% Query coverage grather than', y='> % Identity grather than', color='Number of genes'),
               width=500,
               height=500)
fig.show()

In [15]:
# Keep only the best hits with query coverage > 80 and identity > 80 (both strict).
second_BLASTp = second_BLASTp[(second_BLASTp.qcovhsp > 80) & (second_BLASTp.pident > 80)]
# In the 2nd BLAST the queries are the NEW proteins and the subjects the OLD ones,
# so the labels are (new)/(old) -- they were previously printed the wrong way round.
print('Second BLAST result\nNumber of query sequences (new): %d\nNumber of subject sequences (old): %d' % (len(second_BLASTp.qseqid.unique()), len(second_BLASTp.sseqid.unique())))

Second BLAST result
Number of query sequences (old): 55806
Number of subject sequences (new): 35072



# 4. Double best hit (DBH) filtering

In [16]:
# Protein -> gene lookup tables, one per genome version, read from the annotation
# files. Only the peptide and locus columns are kept and they are renamed with a
# new_/old_ prefix so the two tables can be told apart after merging.
link_protein_gene_new = (
    pd.read_table(input_file_annotation_new)
    .filter(items=['peptideName', 'locusName'])
    .rename(columns={'peptideName': 'new_protein_id', 'locusName': 'new_gene_id'})
)

link_protein_gene_old = (
    pd.read_table(input_file_annotation_old)
    .filter(items=['peptideName', 'locusName'])
    .rename(columns={'peptideName': 'old_protein_id', 'locusName': 'old_gene_id'})
)

link_protein_gene_new

Unnamed: 0,new_protein_id,new_gene_id
0,Manes.01G000031.1.p,Manes.01G000031
1,Manes.01G000031.2.p,Manes.01G000031
2,Manes.01G000031.3.p,Manes.01G000031
3,Manes.01G000062.1.p,Manes.01G000062
4,Manes.01G000093.1.p,Manes.01G000093
...,...,...
59146,Manes.S095508.1.p,Manes.S095508
59147,Manes.S095525.1.p,Manes.S095525
59148,Manes.S095625.1.p,Manes.S095625
59149,Manes.S095725.1.p,Manes.S095725


In [17]:
# Unique (query, subject) pairs of each BLAST direction
first_pairs = first_BLASTp.filter(['qseqid', 'sseqid']).drop_duplicates()
second_pairs = second_BLASTp.filter(['qseqid', 'sseqid']).drop_duplicates()

# Pair every 1st-BLAST hit (old -> new) with the 2nd-BLAST hits (new -> old) that
# point back to the same old protein, flag the pairs that are reciprocal (DBH),
# and attach the gene IDs of both genome versions.
DBH_blast_result = (
    first_pairs
    .merge(second_pairs,
           how='inner',
           left_on='qseqid',    # old protein = query of the 1st BLAST
           right_on='sseqid',   # old protein = subject of the 2nd BLAST
           suffixes=['_1st_BLASTp', '_2nd_BLASTp'])
    # Reciprocal when the new protein hit in the 1st BLAST is the query of the 2nd
    .assign(DBH=lambda hits: (
        (hits.sseqid_1st_BLASTp == hits.qseqid_2nd_BLASTp)
        & (hits.qseqid_1st_BLASTp == hits.sseqid_2nd_BLASTp)
    ))
    .sort_values('DBH', ascending=False)
    .filter(['qseqid_1st_BLASTp', 'sseqid_1st_BLASTp', 'DBH'])
    .rename(columns={'qseqid_1st_BLASTp': 'old_protein_id',
                     'sseqid_1st_BLASTp': 'new_protein_id'})
    .merge(link_protein_gene_old, how='inner', on='old_protein_id')
    .merge(link_protein_gene_new, how='inner', on='new_protein_id')
    # True when the gene ID is the same in both genome versions
    .assign(gene_match=lambda hits: hits.new_gene_id == hits.old_gene_id)
)

In [18]:
# Display only the reciprocal (double best hit) protein pairs
DBH_blast_result[DBH_blast_result['DBH']]

Unnamed: 0,old_protein_id,new_protein_id,DBH,old_gene_id,new_gene_id,gene_match
0,Manes.S022580.1.p,Manes.S022415.1.p,True,Manes.S022580,Manes.S022415,False
87,Manes.14G030000.1.p,Manes.14G030000.1.p,True,Manes.14G030000,Manes.14G030000,True
88,Manes.14G002500.1.p,Manes.14G002500.1.p,True,Manes.14G002500,Manes.14G002500,True
89,Manes.14G052600.1.p,Manes.14G052600.1.p,True,Manes.14G052600,Manes.14G052600,True
91,Manes.14G143300.1.p,Manes.14G143300.1.p,True,Manes.14G143300,Manes.14G143300,True
...,...,...,...,...,...,...
81247,Manes.09G097500.2.p,Manes.09G097500.2.p,True,Manes.09G097500,Manes.09G097500,True
81248,Manes.11G042500.2.p,Manes.11G042500.2.p,True,Manes.11G042500,Manes.11G042500,True
81250,Manes.09G021700.1.p,Manes.09G021650.3.p,True,Manes.09G021700,Manes.09G021650,False
81251,Manes.08G066900.1.p,Manes.08G066900.1.p,True,Manes.08G066900,Manes.08G066900,True


In [19]:
# Keep the reciprocal pairs only and save them as a tab-separated table
final_DBH_blast_result = DBH_blast_result.loc[DBH_blast_result['DBH']]
final_DBH_blast_result.to_csv('DBH_BLASTp_old_new.txt', index=False, sep='\t')

# Report how many distinct genes of each genome version take part in a DBH pair
num_old_genes = len(final_DBH_blast_result.old_gene_id.unique())
num_new_genes = len(final_DBH_blast_result.new_gene_id.unique())
print('DBH BLAST result\nNumber of old gene ID: %d\nNumber of new gene ID: %d' %
      (num_old_genes, num_new_genes))

DBH BLAST result
Number of old gene ID: 27889
Number of new gene ID: 27868


In [20]:
# Build one long table of (old protein, new protein) pairs labelled by the evidence
# supporting them: reciprocal hit (DBH), 1st BLAST hit, or 2nd BLAST hit.
DBH = final_DBH_blast_result \
  .filter(['old_protein_id', 'new_protein_id']) \
  .drop_duplicates() \
  .assign(classify='DBH')
# 1st BLAST: query = old protein, subject = new protein
first = first_BLASTp \
  .rename(columns={'qseqid': 'old_protein_id', 'sseqid': 'new_protein_id'}) \
  .filter(['old_protein_id', 'new_protein_id']).drop_duplicates() \
  .assign(classify='FirstHit')
# 2nd BLAST: query = new protein, subject = old protein
second = second_BLASTp \
  .rename(columns={'qseqid': 'new_protein_id', 'sseqid': 'old_protein_id'}) \
  .filter(['old_protein_id', 'new_protein_id']).drop_duplicates() \
  .assign(classify='SecondHit')
# Stack the three tables with pd.concat: DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
# `value` is a constant marker that becomes the cell content when the table is pivoted.
# NOTE: the name `all` shadows the Python builtin; it is kept because later cells
# refer to this variable by that name.
all = pd.concat([DBH, first, second], ignore_index=True) \
  .assign(value = 1)
all

Unnamed: 0,old_protein_id,new_protein_id,classify,value
0,Manes.S022580.1.p,Manes.S022415.1.p,DBH,1
1,Manes.14G030000.1.p,Manes.14G030000.1.p,DBH,1
2,Manes.14G002500.1.p,Manes.14G002500.1.p,DBH,1
3,Manes.14G052600.1.p,Manes.14G052600.1.p,DBH,1
4,Manes.14G143300.1.p,Manes.14G143300.1.p,DBH,1
...,...,...,...,...
152991,Manes.15G044800.1.p,Manes.15G044800.1.p,SecondHit,1
152992,Manes.15G151200.1.p,Manes.15G151200.1.p,SecondHit,1
152993,Manes.15G133600.1.p,Manes.15G133600.1.p,SecondHit,1
152994,Manes.15G189800.1.p,Manes.15G189800.1.p,SecondHit,1


In [21]:
# Reshape to one row per (old protein, new protein) pair with one column per
# evidence class (DBH / FirstHit / SecondHit), then attach the gene IDs of both
# genome versions and flag pairs whose gene ID is unchanged.
# NOTE: this cell overwrites `all` with its pivoted form, so it must be run exactly
# once after the cell that builds the long table.
all = (
    all
    .pivot(index=['old_protein_id', 'new_protein_id'],
           columns='classify',
           values='value')
    .reset_index()
    .merge(link_protein_gene_old, how='inner', on='old_protein_id')
    .merge(link_protein_gene_new, how='inner', on='new_protein_id')
    .assign(gene_match=lambda pairs: pairs.new_gene_id == pairs.old_gene_id)
)

all.to_csv('DBH_BLASTp_old_new_2.txt', index=False, sep='\t')
all

Unnamed: 0,old_protein_id,new_protein_id,DBH,FirstHit,SecondHit,old_gene_id,new_gene_id,gene_match
0,Manes.01G000150.1.p,Manes.17G113600.1.p,,1.0,,Manes.01G000150,Manes.17G113600,False
1,Manes.01G000150.2.p,Manes.17G113600.1.p,,1.0,,Manes.01G000150,Manes.17G113600,False
2,Manes.17G113600.1.p,Manes.17G113600.1.p,1.0,1.0,1.0,Manes.17G113600,Manes.17G113600,True
3,Manes.01G000300.4.p,Manes.01G000300.4.p,1.0,1.0,1.0,Manes.01G000300,Manes.01G000300,True
4,Manes.01G000300.4.p,Manes.01G000300.6.p,,,1.0,Manes.01G000300,Manes.01G000300,True
...,...,...,...,...,...,...,...,...
84935,Manes.S111700.1.p,Manes.04G084851.1.p,1.0,1.0,1.0,Manes.S111700,Manes.04G084851,False
84936,Manes.S111702.1.p,Manes.10G108981.1.p,,,1.0,Manes.S111702,Manes.10G108981,False
84937,Manes.S112300.1.p,Manes.07G080422.1.p,,,1.0,Manes.S112300,Manes.07G080422,False
84938,Manes.S112300.1.p,Manes.07G080455.1.p,,,1.0,Manes.S112300,Manes.07G080455,False


In [22]:
# Summarise the protein pairs per evidence class. The statement had been commented
# out although the cell's recorded output comes from it; it is restored so that a
# fresh run reproduces that output.
# FirstHit / SecondHit are reported as the pairs found ONLY in that direction,
# i.e. after subtracting the reciprocal (DBH) pairs.
num_dbh = all['DBH'].sum()
print('Number of orthologue proteins based on DBH: %d, FirstHit: %d, and SecondHit: %d' % (num_dbh, all['FirstHit'].sum() - num_dbh, all['SecondHit'].sum() - num_dbh))

Number of orthologue proteins based on DBH: 34028, FirstHit: 29134, and SecondHit: 21778
