In [311]:
import pandas as pd
import numpy as np
from numbers import Number
from Bio.Align import AlignInfo, MultipleSeqAlignment
from Bio import AlignIO, Alphabet, SeqRecord, Seq, SubsMat
from datetime import datetime as dt

# Sequence Similarity Analysis
----
This notebook is a follow-up of the [sequence similarity demo](./20200706_seq_sim.ipynb). In this analysis, we will answer the question:

_How does the primary sequence of TMPRSS2 differ between species that one would encounter in a farm environment?_

It specifically provides a solution for the below analysis:

* Repeat analysis for each of the other domestic species (dog, horse, chicken, etc.)
* Generate a "generalized PSSM" for the other types of penalized polymorphisms, such as `acidic -> basic`, `bulky -> small`, `aromatic -> non-aromatic`, etc.

# Pseudocode summary
----

## Analysis Pipeline Overview

1. Read in all aligned sequences in FASTA file
2. Filter aligned sequences to only species of interest
3. **Optimal**: somehow remove proteins that are obviously not TMPRSS2
    - This would be some sort of function
4. Generate a mapping of comparisons, e.g. `[['human', 'mouse'], ['human', 'cattle'], ...]`
5. Get consensus sequence for species with more than 1 sequence, using function `get_consensus`
5. For each pair of sequences in the above mapping, call `compare_resi_props`
    - This function will contain much of the analysis in [sequence similarity demo](./20200706_seq_sim.ipynb), generalized for any two sequences. 
    - Returns some sort of data structure (probably a dataframe, we will call it `resi_prop_delta`) that shows change in each amino acid property (hydrophobicity, charge, etc.) for each site in the protein sequence.
6. Pass each `resi_prop_delta` to `get_struc_similarity`
    - This function will arbitrarily penalize changes in amino acids on a unitless scale
    - Returns a unitless structural similarity score (type float), where lower values indicate that a pair of homologs are functionally similar, and higher values indicate divergence in structure/function.
    
## `compare_resi_props` function
- Arguments
    - Two `Seq` instances
- Returns
    - Pandas dataframe `resi_prop_delta`: integer index at each site, each column is change in residue property for that site
    
## `get_struc_similarity` function
- Arguments
    - File path to `resi_prop_delta` CSV
- Returns
    - Float

# 1. Read in all aligned sequences in FASTA file
----

In [260]:
alignment = AlignIO.read(open('./trimmed_alg.txt'), format='fasta')
alignment

<<class 'Bio.Align.MultipleSeqAlignment'> instance (9757 records of length 60918, SingleLetterAlphabet()) at 7fed8a9f5590>

# 2. Filter aligned sequences to only species of interest
----

In [261]:
domestic_sp_names = [
    'Homo sapiens', # human
    'Mus musculus', # mouse
    'Canis lupus familiaris', # dog
    'Felis catus', # cat
    'Bos taurus', # cattle
    'Equus caballus', # horse
    'Gallus gallus' # chicken
]

In [262]:
tmprss2_ext = pd.read_table('../seq_sim_demo/extended_members.txt', header=None)
tmprss2_ext.columns = ['id_1', 'id_2', 'species', '', '']
tmprss2_ext.head()

Unnamed: 0,id_1,id_2,species,Unnamed: 4,Unnamed: 5
0,CRE24749,CRE24749,Caenorhabditis remanei,31234,"aliases:DS268562,E3N945_CAERE,E3N945,CRE_24749..."
1,CRE21132,CRE-TRY-4,Caenorhabditis remanei,31234,"aliases:E3MEX0,DS268440,E3MEX0_CAERE,CRE21132,..."
2,CRE24758,CRE-TRY-6,Caenorhabditis remanei,31234,"aliases:E3N963,DS268562,E3N963_CAERE,CRE24758,..."
3,CRE18672,CRE18672,Caenorhabditis remanei,31234,"aliases:DS268410,E3LKX4_CAERE,E3LKX4,CRE_18672..."
4,CRE24729,CRE-TRY-3,Caenorhabditis remanei,31234,"aliases:E3N418,DS268522,E3N418_CAERE,CRE24729,..."


In [263]:
for record in alignment:
    
    # while we're at it, let's make sure that Biopython knows these
    # are protein sequences
    record.seq.alphabet = Alphabet.generic_protein
    
    # from visual inspection we know the name format is XXXX.unique_id,
    # so we split on "." and take the last element of the list
    id_code = record.id.split('.')[-1]
    
    # reference the metadata to get the species name
    sp_name = tmprss2_ext[tmprss2_ext['id_1'] == id_code]['species'].values
    
    try:
        sp_name = sp_name.item()
    except ValueError:
        sp_name = None
    
    # assign the species name to the species attribute
    record.description = sp_name

In [264]:
dom_aln_list = [record for record in alignment
                if record.description in domestic_sp_names]

In [265]:
dom_aln = MultipleSeqAlignment(dom_aln_list)
dom_aln

<<class 'Bio.Align.MultipleSeqAlignment'> instance (732 records of length 60918, ProteinAlphabet()) at 7fed899918d0>

# 3. Filter proteins that are not TMPRSS2
----
WIP

# 4. Generate mapping of comparisons to make
----

In [266]:
comp_map = [
    ['Homo sapiens', 'Mus musculus'],
    ['Homo sapiens', 'Canis lupus familiaris'],
    ['Homo sapiens', 'Felis catus'],
    ['Homo sapiens', 'Bos taurus'],
    ['Homo sapiens', 'Equus caballus'],
    ['Homo sapiens', 'Gallus gallus' ]
]

# 5. Get the sequence of human isoform 2
----

Let's find the sequence record that has the same sequence as isoform 2 on the [TMPRSS2 UniProt page](https://www.uniprot.org/uniprot/O15393#O15393-1). The first few residues of this isoform are `MPPAPPGG`:

In [267]:
human_aln_list = [
    record for record in dom_aln
    if record.description == 'Homo sapiens'
]
human_aln = MultipleSeqAlignment(human_aln_list)

In [268]:
isoform_aln_list = [
    record for record in human_aln
    if 'MPPAPPGG' in str(record.seq).replace("-", "")
]

In [269]:
print("number of human sequences that contain MPPAPPGG:", len(isoform_aln_list))
human_iso2 = isoform_aln_list[0]
human_iso2

number of human sequences that contain MPPAPPGG: 1


SeqRecord(seq=Seq('------------------------------------------------------...---', ProteinAlphabet()), id='9606.ENSP00000381588', name='9606.ENSP00000381588', description='Homo sapiens', dbxrefs=[])

We also notice that most of the sequence of interest is in the middle of the aligned sequence. Let's trim the aligned sequence to generate a compact aligned sequence that it starts with `MPPAPP` and ends with `ADG`. To do this, we will make use of the [`str.index`](https://docs.python.org/2/library/stdtypes.html?highlight=index#str.index) method:

In [270]:
index_nterm = str(human_iso2.seq).index('MPPAPP')
index_cterm = str(human_iso2.seq).index('ADG')

# since we want to cut at ADG^, not ^ADG, we add 3 characters to this index
index_cterm += 3

print("index of N-terminus:", index_nterm)
print("index of C-terminus:", index_cterm)

index of N-terminus: 33713
index of C-terminus: 38856


We can use these indices to trim to the compact sequence:

In [271]:
human_compact = human_iso2[index_nterm:index_cterm]

# 6. An alternative to using consensus sequence
---
Consensus sequences are mostly dashes: how could we fix this?

Consensus sequence does a very bad job of choosing sequences that have much other than dashes. Instead, try calculating percent identity, and using the sequence out of a set that has best percent identity to the human reference sequence. The hypothesis here is that this will choose which of the sequences is "real" TMPRSS2.

In [316]:
def all_equal(items):
    """Returns True iff all items are equal."""
    first = items[0]
    return all(x == first for x in items)

def all_null(items):
    return all(x == '-' for x in items)

def percent_identity(aligned_sequences):
    """Returns the ratio of same-character columns in ``aligned_sequences``.

    :param aligned_sequences: a list of strings or equal length.
    """
    match_count = 0
    mismatch_count = 0
    for chars in zip(*aligned_sequences):
        # Here chars is a column of chars, 
        # one taken from each element of aligned_sequences.
        if all_null(chars):
            continue
        elif all_equal(chars):
            match_count += 1
        else:
            mismatch_count += 1
    # return float(match_count) / float(mismatch_count)
    # What would make more sense:
    return float(match_count) / len(aligned_sequences[0].replace('-', ''))

In [317]:
def get_best_match(msa, untrimmed_ref, trim):
    best_perc_id = 0.
    best_match = None
    for seq in msa:
        aligned_sequences = [
            str(human_iso2.seq)[trim],
            str(seq.seq)[trim]
        ]
        perc_id = percent_identity(aligned_sequences)
        if perc_id > best_perc_id:
            best_match = seq
            best_perc_id = perc_id
    print(f"assigning sequence with {100 * best_perc_id} percent identity as best")
    return best_match

In [318]:
best_cat = get_best_match(cat_aln, human_iso2, slice(index_nterm, index_cterm))
best_cat

assigning sequence with 27.22117202268431 percent identity as best


SeqRecord(seq=Seq('------------------------------------------------------...---', ProteinAlphabet()), id='9685.ENSFCAP00000010917', name='9685.ENSFCAP00000010917', description='Felis catus', dbxrefs=[])

In [275]:
def get_best_match_by_description(msa, untrimmed_ref, description, trim=slice(0, None)):
    """
    """
    assert isinstance(msa, MultipleSeqAlignment)
    
    filtered_msa_as_list = [
        record for record in msa
        if record.description == description
    ]
    filtered_msa = MultipleSeqAlignment(filtered_msa_as_list)
    return get_best_match(filtered_msa, untrimmed_ref, trim)

In [276]:
cat_best_match = get_best_match_by_description(
    cat_aln, human_iso2, 'Felis catus',
    trim=slice(index_nterm, index_cterm))
cat_best_match

assigning sequence with 2.799922224382656 percent identity as best


SeqRecord(seq=Seq('------------------------------------------------------...---', ProteinAlphabet()), id='9685.ENSFCAP00000010917', name='9685.ENSFCAP00000010917', description='Felis catus', dbxrefs=[])

# 6. Generate consensus sequences for cat homolog
----
Or rather, define functions that do this for us :)

In [277]:
def get_consensus(msa, trim=slice(0, None)):
    """Given a MultipleSequenceAlignment object,
    return a consensus sequence. Trim the consensus sequence
    using slice `trim`.
    """
    assert isinstance(msa, MultipleSeqAlignment)
    
    # generate SummaryInfo instance
    summary_info = AlignInfo.SummaryInfo(msa)
    
    # calculate consensus sequence
    consensus = summary_info.dumb_consensus(
        threshold=0.5, ambiguous='-',
        consensus_alpha=None, require_multiple=0)
    
    # replace X with - and trim of N and C terms
    replaced =  str(consensus).replace('X', '-')[trim]
    
    # some reporting: print number of dashes out of entire length of sequence
    num_dashes = replaced.count('-')
    print(f"consensus sequence has {num_dashes} null out of {len(replaced)}")

    # convert to a SeqRecord.SeqRecord instance
    compact = SeqRecord.SeqRecord(Seq.Seq(replaced))
    return compact

Example of how to use this function:

In [278]:
cat_aln_list = [
    record for record in dom_aln
    if record.description == 'Felis catus'
]
cat_aln = MultipleSeqAlignment(cat_aln_list)

In [279]:
cat_record_compact = get_consensus(cat_aln, slice(index_nterm, index_cterm))

consensus sequence has 4060 null out of 5143


Let's generalize further; given a species name, return the consensus sequence as a SequenceRecord:

In [280]:
def get_consensus_by_description(msa, description, trim=slice(0, None)):
    """
    """
    assert isinstance(msa, MultipleSeqAlignment)
    
    filtered_msa_as_list = [
        record for record in msa
        if record.description == description
    ]
    filtered_msa = MultipleSeqAlignment(filtered_msa_as_list)
    return get_consensus(filtered_msa, trim)

Using this function is even easier than using `get_consensus`:

In [281]:
cat_record = get_consensus_by_description(dom_aln, "Felis catus", slice(index_nterm, index_cterm))
cat_record

consensus sequence has 4060 null out of 5143


SeqRecord(seq=Seq('LSRMP-GL-----C-A-LA----G---A--L---SVALTL--A--P--------...---'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[])

# 6. Define functions that compare residues properties
---

6. For each pair of sequences in the above mapping, call `compare_resi_prop`
    - This function will contain much of the analysis in [sequence similarity demo](./20200706_seq_sim.ipynb), generalized for any two sequences. 
    - Returns some sort of data structure (probably a dataframe, we will call it `resi_prop_delta`) that shows change in each amino acid property (hydrophobicity, charge, etc.) for each site in the protein sequence.
7. Pass each `resi_prop_delta` to `get_struc_similarity`
    - This function will arbitrarily penalize changes in amino acids on a unitless scale
    - Returns a unitless structural similarity score (type float), where lower values indicate that a pair of homologs are functionally similar, and higher values indicate divergence in structure/function.

## Get table of amino acid properties

In [282]:
aa_props = pd.read_csv("../../data/amino_acid_properties.csv")
aa_props.set_index('single_letter', inplace=True)
aa_props

Unnamed: 0_level_0,full_name,pka,mass,occurrence,percent_buried,buried_vol,vdw,accessible,hydrophobicity
single_letter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A,alanine,,71.08,7.5,38.0,92.0,67.0,67.0,9.0
R,argenine,12.5,156.1,5.2,0.0,225.0,148.0,196.0,15.0
N,asparagine,,114.04,4.6,10.0,135.0,96.0,113.0,16.0
D,aspartic acid,3.9,115.08,5.2,14.5,125.0,91.0,106.0,19.0
C,cysteine,8.3,103.01,1.8,47.0,106.0,86.0,104.0,7.0
Q,glutamine,,128.06,4.1,6.3,161.0,114.0,144.0,17.0
E,glutamic acid,4.3,129.12,6.3,20.0,155.0,109.0,138.0,18.0
G,glycine,,57.05,7.1,37.0,66.0,48.0,,11.0
H,histidine,6.0,137.16,2.2,19.0,167.0,118.0,151.0,10.0
I,isoleucine,,113.16,5.5,65.0,169.0,124.0,140.0,1.0


## Define a function that determines if change in hydrophobicity is above a threshold
We arbitrarily set this threshold at 5.0 by default.

In [283]:
def is_change_in_hydrophobicity(resi1, resi2, min_diff=5.0):
    """This function takes string-type amino acid identifiers `resi1` and `resi2`
    and compares their hydrophobicities. If the absolute value of the difference
    between hydrophobicities is greater than `min_diff`, return boolean True.
    Otherwise, return boolean False.
    """
    print(f"comparing hydrophobicity between {resi1} and {resi2}")
    h1 = aa_props.loc[[resi1]]['hydrophobicity'].item()
    h2 = aa_props.loc[[resi2]]['hydrophobicity'].item()
    
    diff = abs(h1 - h2)
    print(f"the difference is hydrophobicity is {diff}")
    
    if diff > min_diff:
        return True
    else:
        return False    

## Define residues of interest
Note: this should be converted to a zero-indexed list for positions in human isoform 2. Most numberings in literature (and on the [UniProt page](https://www.uniprot.org/uniprot/O15393#ptm_processing)) use 1 indexed isoform 1.

In [284]:
# 0 indexed for isoform 2
# catalytic_triad = [332, 381, 471]
# binding_resi = [259, 260]

In [285]:
# 1 indexed for isoform 1
catalytic_triad = [296, 345, 441]
binding_resi = [435, 223, 224]
disulfide_bonds = [113, 126, 120, 139, 133,
                   148, 172, 231, 185, 241,
                   244, 365, 281, 297, 410,
                   426, 437, 465]
glycosylation_sites = [213, 249]

Let's convert the index to 0-indexed for isoform 2, simply by adding 36 to each index:

In [286]:
resi_interest_iso1 = catalytic_triad + binding_resi + disulfide_bonds
resi_interest = [idx + 36 for idx in resi_interest_iso1]
resi_interest

[332,
 381,
 477,
 471,
 259,
 260,
 149,
 162,
 156,
 175,
 169,
 184,
 208,
 267,
 221,
 277,
 280,
 401,
 317,
 333,
 446,
 462,
 473,
 501]

## Define the `compare_resi_prop` function
- Arguments
    - Two `Seq` instances
- Returns
    - Pandas dataframe `resi_prop_delta`: integer index at each site, each column is change in residue property for that site
    
## Define the `get_struc_similarity` function
- Arguments
    - File path to `resi_prop_delta` CSV
- Returns
    - Float

In [287]:
def parse_mutation(resi1, resi2, **columns):
    """Given two amino acids `resi1` and `resi2`, return a dictionary of
    change in amino acid properties between these residues. Use single-letter
    amino acid codes. Returns dict.
    """
    assert isinstance(resi1, str)
    assert len(resi1) == 1
    assert isinstance(resi2, str)
    assert len(resi2) == 1
    
    diff = {
        'resi_reference': resi1,
        'resi_compare': resi2,
    }
    
    if resi2 == '-':
        diff['is_deletion'] = True
        columns.update(diff)
        return columns
    else:
        diff['is_deletion'] = False
    
    dict1 = aa_props.loc[resi1].to_dict()
    dict2 = aa_props.loc[resi2].to_dict()
    
    for k in dict1:
        if not isinstance(dict1[k], Number):
            continue
        diff[f"{k}_delta"] = dict2[k] - dict1[k]
    
    columns.update(diff)
    return columns

Let's test this function:

In [288]:
parse_mutation('Y', 'S')

{'resi_reference': 'Y',
 'resi_compare': 'S',
 'is_deletion': False,
 'pka_delta': nan,
 'mass_delta': -75.98,
 'occurrence_delta': 4.1000000000000005,
 'percent_buried_delta': 11.0,
 'buried_vol_delta': -104.0,
 'vdw_delta': -68.0,
 'accessible_delta': -107.0,
 'hydrophobicity_delta': 6.0}

In [289]:
def compare_resi_prop(seq1, seq2, resi_interest=None):
    """
    """
    # make sure both of the arguments are SeqRecords
    assert isinstance(seq1, SeqRecord.SeqRecord)
    assert isinstance(seq2, SeqRecord.SeqRecord)
    
    # we want to keep track of which amino acid our
    # "cursor" is on in the for loop
    position_counter = 0

    # get the entire list of positions in the human sequence as
    # integers. We include dashes in this calculation
    list_of_positions_including_dashes = range(len(seq1))
    
    # List of rows (as dicts) that will be converted to a dataframe
    rows_list = list()

    for position_with_dashes in list_of_positions_including_dashes:
        
        # get the amino acid at this position (dashes included)
        # in both human and cat
        resi_in_1 = seq1[position_with_dashes]
        resi_in_2 = seq2[position_with_dashes]

        # skip this position if it is a '-'
        # in the seq1 record
        if resi_in_1 == '-':
            continue
            
        # detect if we are at an important amino acid
        is_interesting = bool(position_counter in resi_interest)
        
        # analyze the mutation using a different function
        row_as_dict = parse_mutation(resi_in_1, resi_in_2,
                                     iso2_zero_idx=position_counter,
                                     is_interesting=is_interesting)
        rows_list.append(row_as_dict)
        
        position_counter += 1
    
    # Convert the list of dicts to a pandas dataframe
    df = pd.DataFrame(rows_list)
    
    # Back-calculate what the position would be in isoform 1
    # Recall from the seq sim demo that the offset to account for
    # isoform 2 and zero indexing is 36
    df['iso1_one_idx'] = df['iso2_zero_idx'] - 36
    
    # Manually account for negative values
    df.loc[df['iso1_one_idx'] <= 1, 'iso1_one_idx'] = np.nan
    
    # ...but remember that iso2_zero_idx == is actually the first residue...
    df.loc[0, 'iso1_one_idx'] = 1
    
    return df

We will test this function below:

In [290]:
resi_prop_delta = compare_resi_prop(human_compact, cat_record, resi_interest=resi_interest)
resi_prop_delta #[~resi_prop_delta['iso1_one_idx'].isna()]

Unnamed: 0,iso2_zero_idx,is_interesting,resi_reference,resi_compare,is_deletion,pka_delta,mass_delta,occurrence_delta,percent_buried_delta,buried_vol_delta,vdw_delta,accessible_delta,hydrophobicity_delta,iso1_one_idx
0,0,False,M,L,False,,-18.04,6.3,-9.0,-3.0,0.0,-23.0,-2.0,1.0
1,1,False,P,S,False,,-10.04,2.3,0.0,-30.0,-17.0,-25.0,1.0,
2,2,False,P,R,False,,58.98,0.1,-24.0,96.0,58.0,91.0,2.0,
3,3,False,A,M,False,,60.12,-4.7,12.0,79.0,57.0,93.0,-4.0,
4,4,False,P,P,False,,0.00,0.0,0.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524,524,False,M,-,True,,,,,,,,,488.0
525,525,False,R,-,True,,,,,,,,,489.0
526,526,False,A,-,True,,,,,,,,,490.0
527,527,False,D,-,True,,,,,,,,,491.0


Write this dataframe to the `data` folder:

In [291]:
!ls ../../data/proc

resi_prop_delta_Bos_taurus.csv		    resi_prop_delta_Gallus_gallus.csv
resi_prop_delta_Canis_lupus_familiaris.csv  resi_prop_delta_Homo_sapiens.csv
resi_prop_delta.csv			    resi_prop_delta_Mus_musculus.csv
resi_prop_delta_Equus_caballus.csv	    resi_prop_delta_summary.csv
resi_prop_delta_Felis_catus.csv


In [306]:
resi_prop_delta_fp = "../../data/proc/resi_prop_delta_test.csv"
resi_prop_delta.to_csv(resi_prop_delta_fp, index=False)

# 7. Putting it all together
----

In [293]:
def compare_species(msa, reference_seq, comparison_species_name, resi_interest, seq_trimming):
    """
    """
    assert isinstance(msa, MultipleSeqAlignment)
    assert isinstance(comparison_species_name, str)
    assert isinstance(resi_interest, list)
    assert isinstance(seq_trimming, slice)

    comparison_best_match = get_best_match_by_description(msa, reference_seq, comparison_species_name, seq_trimming)
    return compare_resi_prop(reference_seq, comparison_best_match, resi_interest=resi_interest)

We can repeat the above comparison between human and cat in one line:

In [294]:
df = compare_species(dom_aln, human_iso2, 'Felis catus', resi_interest, slice(index_nterm, index_cterm))

assigning sequence with 2.799922224382656 percent identity as best


In [295]:
df[df['iso1_one_idx'].isin(disulfide_bonds)]

Unnamed: 0,iso2_zero_idx,is_interesting,resi_reference,resi_compare,is_deletion,pka_delta,mass_delta,occurrence_delta,percent_buried_delta,buried_vol_delta,vdw_delta,accessible_delta,hydrophobicity_delta,iso1_one_idx
149,149,True,C,H,False,-2.3,34.15,0.4,-28.0,61.0,32.0,47.0,3.0,113.0
156,156,True,C,P,False,,-5.89,3.3,-23.0,23.0,4.0,1.0,6.0,120.0
162,162,True,C,C,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,126.0
169,169,True,C,C,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,133.0
175,175,True,C,C,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,139.0
184,184,True,C,C,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,148.0
208,208,True,C,C,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,172.0
221,221,True,C,C,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,185.0
267,267,True,C,C,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,231.0
277,277,True,C,C,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,241.0


## Run for every species that is not human
----

In [319]:
for species_name in domestic_sp_names:
    print(f"Comparing species {species_name} to reference 'Homo sapiens'")
    compare_df = compare_species(dom_aln, human_iso2, species_name,
                                 resi_interest, slice(index_nterm, index_cterm))
    species_name_no_space = species_name.replace(" ", "_")
    compare_fp = f"../../data/proc/resi_prop_delta_{species_name_no_space}.csv"
    
    header = str(
        "# Dataframe containing comparison between human TMPRSS2\n" +
        "# isoform 2 and the sequence in aligned EggNOG sequences\n" +
        f"# that had highest % identity with the human homolog, for species {species_name}\n" +
        "# Author: Ethan Ho\n" +
        f"# Date: {dt.now()}\n" 
        "# From: 20200723_seq_sim.ipynb\n" 
    )
    
    print(f"Writing comparison dataframe to {compare_fp}...")
    with open(compare_fp, 'w') as f:
        f.write(header)
        compare_df.to_csv(f, index=False)

Comparing species Homo sapiens to reference 'Homo sapiens'
assigning sequence with 100.0 percent identity as best
Writing comparison dataframe to ../../data/proc/resi_prop_delta_Homo_sapiens.csv...
Comparing species Mus musculus to reference 'Homo sapiens'
assigning sequence with 72.77882797731569 percent identity as best
Writing comparison dataframe to ../../data/proc/resi_prop_delta_Mus_musculus.csv...
Comparing species Canis lupus familiaris to reference 'Homo sapiens'
assigning sequence with 26.65406427221172 percent identity as best
Writing comparison dataframe to ../../data/proc/resi_prop_delta_Canis_lupus_familiaris.csv...
Comparing species Felis catus to reference 'Homo sapiens'
assigning sequence with 27.22117202268431 percent identity as best
Writing comparison dataframe to ../../data/proc/resi_prop_delta_Felis_catus.csv...
Comparing species Bos taurus to reference 'Homo sapiens'
assigning sequence with 72.77882797731569 percent identity as best
Writing comparison dataframe t

# 8. Let's take a look at these dataframes...
----
Sanity checking and such. Also, let's narrow down the data to only residues of interest and write to a `summary` dataframe.

In [297]:
domestic_sp_names

['Homo sapiens',
 'Mus musculus',
 'Canis lupus familiaris',
 'Felis catus',
 'Bos taurus',
 'Equus caballus',
 'Gallus gallus']

In [320]:
compare_df_lst = list()
for species_name in domestic_sp_names:
    species_name_no_space = species_name.replace(" ", "_")
    compare_fp = f"../../data/proc/resi_prop_delta_{species_name_no_space}.csv"
    filtered_df = pd.read_csv(compare_fp, comment='#')
    filtered_df = filtered_df[filtered_df.is_interesting]
    filtered_df['species'] = species_name
    compare_df_lst.append(filtered_df)
compare_df_summary = pd.concat(compare_df_lst)
compare_df_summary.to_csv("../../data/proc/resi_prop_delta_summary.csv")

Then, we can narrow down to residues of interest that changed for each species:

In [321]:
mutation_summary_df = compare_df_summary[compare_df_summary.resi_reference != compare_df_summary.resi_compare]
mutation_summary_df

Unnamed: 0,iso2_zero_idx,is_interesting,resi_reference,resi_compare,is_deletion,pka_delta,mass_delta,occurrence_delta,percent_buried_delta,buried_vol_delta,vdw_delta,accessible_delta,hydrophobicity_delta,iso1_one_idx,species
149,149,True,C,H,False,-2.3,34.15,0.4,-28.0,61.0,32.0,47.0,3.0,113.0,Canis lupus familiaris
156,156,True,C,P,False,,-5.89,3.3,-23.0,23.0,4.0,1.0,6.0,120.0,Canis lupus familiaris
259,259,True,K,-,True,,,,,,,,,223.0,Canis lupus familiaris
260,260,True,K,S,False,,-41.09,1.6,19.8,-72.0,-62.0,-87.0,-6.0,224.0,Canis lupus familiaris
149,149,True,C,H,False,-2.3,34.15,0.4,-28.0,61.0,32.0,47.0,3.0,113.0,Felis catus
156,156,True,C,P,False,,-5.89,3.3,-23.0,23.0,4.0,1.0,6.0,120.0,Felis catus
259,259,True,K,-,True,,,,,,,,,223.0,Felis catus
260,260,True,K,S,False,,-41.09,1.6,19.8,-72.0,-62.0,-87.0,-6.0,224.0,Felis catus
260,260,True,K,R,False,1.7,27.93,-0.6,-4.2,54.0,13.0,29.0,-5.0,224.0,Gallus gallus


In [323]:
mutation_summary_header = str(
    "# Dataframe containing comparison between human TMPRSS2\n" +
    "# isoform 2 to other species' homologs.\n" +
    "# Filtered to only sequences of interest that are different from the human sequence\n" +
    "# Author: Ethan Ho\n" +
    f"# Date: {dt.now()}\n" 
    "# From: 20200723_seq_sim.ipynb\n" 
)

mutation_summary_fp = "../../data/proc/resi_prop_interesting_mutations.csv"
print(f"Writing comparison dataframe to {mutation_summary_fp}...")
with open(mutation_summary_fp, 'w') as f:
    f.write(mutation_summary_header)
    mutation_summary_df.to_csv(f, index=False)

Writing comparison dataframe to ../../data/proc/resi_prop_interesting_mutations.csv...
