In [1]:
from Bio import SeqIO
import sklearn.metrics as skmetrics
from matplotlib import pyplot
from sklearn.metrics import roc_auc_score
from sklearn import metrics
import pandas as pd
import numpy as np
import numpy as np
from decimal import Decimal




class FishGenes(object):
    """One sequence record together with a habitat label.

    Attributes
    ----------
    fish_id : identifier of the record (``parse_fasta`` passes the FASTA id).
    fish_type : label for the group the record belongs to.
    sequence : the record's sequence as a string.
    fishes : dict with the keys 'fishname', 'number' and 'replace'.
    """

    def __init__(self, fish_id=None, fish_type=None, sequence=None,
                 fishes=None):
        """Store the record fields.

        ``fishes`` used to have a mutable dict as its default value and was
        ignored altogether. It now defaults to ``None``; a non-empty mapping
        is stored as given, while ``None`` or an empty mapping yields a fresh
        template dict, so no state is ever shared between instances.
        """
        self.fish_id = fish_id
        self.fish_type = fish_type
        self.sequence = sequence
        if fishes:
            self.fishes = fishes
        else:
            # Built per instance on purpose: never reuse one dict object.
            self.fishes = {'fishname': [], 'number': [], 'replace': []}


def parse_fasta(reffilename, fish_type):
    """Read a FASTA file into a list of FishGenes objects.

    Every record parsed from ``reffilename`` becomes one FishGenes whose
    ``fish_id`` is the record id, whose ``sequence`` is the record sequence
    converted to ``str``, and whose ``fish_type`` is the label given here.
    """
    return [
        FishGenes(fish_id=record.id,
                  fish_type=fish_type,
                  sequence=str(record.seq),
                  fishes={})
        for record in SeqIO.parse(reffilename, "fasta")
    ]


def convert_files(reffilename, reffilename2):
    """Parse two FASTA files and return their records as a pair of lists.

    The first file is labelled 'deepwater' and the second 'shallowater';
    the result is ``(records_of_first_file, records_of_second_file)``.
    """
    deep_records = parse_fasta(reffilename, fish_type='deepwater')
    shallow_records = parse_fasta(reffilename2, fish_type='shallowater')
    return deep_records, shallow_records



def get_dictionary(Fish_DeepWater):
    """Compute per-position symbol frequencies for a list of records.

    Parameters
    ----------
    Fish_DeepWater : list of objects with a ``sequence`` string attribute.
        The number of positions is taken from the first record; every other
        record must be at least that long (shorter ones raise IndexError).

    Returns
    -------
    list of dict
        One dict per position, mapping each symbol seen at that position to
        the fraction of records carrying it. An empty input gives ``[]``.
    """
    if not Fish_DeepWater:
        return []

    record_count = len(Fish_DeepWater)
    # Use record 0 as the length reference; the previous code used record 1,
    # which raised IndexError when only one record was supplied.
    sequence_length = len(Fish_DeepWater[0].sequence)

    final_dictionary = []
    for position in range(sequence_length):
        counts = {}
        for record in Fish_DeepWater:
            symbol = record.sequence[position]
            counts[symbol] = counts.get(symbol, 0) + 1
        # Divide once per symbol instead of summing 1/n repeatedly, which
        # accumulates floating-point rounding error.
        final_dictionary.append(
            {symbol: count / record_count for symbol, count in counts.items()}
        )
    return final_dictionary



def get_frequency(dict_deep,dict_shallow,sequence):
    """Score each symbol at each position by the gap between two groups.

    For every position ``i`` of ``dict_deep`` and every symbol in
    ``sequence`` the score is ``abs(deep[i][symbol]**2 - shallow[i][symbol]**2)``,
    where a symbol missing from either dict counts as 0.

    Parameters
    ----------
    dict_deep, dict_shallow : list of dict
        Per-position ``{symbol: frequency}`` mappings. ``dict_shallow`` must
        have at least as many positions as ``dict_deep``. Neither argument is
        modified (the previous code inserted zero entries into both).
    sequence : iterable of symbols to score, e.g. ``'ACGT'``.

    Returns
    -------
    list of dict
        One ``{symbol: score}`` dict per position. Scores are strings:
        formatted with four decimals, then stripped of trailing zeros and a
        trailing dot (so zero becomes ``'0'``).
    """
    final_sequence = []
    for position, deep_freqs in enumerate(dict_deep):
        shallow_freqs = dict_shallow[position]
        scores = {}
        for symbol in sequence:
            # .get() supplies the zero default without writing to the inputs.
            deep_value = deep_freqs.get(symbol, 0)
            shallow_value = shallow_freqs.get(symbol, 0)
            difference = abs(deep_value ** 2 - shallow_value ** 2)
            scores[symbol] = "{0:.4f}".format(difference).rstrip('0').rstrip('.')
        final_sequence.append(scores)
    return final_sequence




In [4]:

# Input alignments: one FASTA file per group (passed to convert_files below).
# NOTE(review): these are absolute paths on one user's machine; the cell will
# not run elsewhere without editing them.
reffilename = '/Users/alena_paliakova/Google Drive/!Bioinf_drive/00_FishPr/02_Fishgenes_work/data/deepwater_al_tr.fasta'
reffilename2 = '/Users/alena_paliakova/Google Drive/!Bioinf_drive/00_FishPr/02_Fishgenes_work/data/shallowwater_al_tr.fasta'
# NOTE(review): df_combine is not used anywhere in the visible cells - confirm
# before removing.
df_combine = pd.DataFrame()
# Parse both files into lists of FishGenes records.
Fish_DeepWater, Fish_ShalloWater = convert_files(reffilename, reffilename2)
# Per-position symbol frequencies for each group.
dictionary_Fish_DeepWater=get_dictionary(Fish_DeepWater)
dictionary_Fish_ShalloWater=get_dictionary(Fish_ShalloWater)
# Symbols to score at every position.
sequence='ACGT'
# Per-position score abs(deep**2 - shallow**2) for each symbol, as strings.
frequency=get_frequency(dictionary_Fish_DeepWater,dictionary_Fish_ShalloWater,sequence)

# One row per position, one column per symbol in `sequence`.
df=pd.DataFrame.from_dict(frequency)


# Save the table without the index column.
df.to_csv('/Users/alena_paliakova/Google Drive/!Bioinf_drive/00_FishPr/02_Fishgenes_work/F_nucleotide_frequency.csv',
                        index=False)

print(df)

          A       C       G       T
0    0.0138  0.0008       0       0
1    0.0702       0       0       0
2    0.0008  0.0138       0       0
3         0       0  0.0008  0.0138
4         0       0  0.0008  0.0138
5         0  0.0939       0  0.0033
6    0.0013  0.1378       0       0
7         0       0       0  0.0702
8         0  0.3812  0.0196  0.0051
9    0.0702       0       0       0
10        0  0.0702       0       0
11   0.0061  0.1151  0.0022  0.0196
12        0  0.3253       0  0.0115
13        0       0       0  0.0702
14        0  0.4477  0.3471  0.0043
15        0  0.0033       0   0.111
16        0       0       0       0
17        0  0.2232       0  0.1339
18        0       0       0       0
19        0       0       0       0
20   0.0013  0.0097   0.002  0.0202
21        0       0       0       0
22        0       0       0       0
23        0       0       0       0
24   0.4227  0.0592  0.0115  0.0008
25        0       0       0       0
26        0  0.3712  0.0196 

In [None]:
%matplotlib inline
import seaborn
import matplotlib.pyplot as plt
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so use the new name whenever it is available and
# fall back to the old one on older installations.
plt.style.use('seaborn-v0_8-ticks'
              if 'seaborn-v0_8-ticks' in plt.style.available
              else 'seaborn-ticks')
from matplotlib import transforms
import matplotlib.patheffects
from matplotlib.font_manager import FontProperties
import matplotlib as mpl

import numpy as np

# Colour used for each nucleotide letter when drawing a logo.
COLOR_SCHEME = dict(A='red', G='orange', C='blue', T='darkgreen')

# Nucleotide symbols, in the same order as the COLOR_SCHEME keys.
BASES = list(COLOR_SCHEME)


# Sample logo data: one inner list per position, each holding (base, score)
# tuples listed from the smallest score to the largest.
ALL_SCORES1 = [
    [
        ('C', 0.02247014831444764),
        ('T', 0.057903843733384308),
        ('A', 0.10370837683591219),
        ('G', 0.24803586793255664),
    ],
    [
        ('T', 0.046608227674354567),
        ('G', 0.048827667087419063),
        ('A', 0.084338697696451109),
        ('C', 0.92994511407402669),
    ],
    [
        ('G', 0.0),
        ('T', 0.011098351287382456),
        ('A', 0.022196702574764911),
        ('C', 1.8164301607015951),
    ],
    [
        ('C', 0.020803153636453006),
        ('T', 0.078011826136698756),
        ('G', 0.11268374886412044),
        ('A', 0.65529933954826969),
    ],
]




class Scale(matplotlib.patheffects.RendererBase):
    """Path effect that applies fixed x/y scale factors before drawing.

    draw_logo attaches it to each letter to stretch the glyph vertically
    by that letter's score.
    """

    def __init__(self, sx, sy=None):
        """Store the horizontal (``sx``) and vertical (``sy``) factors."""
        self._sx, self._sy = sx, sy

    def draw_path(self, renderer, gc, tpath, affine, rgbFace):
        """Draw ``tpath`` with the scale transform composed ahead of ``affine``."""
        scaling = affine.identity().scale(self._sx, self._sy)
        renderer.draw_path(gc, tpath, scaling + affine, rgbFace)

In [None]:
def draw_logo(all_scores, fontfamily='Arial', size=80):
    """Draw a sequence logo: stacked letters scaled vertically by score.

    Parameters
    ----------
    all_scores : sequence of sequences of (base, score)
        One inner sequence per logo position, drawn bottom-up in the order
        given. Every ``base`` must be a key of ``COLOR_SCHEME``.
    fontfamily : str
        ``'xkcd'`` switches on ``plt.xkcd()``; any other value is written to
        ``mpl.rcParams['font.family']`` (a global setting).
    size : int
        Font size of the letters before vertical scaling. A hard-coded
        ``fontsize=80`` on every letter used to override this value; it has
        been removed so ``size`` now takes effect. The default output is
        unchanged.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    if fontfamily == 'xkcd':
        plt.xkcd()
    else:
        mpl.rcParams['font.family'] = fontfamily

    fig, ax = plt.subplots(figsize=(len(all_scores), 2.5))

    font = FontProperties()
    font.set_size(size)
    font.set_weight('bold')

    positions = range(1, len(all_scores) + 1)
    ax.set_xticks(positions)
    ax.set_yticks(range(0, 3))
    ax.set_xticklabels(positions, rotation=90)
    ax.set_yticklabels(np.arange(0, 3, 1))
    seaborn.despine(ax=ax, trim=True)

    trans_offset = transforms.offset_copy(ax.transData,
                                          fig=fig,
                                          x=1,
                                          y=0,
                                          units='dots')

    for index, scores in enumerate(all_scores):
        for base, score in scores:
            # The font size comes from `font` only; an explicit fontsize
            # keyword would take precedence over fontproperties.
            txt = ax.text(index + 1,
                          0,
                          base,
                          transform=trans_offset,
                          color=COLOR_SCHEME[base],
                          ha='center',
                          fontproperties=font,
                          )
            txt.set_path_effects([Scale(1.0, score)])
            # A draw is required before the text extent can be measured.
            fig.canvas.draw()
            # Public API instead of the private txt._renderer attribute.
            window_ext = txt.get_window_extent()
            yshift = window_ext.height * score
            # Stack the next letter on top of the one just drawn.
            # NOTE(review): yshift is derived from a window extent but is
            # applied with units='points' - kept as in the original; confirm
            # this is intended.
            trans_offset = transforms.offset_copy(txt.get_transform(),
                                                  fig=fig,
                                                  y=yshift,
                                                  units='points')
        # Start the next column from the baseline again.
        trans_offset = transforms.offset_copy(ax.transData,
                                              fig=fig,
                                              x=1,
                                              y=0,
                                              units='points')
    plt.show()