In [None]:
import pandas as pd
import pandas as np
import sqlite3
import zlib
import csv
import os
import subprocess
from functools import reduce
from collections import OrderedDict

from Bio import SeqIO 
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import AlignIO

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.ion()
import logomaker as lm


## 1. Open connection to SQNce

This script relies on a SQNce database that contains a protein_seqs table.\
For more information: https://github.com/eporetsky/SQNce \
I will upload my SQNce.db file, which contains multiple plant genomes, in the near future.

In [None]:
# Open the SQNce SQLite database file from the working directory. `con` is the
# connection object handed to the protein query helpers in this notebook.
con = sqlite3.connect('SQNce.db')

In [None]:
def proteins_select(con, entity_list):
    """Look up protein sequences by ID in the ``protein_seqs`` table.

    Parameters
    ----------
    con : sqlite3 connection
        Open connection to a database that has a ``protein_seqs`` table with
        ``protein_id`` and zlib-compressed ``protein_sequence`` columns.
    entity_list : iterable of str
        Protein IDs to look up, in the order they should appear in the result.

    Returns
    -------
    OrderedDict
        ``protein_id -> sequence`` for every ID that was found. IDs missing
        from the table are silently skipped.
    """
    sequences = OrderedDict()
    for protein_id in entity_list:
        cur = con.cursor()
        # The trailing comma makes the parameter a 1-tuple rather than a string
        # that would be unpacked character by character.
        cur.execute('''SELECT protein_id, protein_sequence
                                FROM protein_seqs
                                WHERE protein_id =  ?  ''', (protein_id,))
        rows = cur.fetchall()
        if rows:
            found_id, compressed = rows[0]
            # The stored text is decompressed and its final character dropped
            # (presumably a trailing newline or stop symbol - TODO confirm).
            sequences[found_id] = zlib.decompress(compressed).decode('utf-8')[:-1]
    return sequences
    
def proteins_write_fasta(con, entity_list):
    """Build FASTA-ready ``SeqRecord`` objects for the requested protein IDs.

    Despite its name this function writes nothing to disk: it only returns the
    records, which the caller can pass to ``SeqIO.write``.

    Parameters
    ----------
    con : sqlite3 connection
        Open connection to a database that has a ``protein_seqs`` table with
        ``protein_id`` and zlib-compressed ``protein_sequence`` columns.
    entity_list : iterable of str
        Protein IDs to look up.

    Returns
    -------
    dict
        ``protein_id -> SeqRecord`` for every ID that was found. IDs missing
        from the table are silently skipped.
    """
    records = {}
    for protein_id in entity_list:
        cur = con.cursor()
        # The trailing comma makes the parameter a 1-tuple rather than a string
        # that would be unpacked character by character.
        cur.execute('''SELECT protein_id, protein_sequence
                            FROM protein_seqs
                            WHERE protein_id =  ?  ''', (protein_id,))
        rows = cur.fetchall()
        if not rows:
            continue
        name, compressed = rows[0]
        # Drop the final stored character, then remove every "*" so the
        # sequence contains residues only.
        residues = zlib.decompress(compressed).decode('utf-8')[:-1].replace("*","")
        records[name] = SeqRecord(seq=Seq(residues), id=name, description="")
    return records

## 2. Get list of candidate genes

In [None]:
# Candidate gene IDs: the first column of candidates.txt as a plain Python list.
# NOTE(review): read_csv is called with its default header handling, so the
# first line of candidates.txt is presumably a column title - confirm, otherwise
# the first gene ID would be consumed as the header.
candidate_table = pd.read_csv("candidates.txt")
candidates = list(candidate_table.iloc[:, 0])

## 3. Get the Orthogroup dataframe

In [None]:
# Load the orthogroup table (tab-separated, first column used as the row index).
orth = pd.read_csv("Orthogroups.txt", sep="\t", index_col=0)


def _split_gene_list(cell):
    """Split a ", "-separated string into a list; return non-string cells unchanged."""
    return cell.split(", ") if isinstance(cell, str) else cell


# Convert every ", "-separated cell into a list of gene IDs.
# https://stackoverflow.com/questions/44557151/pandas-apply-map-to-every-element-of-every-column
# DataFrame.applymap is deprecated in recent pandas in favour of DataFrame.map,
# so use `map` when the installed pandas provides it and fall back otherwise.
# The check is made on the class so a column that happens to be called "map"
# cannot be mistaken for the method.
if hasattr(pd.DataFrame, "map"):
    orth = orth.map(_split_gene_list)
else:
    orth = orth.applymap(_split_gene_list)
orth.iloc[:3,:3]

## 4. Generate the orthogroup fasta and csv files

In [None]:
def gene_in_og(col, gene):
    """Tell whether ``gene`` is a member of the orthogroup cell ``col``.

    Used with ``Series.apply`` to find the orthogroup row that contains a gene.

    Parameters
    ----------
    col : object
        One cell of the orthogroup table. Only ``list`` cells can match; any
        other value (for example a missing-value NaN) yields ``False``.
    gene : str
        Gene ID to look for.

    Returns
    -------
    bool
        ``True`` if ``col`` is a list containing ``gene``, otherwise ``False``.
    """
    return isinstance(col, list) and gene in col

In [None]:
# Currently redundant since the orthogroup fasta is part of the OrthoFinder output.
# For each candidate gene, use the SQNce.db sequences to write:
#   tables/<candidate>.csv   - GeneID / Seq table of the orthogroup members
#   fasta/<candidate>.fasta  - the same sequences as FASTA records
# Sequences not found in the database are skipped.

# Create the output folders up front so a fresh checkout can run this cell.
os.makedirs("tables", exist_ok=True)
os.makedirs("fasta", exist_ok=True)

for candidate in candidates:
    # Select the orthogroup row whose "ZmB73v4" cell lists the candidate gene.
    og = orth[orth["ZmB73v4"].apply(gene_in_og, args=[candidate])]
    print(og.index)
    assert len(og) <= 1, "Not expecting more than 1 orthogroups"
    if og.empty:
        # Without this guard the row lookup below raises IndexError.
        print(candidate, "was not found in any orthogroup; skipping.")
        continue

    # Drop the genomes with no member in this orthogroup, then flatten the
    # remaining per-genome lists into one list of gene IDs.
    og = og.dropna(axis=1)
    og_list = [gene for genes in og.values.tolist()[0] for gene in genes]

    # Get a dataframe of sequences. Building it from (id, sequence) pairs with
    # explicit column names also works when no sequence was found.
    og_df = pd.DataFrame(list(proteins_select(con, og_list).items()),
                         columns=["GeneID", "Seq"])
    og_df.to_csv("tables/"+candidate+".csv", index=False)

    od = proteins_write_fasta(con, og_list)
    with open("fasta/"+candidate+".fasta", "w") as handle:
        SeqIO.write(od.values(), handle, "fasta")

## 5. Run multiple sequence alignment using famsa

In [None]:
# Align every fasta/<stem>.fasta file with FAMSA and write the result to
# aln/<stem>.aln.
os.makedirs("aln", exist_ok=True)

for fasta in sorted(os.listdir("fasta")):
    stem, ext = os.path.splitext(fasta)
    if ext != ".fasta":
        # Skip anything that is not a FASTA file (e.g. checkpoint folders).
        continue
    # Only the extension is swapped, so a gene name containing "fasta" is kept intact.
    in_path = os.path.join("fasta", fasta)
    out_path = os.path.join("aln", stem + ".aln")
    # The command is passed as an argument list without a shell, so file names
    # with spaces or shell metacharacters cannot alter the command.
    result = subprocess.run(["famsa", in_path, out_path])
    if result.returncode != 0:
        print(fasta, "- famsa exited with status", result.returncode)

## 6. Generate sequence logo PDFs of all orthogroups using logomaker

In [None]:
# Draw one sequence logo per alignment in aln/ and save it as
# seqLogos/<stem>.pdf.
os.makedirs("seqLogos", exist_ok=True)

for aln in sorted(os.listdir("aln")):
    align = AlignIO.read("aln/"+aln, "fasta")
    seq_list = [str(record.seq) for record in align]

    # Figure width scales with the number of alignment columns. The original
    # referenced an undefined name (`seqs`) here, which raised NameError.
    # NOTE(review): 10 inches per column gives extremely wide figures for long
    # alignments - confirm this scale is intended.
    fig, ax = plt.subplots(figsize=(align.get_alignment_length()*10, 10))

    ww_counts_df = lm.alignment_to_matrix(sequences=seq_list, to_type='counts', characters_to_ignore='.-X')
    logo = lm.Logo(ww_counts_df, ax=ax, color_scheme='chemistry')
    # Other color schemes: weblogo_protein, skylign_protein, charge, chemistry

    # Swap only the extension so an "aln" inside a gene name is left untouched.
    # Change ".pdf" to ".png" to make smaller images.
    fig.savefig("seqLogos/" + os.path.splitext(aln)[0] + ".pdf")
    # Close the figure so memory does not grow with the number of alignments.
    plt.close(fig)

## Optional: Merge all PDFs into one file

In [None]:
# PyPDF2 is only needed for this optional step, so it is imported here.
# Newer PyPDF2 releases name the class PdfMerger and older ones only have
# PdfFileMerger, so prefer the new name and fall back to the old one.
try:
    from PyPDF2 import PdfMerger as PdfFileMerger
except ImportError:
    from PyPDF2 import PdfFileMerger

# Merge only the PDF files, in sorted order, so the page order of the combined
# document is reproducible (os.listdir returns entries in arbitrary order).
pdfs = sorted("seqLogos/"+pdf for pdf in os.listdir("seqLogos") if pdf.lower().endswith(".pdf"))
merger = PdfFileMerger()

try:
    for pdf in pdfs:
        merger.append(pdf)
    merger.write("seqLogos.pdf")
finally:
    # Release the open file handles even if one of the inputs cannot be read.
    merger.close()


# SeqLogos for long sequences - alignment spread across multiple lines
* Mostly works; documentation and an example still need to be added
* The final row still needs to be scaled to its length relative to the full rows

# 1. A figure for a single alignment

In [None]:
# --- Settings -------------------------------------------------------------
# NOTE(review): the original cell read from "145aln/" and saved to an undefined
# `name` folder. "145" is assumed here so that input folder = name + "aln" and
# output folder = name - confirm.
name = "145"
folder_name = name + "aln"
aln = "gene_id"   # file name of the alignment inside folder_name (placeholder)
row_len = 50      # alignment columns drawn per logo row
crop = False      # set True to drop positions supported by 2 or fewer residues

# --- Load the alignment before deriving the layout from it ------------------
align = AlignIO.read(folder_name + "/" + aln, "fasta")
seq_len = align.get_alignment_length()
nrows = (seq_len + row_len - 1) // row_len  # ceiling division
os.makedirs(name, exist_ok=True)

# squeeze=False keeps `ax` a 2-D array even when there is a single row, so
# ax[row, 0] works for short alignments too.
fig, ax = plt.subplots(figsize=(11.69, 0.5*nrows), nrows=nrows, ncols=1, squeeze=False)
for row in range(nrows):
    row_start = row_len*row   # change to a larger offset to plot the C-terminus region only
    row_end = min(row_len*(row+1), seq_len)
    seq_list = [str(seq.seq)[row_start:row_end] for seq in align]
    ww_counts_df = lm.alignment_to_matrix(sequences=seq_list, to_type='counts', characters_to_ignore='.-X')

    # Crop positions based on counts
    if crop:
        num_seqs = ww_counts_df.sum(axis=1)
        pos_to_keep = num_seqs > 2
        ww_counts_df = ww_counts_df[pos_to_keep].reset_index(drop=True)

    try:
        logo = lm.Logo(ww_counts_df, ax=ax[row, 0], color_scheme='chemistry')
    except Exception as err:
        # Keep drawing the remaining rows, but report which row failed and why.
        print(aln, "row", row, "failed:", err)
    # Other color schemes: weblogo_protein, skylign_protein, charge, chemistry

# The label is not text-searchable so not very helpful when merging all PDFs,
# but the margin is nice. It is drawn once per figure rather than once per row.
fig.text(0, 0.5, aln.replace(".aln",""), size=8)
# Swap only the extension; change ".pdf" to ".png" to make smaller images.
fig.savefig(name + "/" + os.path.splitext(aln)[0] + ".pdf", bbox_inches='tight')
plt.show()
plt.close(fig)

# 2. A figure for multiple alignments

In [None]:
# --- Settings -------------------------------------------------------------
name = "project_name"        # output folder; alignments are read from name + "aln"
folder_name = name+"aln"
row_len = 50                 # alignment columns drawn per logo row
# Offset that can replace the row start below to plot the C-terminus region only.
n = 60 if "cropped" in name else 0
# Cropping switch. The original cell assigned True and then unconditionally
# reset it to False (again inside the loop), so cropping was never applied;
# it is now defined once here and actually takes effect when changed to True.
crop = False

# Unlike a shell `mkdir`, this does not fail when the folder already exists.
os.makedirs(name, exist_ok=True)

for aln in sorted(os.listdir(folder_name)):
    align = AlignIO.read(folder_name+"/"+aln, "fasta")
    seq_len = align.get_alignment_length()
    nrows = (seq_len + row_len - 1) // row_len  # ceiling division
    # squeeze=False keeps `ax` a 2-D array even when there is a single row, so
    # ax[row, 0] works for short alignments too.
    fig, ax = plt.subplots(figsize=(11.69, 0.5*nrows), nrows=nrows, ncols=1, squeeze=False)

    for row in range(nrows):
        row_start = row_len*row   # change to n to get the C-terminus region only
        row_end = min(row_len*(row+1), seq_len)
        seq_list = [str(seq.seq)[row_start:row_end] for seq in align]
        ww_counts_df = lm.alignment_to_matrix(sequences=seq_list, to_type='counts', characters_to_ignore='.-X')

        # Crop positions based on counts
        if crop:
            num_seqs = ww_counts_df.sum(axis=1)
            pos_to_keep = num_seqs > 2
            ww_counts_df = ww_counts_df[pos_to_keep].reset_index(drop=True)

        try:
            logo = lm.Logo(ww_counts_df, ax=ax[row, 0], color_scheme='chemistry')
        except Exception as err:
            # Keep going with the remaining rows, but report which row failed and why.
            print(aln, "row", row, "failed:", err)
        # Other color schemes: weblogo_protein, skylign_protein, charge, chemistry

    # The label is not text-searchable so not very helpful when merging all
    # PDFs, but the margin is nice.
    fig.text(0,0.5, aln.replace(".aln",""), size=8)
    fig.text(0,0.4, str(len(align))+" sequences", size=8)
    # Swap only the extension; change ".pdf" to ".png" to make smaller images.
    fig.savefig(name + "/" + os.path.splitext(aln)[0] + ".pdf", bbox_inches='tight')
    # Close each figure so memory does not grow with the number of alignments.
    plt.close(fig)

# Merge all the PDF figures into <name>.pdf.
# The merger class is imported here so this cell does not rely on another cell
# having been run first. Newer PyPDF2 releases name it PdfMerger and older ones
# only have PdfFileMerger, so prefer the new name and fall back to the old one.
try:
    from PyPDF2 import PdfMerger as PdfFileMerger
except ImportError:
    from PyPDF2 import PdfFileMerger

# Sorted, PDF-only listing gives a reproducible page order.
pdfs = sorted(name+"/"+pdf for pdf in os.listdir(name) if pdf.lower().endswith(".pdf"))

merger = PdfFileMerger()
try:
    for pdf in pdfs:
        merger.append(pdf)
    merger.write(name+".pdf")
finally:
    # Release the open file handles even if one of the inputs cannot be read.
    merger.close()