In [None]:
import pandas as pd
import pandas as np
import sqlite3
import zlib
import csv
from functools import reduce
from collections import OrderedDict

from Bio import SeqIO 
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

## 1. Open connection to SQNce

This script relies on a SQNce database that contains a protein_seqs table.\
For more information: https://github.com/eporetsky/SQNce \
I will upload my SQNce.db file, which contains multiple plant genomes, in the near future.

In [None]:
# Open the SQNce SQLite database; `con` is the connection handed to the query helpers.
# sqlite3.connect() creates a new, empty database file when 'SQNce.db' is not found
# in the working directory, so a later "no such table" error most likely means the
# path is wrong rather than the database being broken.
con = sqlite3.connect('SQNce.db')

In [None]:
def proteins_select(con, entity_list):
    """Fetch protein sequences from the ``protein_seqs`` table by protein ID.

    Parameters
    ----------
    con : database connection exposing ``cursor()`` (e.g. ``sqlite3.Connection``)
    entity_list : iterable of protein IDs to look up, one query per ID

    Returns
    -------
    OrderedDict mapping ``protein_id`` to the decoded sequence string, in the
    order the IDs were requested. IDs with no matching row are left out.
    """
    query = '''SELECT protein_id, protein_sequence
                                FROM protein_seqs
                                WHERE protein_id =  ?  '''
    sequences = OrderedDict()
    for entity in entity_list:
        lookup = con.cursor()
        # The one-element tuple binds the whole ID as a single parameter
        # instead of treating the string as a sequence of characters.
        lookup.execute(query, (entity,))
        rows = lookup.fetchall()
        if rows:
            # Only the first matching row is used.
            protein_id, blob = rows[0]
            # Sequences are stored zlib-compressed; [:-1] drops the final
            # character of the decoded text (presumably a trailing newline
            # or terminator - TODO confirm against how the table is filled).
            sequences[protein_id] = zlib.decompress(blob).decode('utf-8')[:-1]
    return sequences
    
def proteins_write_fasta(con, entity_list):
    """Build fasta-ready records for the requested protein IDs.

    Despite its name this function writes nothing: it returns the records and
    leaves the actual file writing to the caller.

    Parameters
    ----------
    con : database connection exposing ``cursor()`` (e.g. ``sqlite3.Connection``)
    entity_list : iterable of protein IDs to look up, one query per ID

    Returns
    -------
    dict mapping ``protein_id`` to a ``SeqRecord`` whose ``id`` is the protein
    ID and whose description is empty. IDs with no matching row are left out.
    """
    query = '''SELECT protein_id, protein_sequence
                            FROM protein_seqs
                            WHERE protein_id =  ?  '''
    records = {}
    for entity in entity_list:
        lookup = con.cursor()
        # The one-element tuple binds the whole ID as a single parameter
        # instead of treating the string as a sequence of characters.
        lookup.execute(query, (entity,))
        rows = lookup.fetchall()
        if not rows:
            continue
        # Only the first matching row is used.
        name, blob = rows[0]
        # Decompress, drop the final character of the decoded text and strip
        # every "*" (presumably stop-codon markers - TODO confirm).
        residues = zlib.decompress(blob).decode('utf-8')[:-1].replace("*","")
        records[name] = SeqRecord(seq=Seq(residues), id=name, description="")
    return records

## 2. Get list of candidate genes

In [None]:
# Candidate gene IDs are the values of the first column of candidates.txt.
# NOTE(review): read_csv treats the first line of the file as a header by default,
# so that line is never returned as a candidate - confirm the file has a header row.
candidates = list(pd.read_csv("candidates.txt").iloc[:,0])

## 3. Get the Orthogroup dataframe

In [None]:
# Load the tab-separated orthogroup table (first column becomes the index) and
# turn every ", "-separated string cell into a list; non-string cells are kept as-is.
# https://stackoverflow.com/questions/44557151/pandas-apply-map-to-every-element-of-every-column
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map - switch once the minimum supported pandas version allows it.
orth = (
    pd.read_csv("Orthogroups.txt", sep="\t", index_col=0)
    .applymap(lambda cell: cell.split(", ") if isinstance(cell, str) else cell)
)

# Preview the top-left corner of the converted table
orth.iloc[:3, :3]

## 4. Generate the orthogroup fasta and csv files

In [None]:
# Tells whether one orthogroup cell lists the specified geneID; used to find
# the orthogroup row a gene belongs to
def gene_in_og(col, gene):
    """Return True only when ``col`` is a list that contains ``gene``.

    Any non-list cell (for example a missing value, which the list-conversion
    step leaves untouched) gives False.
    """
    return isinstance(col, list) and gene in col

In [None]:
# Currently redundant since the orthogroup fasta is part of the OrthoFinder output
# For every candidate gene, find its orthogroup through the "ZmB73v4" column and use
# the SQNce.db sequences to write a sequence table (tables/<candidate>.csv) and a
# fasta file (fasta/<candidate>.fasta) holding all genes of that orthogroup
# Sequences not found in the database are skipped, and so are candidates that do
# not belong to any orthogroup
# NOTE: the tables/ and fasta/ directories must already exist; neither to_csv nor
# open() creates missing directories
for candidate in candidates:
    og = orth[orth["ZmB73v4"].apply(gene_in_og, args=(candidate,))]
    print(og.index)
    assert len(og) <= 1, "Not expecting more than 1 orthogroups"
    if len(og) == 0:
        # The assertion above allows zero matches, but taking the first row
        # below would then fail with an IndexError, so skip this candidate
        print("No orthogroup found for " + candidate + ", skipping")
        continue

    # Drop the columns that hold no genes for this orthogroup, then flatten the
    # remaining per-column gene lists into a single list of gene IDs
    og = og.dropna(axis=1)
    og_list = [gene for genes in og.values.tolist()[0] for gene in genes]

    # Table of sequences; explicit column names keep this working even when
    # none of the genes is present in the database (header-only csv)
    og_seqs = proteins_select(con, og_list)
    og_df = pd.DataFrame(list(og_seqs.items()), columns=["GeneID", "Seq"])
    og_df.to_csv("tables/"+candidate+".csv", index=False)

    # Fasta file with the same genes
    od = proteins_write_fasta(con, og_list)
    with open("fasta/"+candidate+".fasta", "w") as handle:
        SeqIO.write(od.values(), handle, "fasta")