In [1]:
import pandas as pd
import numpy as np

In [2]:
df_pfam = pd.read_csv('pfam_annotations.csv', index_col=0, header=0, sep=",")

### Formatting data

In [3]:
df_pfam.reset_index(inplace=True)

In [4]:
#make new dataframe using just transcript and domain from df_pfam
df = df_pfam[['transcript', 'domain']]

In [5]:
df.head(10)

Unnamed: 0,transcript,domain
0,TRINITY_DN59156_c0_g1_i1.p1,Cysteine-rich secretory protein family
1,TRINITY_DN59156_c0_g1_i1.p2,Respiratory growth transcriptional regulator
2,TRINITY_DN95273_c0_g1_i1.p1,"NADH-ubiquinone/plastoquinone oxidoreductase, ..."
3,TRINITY_DN95273_c0_g1_i1.p1,"NADH-ubiquinone/plastoquinone oxidoreductase, ..."
4,TRINITY_DN95273_c0_g1_i1.p2,Borna disease virus G protein
5,TRINITY_DN69769_c0_g1_i1.p1,BTB/POZ domain
6,TRINITY_DN69769_c0_g1_i1.p1,BTB/POZ domain
7,TRINITY_DN69769_c0_g1_i1.p1,BTB/POZ domain
8,TRINITY_DN69769_c0_g1_i1.p1,BTB/POZ domain
9,TRINITY_DN69769_c0_g1_i1.p1,"Exonuclease VII, large subunit"


In [6]:
#Remove duplicate rows
df = df.drop_duplicates()

In [7]:
df.head(10)

Unnamed: 0,transcript,domain
0,TRINITY_DN59156_c0_g1_i1.p1,Cysteine-rich secretory protein family
1,TRINITY_DN59156_c0_g1_i1.p2,Respiratory growth transcriptional regulator
2,TRINITY_DN95273_c0_g1_i1.p1,"NADH-ubiquinone/plastoquinone oxidoreductase, ..."
4,TRINITY_DN95273_c0_g1_i1.p2,Borna disease virus G protein
5,TRINITY_DN69769_c0_g1_i1.p1,BTB/POZ domain
9,TRINITY_DN69769_c0_g1_i1.p1,"Exonuclease VII, large subunit"
10,TRINITY_DN69769_c0_g1_i1.p1,Mitochondrial calcium uniporter
11,TRINITY_DN69769_c0_g1_i1.p1,Domain of unknown function (DUF3552)
12,TRINITY_DN69769_c0_g1_i1.p1,Domain of unknown function (DUF4200)
13,TRINITY_DN69769_c0_g1_i1.p1,Autophagy protein Apg6


In [8]:
# Reset index of df
df = df.set_index('transcript')

In [9]:
# Save all the transcript names into a new dataframe
df_list = pd.DataFrame(df_pfam['transcript'])

In [10]:
df_list.head(10)

Unnamed: 0,transcript
0,TRINITY_DN59156_c0_g1_i1.p1
1,TRINITY_DN59156_c0_g1_i1.p2
2,TRINITY_DN95273_c0_g1_i1.p1
3,TRINITY_DN95273_c0_g1_i1.p1
4,TRINITY_DN95273_c0_g1_i1.p2
5,TRINITY_DN69769_c0_g1_i1.p1
6,TRINITY_DN69769_c0_g1_i1.p1
7,TRINITY_DN69769_c0_g1_i1.p1
8,TRINITY_DN69769_c0_g1_i1.p1
9,TRINITY_DN69769_c0_g1_i1.p1


In [11]:
df_list = df.drop_duplicates()

In [12]:
df_list.head(25)

Unnamed: 0_level_0,domain
transcript,Unnamed: 1_level_1
TRINITY_DN59156_c0_g1_i1.p1,Cysteine-rich secretory protein family
TRINITY_DN59156_c0_g1_i1.p2,Respiratory growth transcriptional regulator
TRINITY_DN95273_c0_g1_i1.p1,"NADH-ubiquinone/plastoquinone oxidoreductase, ..."
TRINITY_DN95273_c0_g1_i1.p2,Borna disease virus G protein
TRINITY_DN69769_c0_g1_i1.p1,BTB/POZ domain
TRINITY_DN69769_c0_g1_i1.p1,"Exonuclease VII, large subunit"
TRINITY_DN69769_c0_g1_i1.p1,Mitochondrial calcium uniporter
TRINITY_DN69769_c0_g1_i1.p1,Domain of unknown function (DUF3552)
TRINITY_DN69769_c0_g1_i1.p1,Domain of unknown function (DUF4200)
TRINITY_DN69769_c0_g1_i1.p1,Autophagy protein Apg6


In [None]:
# create new column to write concatenated domains to
df_list['new_domain'] = df_list.domain.astype(str) + '; '

In [14]:
df_list.head(10)

Unnamed: 0_level_0,domain,new_domain
transcript,Unnamed: 1_level_1,Unnamed: 2_level_1
TRINITY_DN59156_c0_g1_i1.p1,Cysteine-rich secretory protein family,Cysteine-rich secretory protein family;
TRINITY_DN59156_c0_g1_i1.p2,Respiratory growth transcriptional regulator,Respiratory growth transcriptional regulator;
TRINITY_DN95273_c0_g1_i1.p1,"NADH-ubiquinone/plastoquinone oxidoreductase, ...","NADH-ubiquinone/plastoquinone oxidoreductase, ..."
TRINITY_DN95273_c0_g1_i1.p2,Borna disease virus G protein,Borna disease virus G protein;
TRINITY_DN69769_c0_g1_i1.p1,BTB/POZ domain,BTB/POZ domain;
TRINITY_DN69769_c0_g1_i1.p1,"Exonuclease VII, large subunit","Exonuclease VII, large subunit;"
TRINITY_DN69769_c0_g1_i1.p1,Mitochondrial calcium uniporter,Mitochondrial calcium uniporter;
TRINITY_DN69769_c0_g1_i1.p1,Domain of unknown function (DUF3552),Domain of unknown function (DUF3552);
TRINITY_DN69769_c0_g1_i1.p1,Domain of unknown function (DUF4200),Domain of unknown function (DUF4200);
TRINITY_DN69769_c0_g1_i1.p1,Autophagy protein Apg6,Autophagy protein Apg6;


In [15]:
#groupby transcript and sum all domains
df_list_grouped = df_list.groupby('transcript')['new_domain'].sum()

In [16]:
df_list_grouped.iloc[5]

'Amiloride-sensitive sodium channel; Growth-arrest specific micro-tubule binding; Family of unknown function (DUF5344); Ribosomal protein L9, C-terminal domain; '

In [17]:
df_list_grouped.head(10)

transcript
TRINITY_DN103288_c0_g1_i1.p1        Glutathione S-transferase N-terminal domain; 
TRINITY_DN103717_c0_g2_i1.p1               Sigma-54 factor, core binding domain; 
TRINITY_DN124947_c0_g1_i1.p1              Protein of unknown function (DUF3105); 
TRINITY_DN125355_c0_g1_i1.p1                    VHL beta domain; VHL box domain; 
TRINITY_DN134774_c0_g1_i1.p1    Multicopper oxidase; Protein of unknown functi...
TRINITY_DN137920_c0_g2_i1.p1    Amiloride-sensitive sodium channel; Growth-arr...
TRINITY_DN139314_c0_g1_i1.p1    Hr1 repeat; Protein kinase C terminal domain; ...
TRINITY_DN140642_c0_g1_i1.p1                von Willebrand factor type C domain; 
TRINITY_DN142301_c1_g1_i2.p1     Centrosome microtubule-binding domain of Cep57; 
TRINITY_DN144106_c0_g1_i1.p1    Tail tubular protein; Down syndrome cell adhes...
Name: new_domain, dtype: object

In [18]:
# export to new csv file
df_list_grouped.to_csv('concatenated_list.csv', sep=',')