In [6]:
import os
import sys

from tqdm import tqdm

from Bio.Seq import Seq
import gzip
from twobitreader import TwoBitFile
import pandas as pd
from collections import defaultdict
import numpy as np

def mapper(x, table=None):
    """Look up ``x`` in an accession -> UniProt dictionary.

    Parameters
    ----------
    x : hashable
        Key to look up (the notebook passes ``accession_number`` values).
    table : dict, optional
        Dictionary to search. When omitted, the notebook-level ``mapping``
        global is used; it must already have been defined by an earlier cell.

    Returns
    -------
    The value stored under ``x``, or ``None`` when ``x`` has no entry
    (or cannot be used as a dictionary key).

    Raises
    ------
    NameError
        If ``table`` is omitted and ``mapping`` has not been defined yet.
        This used to be swallowed, turning every lookup into ``None`` and
        letting a later ``dropna()`` silently discard all rows.
    """
    lookup = mapping if table is None else table
    try:
        return lookup[x]
    except (KeyError, TypeError):
        # Only "key not present / not hashable" counts as an expected miss;
        # anything else should surface instead of being hidden.
        return None

# BRCA

### Load Acetylomics Input

In [16]:
# BRCA acetyl binary table; the first CSV column is used as the row index.
INPUT = './inputs/acetylomics/brca_acetyl_binary.csv'
df = pd.read_csv(filepath_or_buffer=INPUT, index_col=0)
df.head(n=5)

Unnamed: 0,X11BR047,X11BR043,X11BR049,X11BR023,X18BR010,X06BR003,X11BR074,X18BR017,X01BR017,X06BR006,...,X09BR001,X03BR011,X11BR036.REP2,X01BR010,invented_patient,accession_number,geneSymbol,variableSites,accession_numbers,site_position
NP_000468.1_K28k _1_1_28_28,1,0,0,1,1,1,0,0,0,1,...,1,1,1,1,1,NP_000468.1,ALB,K28k,NP_000468.1,28
NP_000468.1_K36k _1_1_36_36,1,0,0,0,1,1,0,0,0,1,...,1,1,1,1,1,NP_000468.1,ALB,K36k,NP_000468.1,36
NP_000468.1_K44k _1_1_44_44,1,0,1,1,1,1,1,0,0,1,...,1,1,1,1,1,NP_000468.1,ALB,K44k,NP_000468.1,44
NP_000468.1_K65k _1_1_65_65,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,NP_000468.1,ALB,K65k,NP_000468.1,65
NP_000468.1_K75k _1_1_75_75,1,0,0,1,1,1,0,1,0,0,...,1,1,1,1,1,NP_000468.1,ALB,K75k,NP_000468.1,75


### Add Uniprot IDs

In [12]:
# Build the accession_number -> uniprot_id lookup consumed by mapper().
map_df = pd.read_csv('./acetylomics_inputs/accession_to_uniprot.tsv', sep='\t')
mapping = map_df.set_index('accession_number')['uniprot_id'].to_dict()

# Annotate every site with its UniProt ID; mapper() yields None for unmapped accessions.
df['uniprot_id'] = df['accession_number'].map(mapper)
# NOTE(review): dropna() has no subset, so a row with a missing value in *any*
# column is removed, not only rows lacking a uniprot_id - confirm this is intended.
df = df.dropna()
df.shape

(10967, 160)

### Filter by Patients and Melt Dataframe

In [13]:
# Patient columns: every column whose name starts with 'X', plus a fixed list of extra columns.
pats = [col for col in df.columns if col.startswith('X')] + [
    'RetroIR', 'CPT001846', 'CPT000814', 'RetroIR.REP1', 'invented_patient',
]

# Wide -> long: all non-patient columns are kept as identifiers. The
# set_index/reset_index round trip moves patient, uniprot_id and site_position
# to the front of the column order.
input_df = (
    df.melt(id_vars=[col for col in df.columns if col not in pats])
    .set_index(['variable', 'uniprot_id', 'site_position'])
    .reset_index()
    .rename(columns={'variable': 'patient'})
)

In [18]:
# Preview the first five rows of the melted (long-format) table.
input_df.head(n=5)

Unnamed: 0,patient,uniprot_id,site_position,accession_number,geneSymbol,variableSites,accession_numbers,value
0,X11BR047,P02768,28,NP_000468.1,ALB,K28k,NP_000468.1,1
1,X11BR047,P02768,36,NP_000468.1,ALB,K36k,NP_000468.1,1
2,X11BR047,P02768,44,NP_000468.1,ALB,K44k,NP_000468.1,1
3,X11BR047,P02768,65,NP_000468.1,ALB,K65k,NP_000468.1,0
4,X11BR047,P02768,75,NP_000468.1,ALB,K75k,NP_000468.1,1


In [14]:
# Persist the long-format BRCA table as a tab-separated file.
input_df.to_csv(path_or_buf='./inputs/acetylomics/brca.tsv', sep='\t')

# LUAD

In [21]:
# LUAD acetyl binary table; the first CSV column is used as the row index.
INPUT = './inputs/acetylomics/luad_clumps_acetyl_binary.csv'
df = pd.read_csv(filepath_or_buffer=INPUT, index_col=0)
df.head(n=5)

Unnamed: 0,C3N.01799,C3N.01799.N,C3L.01890,C3L.01890.N,C3N.00572,C3N.00572.N,C3N.02423,C3N.02423.N,Tumor.Only.IR,C3N.02729,...,C3L.02549.N,C3L.02365,C3L.02365.N,Normal.Only.IR.3,invented_patient,accession_number,geneSymbol,variableSites,accession_numbers,site_position
NP_000468.1_K28k _1_1_28_28,1,1,1,1,0,0,1,1,0,0,...,0,0,0,1,1,NP_000468.1,ALB,K28k,NP_000468.1,28
NP_000468.1_K36k _1_1_36_36,0,1,1,1,0,0,1,1,0,0,...,1,0,1,1,1,NP_000468.1,ALB,K36k,NP_000468.1,36
NP_000468.1_K44k _1_1_44_44,0,1,1,1,1,0,1,1,0,1,...,1,1,1,0,1,NP_000468.1,ALB,K44k,NP_000468.1,44
NP_000468.1_K65k _1_1_65_65,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,NP_000468.1,ALB,K65k,NP_000468.1,65
NP_000468.1_K75k _1_1_75_75,0,1,1,1,1,1,0,1,0,0,...,1,0,1,1,1,NP_000468.1,ALB,K75k,NP_000468.1,75


In [22]:
# Build the accession_number -> uniprot_id lookup consumed by mapper().
map_df = pd.read_csv('./acetylomics_inputs/accession_to_uniprot.tsv', sep='\t')
mapping = map_df.set_index('accession_number')['uniprot_id'].to_dict()

# Annotate every site with its UniProt ID; mapper() yields None for unmapped accessions.
df['uniprot_id'] = df['accession_number'].map(mapper)
# NOTE(review): dropna() has no subset, so a row with a missing value in *any*
# column is removed, not only rows lacking a uniprot_id - confirm this is intended.
df = df.dropna()
df.shape

(7829, 232)

In [27]:
# Everything except the trailing six columns is treated as a patient column.
# NOTE(review): the count of six is positional - it relies on the annotation
# columns (including the appended uniprot_id) sitting at the end of the frame.
pats = df.columns[:-6].tolist()

# Wide -> long: all non-patient columns are kept as identifiers. The
# set_index/reset_index round trip moves patient, uniprot_id and site_position
# to the front of the column order.
input_df = (
    df.melt(id_vars=[col for col in df.columns if col not in pats])
    .set_index(['variable', 'uniprot_id', 'site_position'])
    .reset_index()
    .rename(columns={'variable': 'patient'})
)

In [29]:
# Persist the long-format LUAD table as a tab-separated file.
input_df.to_csv(path_or_buf='./inputs/acetylomics/luad.tsv', sep='\t')

# PAN

In [94]:
# Pan-cohort acetyl binary table; the first CSV column is used as the row index.
INPUT = './inputs/acetylomics/pan_ac_clumps_binary.csv'
df = pd.read_csv(filepath_or_buffer=INPUT, index_col=0)
df.head(n=5)

# Snapshot of the distinct accession numbers present in the freshly loaded table.
before = set(df.loc[:, 'accession_number'])

In [95]:
# Build the accession_number -> uniprot_id lookup consumed by mapper().
map_df = pd.read_csv('./acetylomics_inputs/accession_to_uniprot.tsv', sep='\t')
mapping = map_df.set_index('accession_number')['uniprot_id'].to_dict()

# Annotate every site with its UniProt ID; mapper() yields None for unmapped accessions.
df['uniprot_id'] = df['accession_number'].map(mapper)
# NOTE(review): dropna() has no subset, so a row with a missing value in *any*
# column is removed, not only rows lacking a uniprot_id - confirm this is intended.
df = df.dropna()
df.shape

(14045, 573)

* BRCA samples - 1:153
* LUAD samples - 154:378
* MEDUL samples - 379:423
* UCEC samples - 424:567
* Last patient column (labelled `ALL` in the next cell) - 568

In [96]:
# One tumour-type label per patient column, in column order:
# (label, number of consecutive columns carrying that label).
ttypes = [
    label
    for label, n_cols in (('BRCA', 153), ('LUAD', 225), ('MEDUL', 45), ('UCEC', 144), ('ALL', 1))
    for _ in range(n_cols)
]

In [97]:
# Everything except the trailing five columns is treated as a patient column.
# NOTE(review): the count of five is positional - it relies on the annotation
# columns (including the appended uniprot_id) sitting at the end of the frame.
pats = df.columns[:-5].tolist()

# Wide -> long: all non-patient columns are kept as identifiers. The
# set_index/reset_index round trip moves patient, uniprot_id and site_position
# to the front of the column order.
input_df = (
    df.melt(id_vars=[col for col in df.columns if col not in pats])
    .set_index(['variable', 'uniprot_id', 'site_position'])
    .reset_index()
    .rename(columns={'variable': 'patient'})
)

In [98]:
# Pair each patient column with its tumour-type label (positional match).
# zip() stops at the shorter input, so a miscounted block in `ttypes` would be
# truncated silently; check the lengths explicitly and fail loudly instead.
if len(pats) != len(ttypes):
    raise ValueError(
        'Expected one tumour-type label per patient column, got '
        '{} patient columns and {} labels'.format(len(pats), len(ttypes))
    )
ttype_mapping = dict(zip(pats, ttypes))

In [99]:
# Label every long-format row with the tumour type of its patient column.
# Direct indexing is kept so an unknown patient raises KeyError rather than yielding NaN.
input_df['ttype'] = input_df['patient'].map(lambda patient: ttype_mapping[patient])
input_df.head(n=5)

Unnamed: 0,patient,uniprot_id,site_position,GeneSymbol,accession_number,variableSites,value,ttype
0,X11BR047,P01023,1019,A2M,NP_000005.2,K1019k,0,BRCA
1,X11BR047,P01023,1047,A2M,NP_000005.2,K1047k,0,BRCA
2,X11BR047,P01023,1092,A2M,NP_000005.2,K1092k,0,BRCA
3,X11BR047,P01023,115,A2M,NP_000005.2,K115k,0,BRCA
4,X11BR047,P01023,1162,A2M,NP_000005.2,K1162k,0,BRCA


In [101]:
# Persist the long-format pan-cohort table as a tab-separated file.
input_df.to_csv(path_or_buf='./inputs/acetylomics/pan.tsv', sep='\t')