In [1]:
import pandas as pd
import pdb

In [2]:
# ENCSR291TRJ - h9_de
# ENCSR559HWG - h1_de

In [3]:
def convert_encode_desc(df, col):
    """
    Normalize an ENCODE sample description column into an identifier-like form.

    Values in ``df[col]`` are lower-cased, then every ', ', ' ' and '-' is
    replaced with '_' (applied in that order). The column is overwritten on
    the frame that was passed in, and that same frame is returned.
    """
    normalized = df[col].str.lower()
    # ', ' is handled before ' ' so a comma-space pair collapses to a single '_'
    for separator in (', ', ' ', '-'):
        normalized = normalized.str.replace(separator, '_')
    df[col] = normalized

    return df

In [4]:
# Short-read data -- limited to files whose genome annotation is V29
# https://www.encodeproject.org/matrix/?type=Experiment&control_type!=*&perturbed=false&assay_title=polyA+plus+RNA-seq&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&award.project=ENCODE&assembly=GRCh38&files.file_type=tsv&biosample_ontology.term_name!=middle+frontal+area+46&lab.title=Thomas+Gingeras%2C+CSHL&lab.title=Barbara+Wold%2C+Caltech&assay_title=total+RNA-seq&replicates.library.biosample.subcellular_fraction_term_name!=cytosol&replicates.library.biosample.subcellular_fraction_term_name!=nucleus&replicates.library.biosample.subcellular_fraction_term_name!=nucleolus&replicates.library.biosample.subcellular_fraction_term_name!=chromatin&replicates.library.biosample.subcellular_fraction_term_name!=nucleoplasm

# from narges: 
# Chose two illumina assays for now:
# Total RNA-seq
# polyA plus RNA-seq
# Only in human from ENCODE at GRCh38
# Only the ones built by Caltech
# Without any cellular component

In [5]:
# Build the short-read sample metadata table: load the ENCODE file metadata,
# harmonize the sample names, derive a human-readable id
# ("<sample>_<biorep>_<techrep>") for every file, and write the result to disk.
df = pd.read_csv("metadata.tsv", sep='\t')
# df = pd.read_csv("metadata_221219.tsv", sep='\t')

# NOTE(review): this index is not kept -- the merges below produce a fresh integer index
df.index = df['File accession']
# keep only the files with genome annotation V29
df = df.loc[df['Genome annotation'] == 'V29']
df = df[['File accession', 'Experiment accession', 'Biosample term name', 'Biosample type', 'Technical replicate(s)', 'Biological replicate(s)']]
# every biosample type other than 'tissue' is labeled 'cell_line'
df['classification'] = 'cell_line'
df.loc[df['Biosample type']=='tissue', 'classification'] = 'tissue'
# lower-case the biosample names and turn ', ', ' ' and '-' into '_'
df = convert_encode_desc(df, 'Biosample term name')


# rename biosamples for DE samples (keyed by experiment accession)
# ENCSR291TRJ - h9_de
# ENCSR559HWG - h1_de
# ENCSR266XAJ - h1_de
d = {'ENCSR291TRJ': 'h9_de', 'ENCSR559HWG':'h1_de', 'ENCSR266XAJ': 'h1_de'}
for key, item in d.items():
    df.loc[df['Experiment accession'] == key, 'Biosample term name'] = item

# convert hyphenated cell line names
# The mapping file has no header row; only 'old_name' and 'new_name' are used
# (the 'eid', 'idk1' and 'idk2' columns are dropped).
term_map = pd.read_csv('../lr_bulk/biosamp_term_name_map.tsv', sep='\t',
                       header=None, names=['eid', 'old_name', 'idk1', 'idk2', 'new_name'])
# normalize both name columns the same way as the biosample names so they can be joined
term_map = convert_encode_desc(term_map, 'old_name')
term_map = convert_encode_desc(term_map, 'new_name')
term_map = term_map[['old_name', 'new_name']]
term_map.drop_duplicates(inplace=True)
# Row count is printed before and after the left join; a difference means some
# 'old_name' matched more than one row of term_map and files were duplicated.
n1 = len(df.index)
print(n1)
df = df.merge(term_map, how='left', left_on='Biosample term name', right_on='old_name')
n2 = len(df.index)
print(n2)
if n1 != n2:
    print('Duplicated thingies, check for DE samples')
df.rename({'Biosample term name': 'sample'}, axis=1, inplace=True)
# where the mapping supplied a new name, use it; otherwise keep the normalized biosample name
df.loc[~df.new_name.isnull(), 'sample'] = df.loc[~df.new_name.isnull(), 'new_name']

# biorep
# One row per (experiment, sample); experiments are then numbered 1..k within
# each sample, so every experiment is treated as one biological replicate of
# its sample. The 'Biological replicate(s)' column is not used for this.
temp = df[['Experiment accession', 'sample', 'File accession']].groupby(['Experiment accession', 'sample']).count().reset_index()
temp['biorep'] = temp.groupby('sample').cumcount()+1
temp = temp[['Experiment accession', 'biorep']]
temp.biorep = temp.biorep.astype(str)
df = df.merge(temp, on='Experiment accession')

# techrep is a running count of files within an experiment (in row order);
# the 'Technical replicate(s)' column is not used for this.
df['techrep'] = df.groupby('Experiment accession').cumcount()+1
# human-readable id: <sample>_<biorep>_<techrep>
df['hr'] = df['sample'] + '_'+ df['biorep'].astype(str) +'_'+ df.techrep.astype(str)
df = df[['File accession', 'classification', 'hr']]
# recover the sample name by stripping the trailing _<biorep>_<techrep> fields from hr
df['sample'] = df.hr.str.rsplit('_', n=2, expand=True)[0]
df.to_csv("metadata_polyA_corrected.tsv", sep='\t', index=False)
# df.to_csv("metadata_polyA_corrected_221219.tsv", sep='\t', index=False)

548
548


In [6]:
# Preview the first five rows of the cleaned short-read metadata
df.head(n=5)

Unnamed: 0,File accession,classification,hr,sample
0,ENCFF058SJS,cell_line,wtc11_9_1,wtc11
1,ENCFF280QZQ,cell_line,wtc11_9_2,wtc11
2,ENCFF954SRP,cell_line,cd4_positive_alpha_beta_t_cell_2_1,cd4_positive_alpha_beta_t_cell
3,ENCFF489OCT,cell_line,activated_naive_cd4_positive_alpha_beta_t_cell...,activated_naive_cd4_positive_alpha_beta_t_cell
4,ENCFF613RIH,cell_line,activated_naive_cd8_positive_alpha_beta_t_cell...,activated_naive_cd8_positive_alpha_beta_t_cell


In [7]:
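# NOTE: this duplicate-id check only works inside the metadata cell above, before df is cut down
# to ['File accession', 'classification', 'hr']; the other columns selected here are gone from df afterwards.
# df.loc[df.hr.duplicated(keep=False)].sort_values(by='hr')[['Experiment accession', 'Biological replicate(s)', 'techrep', 'hr']]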

In [8]:
# Distinct short-read sample names, in order of first appearance
df['sample'].drop_duplicates().tolist()

['wtc11',
 'cd4_positive_alpha_beta_t_cell',
 'activated_naive_cd4_positive_alpha_beta_t_cell',
 'activated_naive_cd8_positive_alpha_beta_t_cell',
 'huvec',
 'cd14_positive_monocyte',
 'naive_thymus_derived_cd4_positive_alpha_beta_t_cell',
 'cd4_positive_alpha_beta_memory_t_cell',
 'cd8_positive_alpha_beta_memory_t_cell',
 'brodmann_area_46',
 'naive_b_cell',
 'cd4_positive_cd25_positive_alpha_beta_regulatory_t_cell',
 'activated_t_cell',
 'immature_natural_killer_cell',
 'natural_killer_cell',
 'ocily7',
 'naive_thymus_derived_cd8_positive_alpha_beta_t_cell',
 'a549',
 'k562',
 'igd_negative_memory_b_cell',
 'cd8_positive_alpha_beta_t_cell',
 't_helper_17_cell',
 'activated_b_cell',
 'activated_cd4_positive_alpha_beta_t_cell',
 'a673',
 'activated_cd8_positive_alpha_beta_t_cell',
 'motor_neuron',
 'activated_cd8_positive_alpha_beta_memory_t_cell',
 'activated_cd4_positive_alpha_beta_memory_t_cell',
 't_cell',
 'placenta',
 'h1_de',
 'pc9',
 'hmec',
 'activated_t_helper_17_cell',
 'b_c

In [9]:
# Sanity check: list every row whose file accession appears more than once
df[df.duplicated(subset='File accession', keep=False)].sort_values('File accession')

Unnamed: 0,File accession,classification,hr,sample


In [74]:
# Long-read data: load the previously saved Swan graph so its sample names can
# be compared with the short-read ones.
# NOTE(review): the '.p' extension suggests a pickle file -- only load it from a trusted source.
import swan_vis as swan
sg = swan.read('../lr_bulk/cerberus/swan/swan.p')

Read in graph from ../lr_bulk/cerberus/swan/swan.p


In [27]:
# Distinct long-read sample names, in order of first appearance
sg.adata.obs['sample'].drop_duplicates().tolist()

['caco2',
 'brain',
 'mcf10a',
 'a673',
 'lung',
 'hl60_m0',
 'colon',
 'hmec',
 'k562',
 'imr90',
 'pgp1_astro',
 'h9_de',
 'liver',
 'pc9',
 'h9_panc_progen',
 'hl60_m2_24hr',
 'vessels',
 'h1',
 'h9_neural_crest',
 'ovary',
 'gm12878',
 'muscle',
 'hl60_m1_72hr',
 'wtc11',
 'huvec',
 'hl60_m1_12hr',
 'h1_de',
 'hl60_m2_72hr',
 'calu3',
 'mcf7',
 'heart',
 'pgp1',
 'hl60_m2_12hr',
 'pc3',
 'pgp1_endo',
 'ocily7',
 'adipose',
 'hl60_m1_24hr',
 'hffc6',
 'h9',
 'panc1',
 'h9_chondro',
 'pgp1_excite_neuron',
 'adrenal gland',
 'hl60',
 'h9_osteocyte',
 'h9_panc_beta',
 'hepg2',
 'hct116',
 'kidney']

In [75]:
# Long-read datasets belonging to the h1_de sample
sg.adata.obs[sg.adata.obs['sample'].eq('h1_de')]

Unnamed: 0_level_0,dataset,total_counts,sample,classification,health_status
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
h1_de_1_2,h1_de_1_2,1019940.0,h1_de,cell_line,
h1_de_1_3,h1_de_1_3,976011.0,h1_de,cell_line,
h1_de_1_1,h1_de_1_1,820555.0,h1_de,cell_line,


In [76]:
# Long-read datasets belonging to the h9_de sample
sg.adata.obs[sg.adata.obs['sample'].eq('h9_de')]

Unnamed: 0_level_0,dataset,total_counts,sample,classification,health_status
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
h9_de_1_2,h9_de_1_2,1894107.0,h9_de,cell_line,
h9_de_1_1,h9_de_1_1,2066530.0,h9_de,cell_line,
