In [19]:
import pandas as pd
import pdb
import os
import sys

p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [20]:
def convert_encode_desc(df, col):
    """
    Convert encode desc. of a sample into something more parseable.

    The strings in `col` are lowercased, and every ", ", " " and "-"
    is replaced with "_". Missing values stay missing.

    Parameters:
        df (pandas DataFrame): Table holding the column to clean up.
            The column is overwritten on this object.
        col (str): Name of the string column to normalize

    Returns:
        df (pandas DataFrame): The same DataFrame, with `col` normalized
    """
    cleaned = df[col].str.lower()

    # Replace literally (regex=False) so the result does not depend on the
    # installed pandas version's default for `regex`. ', ' is handled first
    # so that a comma followed by a space collapses into a single '_'.
    for separator in (', ', ' ', '-'):
        cleaned = cleaned.str.replace(separator, '_', regex=False)

    df[col] = cleaned
    return df

In [21]:
def convert_biosample_treatments(df, col):
    """
    Turn the comma-separated biosample treatments in `col` into tuples.

    Spaces are removed, the string is split on ',' and the pieces are
    sorted, so the same set of treatments always yields the same tuple
    regardless of the order or spacing it was written with. Missing
    values are treated as the empty string and therefore become ('',).

    Parameters:
        df (pandas DataFrame): Table holding the treatments column.
            The column is overwritten on this object.
        col (str): Name of the column with comma-separated treatments

    Returns:
        df (pandas DataFrame): The same DataFrame, with `col` holding tuples
    """
    def _to_sorted_tuple(treatments):
        return tuple(sorted(treatments.replace(' ', '').split(',')))

    # Work on the column itself rather than row-wise over the whole frame:
    # it is cheaper and also behaves on an empty DataFrame.
    df[col] = df[col].astype(object).fillna('').map(_to_sorted_tuple)
    return df

In [22]:
# load the file-level metadata table and index it by file accession
# (the 'File accession' column itself is kept as well)
df = pd.read_csv("metadata.tsv", sep='\t')
df.index = df['File accession']

# restrict to files whose genome annotation is V29 and to the
# columns used to build the sample names
metadata_cols = ['File accession', 'Experiment accession',
                 'Biosample term name', 'Biosample type',
                 'Technical replicate(s)', 'Biological replicate(s)',
                 'Biosample treatments', 'Biosample treatments duration']
is_v29 = df['Genome annotation'] == 'V29'
df = df.loc[is_v29, metadata_cols]

# anything that is not explicitly a tissue is labelled as a cell line
df['classification'] = 'cell_line'
is_tissue = df['Biosample type'] == 'tissue'
df.loc[is_tissue, 'classification'] = 'tissue'

# normalize the biosample names (lowercase, underscores)
df = convert_encode_desc(df, 'Biosample term name')




In [23]:
# load the table that maps (biosample name, treatments, treatment duration)
# to a short biosample name; the file has no header row
term_map_cols = ['eid', 'old_name',
                 'Biosample treatments',
                 'Biosample treatments duration', 'new_name']
term_map = pd.read_csv('../lr_bulk/biosamp_term_name_map.tsv', sep='\t',
                       header=None, names=term_map_cols)
term_map = term_map.drop('eid', axis=1)

# normalize both name columns the same way as the metadata biosample names
for name_col in ('old_name', 'new_name'):
    term_map = convert_encode_desc(term_map, name_col)
term_map = term_map.drop_duplicates()

# rename biosamples for DE samples, keyed by experiment accession
# ENCSR228XLW - h9_de
# ENCSR820BMF - h1_de
de_biosample_names = {'ENCSR820BMF': 'h1_de', 'ENCSR228XLW': 'h9_de'}
for exp_accession, short_name in de_biosample_names.items():
    is_de_experiment = df['Experiment accession'] == exp_accession
    df.loc[is_de_experiment, 'Biosample term name'] = short_name

# turn biosample treatments into sorted tuples on both tables so that
# they compare equal in the merge below
treatments_col = 'Biosample treatments'
df = convert_biosample_treatments(df, treatments_col)
term_map = convert_biosample_treatments(term_map, treatments_col)


# df['Biosample treatments'].str.split(',').astype(tuple).sort_values()


# row count before the merge, for comparison afterwards
n1 = len(df.index)
print(n1)

# a file picks up a term_map row only when biosample name, treatments
# and treatment duration all agree
df_merge_cols = ['Biosample term name', 'Biosample treatments', 'Biosample treatments duration']
term_map_merge_cols = ['old_name', 'Biosample treatments', 'Biosample treatments duration']
df = df.merge(term_map, how='left',
              left_on=df_merge_cols,
              right_on=term_map_merge_cols)

# a left merge never removes rows, so a different count means that
# some file matched more than one term_map row
n2 = len(df.index)
print(n2)
if n2 != n1:
    print('Duplicated thingies, check for DE samples')

# use the mapped short name wherever the merge found one
df = df.rename(columns={'Biosample term name': 'biosample'})
has_new_name = df['new_name'].notnull()
df['biosample'] = df['biosample'].where(~has_new_name, df['new_name'])

# biorep: number the experiments of each biosample 1, 2, ... following the
# sorted (experiment, biosample) order produced by groupby
files_before = set(df['File accession'])
temp = (
    df.groupby(['Experiment accession', 'biosample'])['File accession']
    .count()
    .reset_index()
)
temp['biorep'] = (temp.groupby('biosample').cumcount() + 1).astype(str)
temp = temp[['Experiment accession', 'biorep']]
df = df.merge(temp, on='Experiment accession')

# The merge above is an inner join: a file whose experiment or biosample is
# missing gets no biorep and disappears, and an experiment listed under more
# than one biosample would repeat its files. Report either case instead of
# letting it pass silently (the data itself is left as the merge produced it).
files_lost = files_before - set(df['File accession'])
if files_lost:
    print('Files dropped while assigning bioreps: {}'.format(sorted(files_lost, key=str)))
if df['File accession'].duplicated().any():
    print('Files duplicated while assigning bioreps, check experiment/biosample pairs')

# techrep: running number of the files within each experiment
df['techrep'] = df.groupby('Experiment accession').cumcount() + 1

# human-readable name: <biosample>_<biorep>_<techrep>
df['hr'] = df['biosample'] + '_' + df['biorep'].astype(str) + '_' + df['techrep'].astype(str)
df = df[['File accession', 'classification', 'hr', 'biosample']]
df.to_csv("metadata_corrected.tsv", sep='\t', index=False)


254
254


In [24]:
# sanity check: list rows whose file accession repeats an earlier row
df.loc[df['File accession'].duplicated()]

Unnamed: 0,File accession,classification,hr,biosample


In [25]:
# inspect which biosample each file was assigned to
df[['File accession', 'biosample']]

Unnamed: 0,File accession,biosample
0,ENCFF645YXV,lower_lobe_of_left_lung
1,ENCFF849AXD,h1
2,ENCFF404KPA,h1
3,ENCFF045TDA,h9_chondro
4,ENCFF156WYO,h9_chondro
...,...,...
249,ENCFF382SQY,brodmann_area_46
250,ENCFF504OTN,brodmann_area_46
251,ENCFF379DIZ,h1
252,ENCFF165FED,h1


In [26]:
# replace each biosample name by its tissue label where the tissue
# metadata provides one
tissue_df = get_tissue_metadata()
tissue_df = tissue_df[['tissue', 'biosample']]

# de-duplicate the mapping for the merge so that a repeated
# (tissue, biosample) row cannot multiply the file rows
df = df.merge(tissue_df.drop_duplicates(), how='left', on='biosample')

# biosamples without a tissue entry keep their own name
df['tissue'] = df['tissue'].fillna(df['biosample'])
df = df.drop('biosample', axis=1).rename(columns={'tissue': 'biosample'})

In [27]:
# preview the first rows of the table
df.head()

Unnamed: 0,File accession,classification,hr,biosample
0,ENCFF645YXV,tissue,lower_lobe_of_left_lung_2_1,lung
1,ENCFF849AXD,cell_line,h1_1_1,h1
2,ENCFF404KPA,cell_line,h1_1_2,h1
3,ENCFF045TDA,cell_line,h9_chondro_1_1,h9_chondro
4,ENCFF156WYO,cell_line,h9_chondro_1_2,h9_chondro


In [28]:
# write the table as a tab-separated file, without the row index
df.to_csv('file_to_sample.tsv', sep='\t', index=False)