## ColExpand

In [1]:
from collections import Counter

import pandas as pd

In [2]:
# Columns kept from the dump. TAG and VALUE must stay the last two entries:
# the cells below slice them off with columns[0:-2].
columns = [
    'SRP',
    'ACCESSION',
    'RUN_SET.RUN.IDENTIFIERS.PRIMARY_ID',
    'SAMPLE.SAMPLE_NAME.TAXON_ID',
    'Pool.Member.@sample_title',
    'TAG',
    'VALUE',
]
# Read the comma-separated dump and keep only the columns above, in that order.
dF = pd.DataFrame(pd.read_csv('SRP_dump/TE_SRP_expanded.csv'), columns=columns)

In [3]:
# Distinct tag names. Sorted so the column order of the exported tables and
# the order of the per-tag report are the same on every run (the iteration
# order of a set of strings can differ between interpreter sessions).
# Rows with a missing TAG are skipped rather than producing a NaN "tag".
tags = sorted(dF['TAG'].dropna().unique())

In [5]:
# Build one record (dict) per ACCESSION: each TAG becomes a key holding its VALUE.
expanded = []
# Iterate the groups themselves rather than passing group *labels* to .iloc,
# which only lines up while dF carries a default 0..n-1 index.
for acc, gr in dF.groupby('ACCESSION'):
    # A tag repeated within one accession keeps its last value.
    obj = dict(zip(gr['TAG'], gr['VALUE']))
    # The leading (non TAG/VALUE) columns are taken from the group's first row.
    # They are written after the tags, so they win over a tag of the same name.
    for col in columns[0:-2]:
        obj[col] = list(gr[col])[0]
    # Drop blank entries. Missing values are treated as blank as well: with the
    # default CSV parsing an empty field arrives as NaN, which the plain
    # `!= ""` comparison never filtered out.
    expanded.append({
        k: v
        for k, v in obj.items()
        if not (pd.isna(v) or (isinstance(v, str) and v == ""))
    })

In [49]:
# Write the wide table (leading columns followed by one column per tag) as TSV.
pd.DataFrame(
    expanded,
    columns=[*columns[:-2], *tags],
).to_csv("SRP_dump/TE_SRP_tagvalue.tsv", sep="\t", index=False)

In [44]:
def counts(obj):
    """Tally how often each item occurs in ``obj``.

    Parameters
    ----------
    obj : iterable
        Items to count. It is materialised with ``list(obj)``, so a mapping
        contributes its keys. Items must be hashable.

    Returns
    -------
    list of (item, count) tuples
        Ordered by descending count; items with equal counts stay in the
        order in which they were first seen.
    """
    # list() first: given a mapping directly, Counter would read its values as
    # ready-made counts instead of counting its keys.
    return Counter(list(obj)).most_common()

# Per-tag report: how often each value occurs and what share of the table
# carries no entry for the tag. Tags below the 99.5 % "empty" cut-off are
# collected in `filtred` for the reduced export.
stats = {}
filtred = []
# Largest per-column non-null count of dF.
# NOTE(review): this makes EMPTY relative to the number of table rows, not to
# the number of distinct accessions - confirm that is the intended denominator.
total = max(dF.count())
for tag in tags:
    cntx = counts(dF.loc[dF['TAG'] == tag, 'VALUE'])
    present = sum(n for _, n in cntx)
    empty = 100 * (1 - present / total)
    stats[tag] = empty
    if empty < 99.5:
        filtred.append(tag)
    print(f"# {tag} (EMPTY: {empty:.2f}%)")
    for value, n in cntx:
        print(f"   ({n})  {value}")

# Cell output: (tag, empty %) pairs, least empty first.
sorted(stats.items(), key=lambda pair: pair[1])

# replicate (EMPTY: 96.10%)
   (19)  Replicate 2
   (19)  Replicate 1
   (16)  Replicate 3
   (10)  Replicate #2
   (10)  Replicate #1
   (6)  3
   (6)  2
   (6)  1
# infection (EMPTY: 99.83%)
   (1)  PRMT5 shRNA
   (1)  WDR77 shRNA
   (1)  Scrambled control shRNA_2
   (1)  Scrambled control shRNA_1
# sequencing_library_method (EMPTY: 99.96%)
   (1)  Illumina Nextera XT
# lab_host (EMPTY: 99.96%)
   (1)  Bos taurus
# passage_history (EMPTY: 99.96%)
   (1)  Passaged once in LFBK-_V_6 cells
# serotype (EMPTY: 99.96%)
   (1)  A
# case history of originator (EMPTY: 99.75%)
   (6)  HBs-Ag positive hepato cellular carcinoma (Ed.III) with liver cirrhosis. HBs-Ag(+), HBs-Ab(-), HBe-Ag(-), HBe-Ab(+), HBc-Ab(+), AFP positive (110,000 ng/ml).
# genotype/variation (EMPTY: 98.09%)
   (12)  expressing sgRNA targeting PRMT5
   (7)  PRMT5 knockdown
   (6)  No knockdown
   (3)  shPRMT5
   (3)  shCTRL
   (2)  Expressing Dox inducible GFP-SMN fusion protein. Construct is hygromycin resistant.
   (2)  Exp

[('source_name', 77.18035563082134),
 ('cell line', 82.38780694326842),
 ('treatment', 86.70618120237087),
 ('cell type', 93.35309060118544),
 ('tissue', 95.76629974597799),
 ('replicate', 96.10499576629975),
 ('genotype', 97.20575783234547),
 ('tissue source', 97.45977984758679),
 ('time point', 97.883149872989),
 ('barcode', 97.96782387806944),
 ('disease', 97.96782387806944),
 ('genotype/variation', 98.09483488569009),
 ('chip antibody', 98.5182049110923),
 ('treated with', 98.64521591871296),
 ('knockdown', 98.94157493649449),
 ('conditions', 99.02624894157493),
 ('overexpression', 99.06858594411516),
 ('tumor type', 99.15325994919559),
 ('shRNA', 99.15325994919559),
 ('mtap status', 99.15325994919559),
 ('biological replicates', 99.32260795935647),
 ('time', 99.32260795935647),
 ('agent', 99.32260795935647),
 ('condition', 99.32260795935647),
 ('stress', 99.49195596951735),
 ('polysome fraction', 99.49195596951735),
 ('reference genome', 99.49195596951735),
 ('ectopic e2f-1 expres

In [50]:
# Same export as the full table, restricted to the tags that passed the filter.
pd.DataFrame(
    expanded,
    columns=[*columns[:-2], *filtred],
).to_csv("SRP_dump/TE_SRP_tagvalue_filterd.tsv", sep="\t", index=False)