In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pyranges as pr
import numpy as np

from collections import defaultdict
from Bio import SeqIO
import os
import random

from scipy.stats import dirichlet

# Subset GENCODE annotations to only include genes that are in GTEx data 

In [8]:
# read in the data with pyranges
gencode = pr.read_gtf("../resources/gencode.v26.basic.annotation.gtf.gz")

In [3]:
gtex = pr.read_gtf("../resources/references-v8-gencode.v26.GRCh38.genes.gtf")

In [6]:
# filter gtex to chr22 and feature=gene
gtex = gtex[gtex.Chromosome == "chr22"]
gtex = gtex[gtex.Feature == "gene"]
gtex

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,gene_type,gene_name,transcript_type,transcript_name,level,havana_gene,exon_id,exon_number,tag
0,chr22,HAVANA,gene,11066417,11068174,.,+,.,ENSG00000279973.2,ENSG00000279973.2,lincRNA,SC22CB-56B3.1,lincRNA,SC22CB-56B3.1,2,OTTHUMG00000192006.1,,,
1,chr22,HAVANA,gene,11124336,11125705,.,+,.,ENSG00000226444.2,ENSG00000226444.2,processed_pseudogene,ACTR3BP6,processed_pseudogene,ACTR3BP6,1,OTTHUMG00000140398.2,,,pseudo_consens
2,chr22,HAVANA,gene,12602465,12626642,.,+,.,ENSG00000283023.1,ENSG00000283023.1,unprocessed_pseudogene,FRG1GP,unprocessed_pseudogene,FRG1GP,2,OTTHUMG00000191578.1,,,
3,chr22,ENSEMBL,gene,15273854,15273961,.,+,.,ENSG00000276138.1,ENSG00000276138.1,snRNA,U6,snRNA,U6,3,,,,
4,chr22,HAVANA,gene,15290717,15297196,.,+,.,ENSG00000236235.1,ENSG00000236235.1,unprocessed_pseudogene,LA16c-13E4.3,unprocessed_pseudogene,LA16c-13E4.3,2,OTTHUMG00000140362.1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1252,chr22,ENSEMBL,gene,50691259,50691363,.,-,.,ENSG00000206841.1,ENSG00000206841.1,snRNA,RNU6-409P,snRNA,RNU6-409P,3,,,,
1253,chr22,HAVANA,gene,50735824,50738139,.,-,.,ENSG00000225929.1,ENSG00000225929.1,antisense,AC000036.4,antisense,AC000036.4,1,OTTHUMG00000150170.1,,,
1254,chr22,HAVANA,gene,50740592,50743520,.,-,.,ENSG00000254499.1,ENSG00000254499.1,antisense,AC002056.5,antisense,AC002056.5,2,OTTHUMG00000166231.1,,,
1255,chr22,HAVANA,gene,50754674,50755434,.,-,.,ENSG00000213683.4,ENSG00000213683.4,processed_pseudogene,AC002056.3,processed_pseudogene,AC002056.3,1,OTTHUMG00000150159.1,,,pseudo_consens


In [9]:
# filter gencode to chr22 and feature=gene, then subset to the same genes as gtex
gencode = gencode[gencode.Chromosome == "chr22"]
gencode = gencode[gencode.gene_id.isin(gtex.gene_id)]
gencode

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr22,HAVANA,gene,11066417,11068174,.,+,.,ENSG00000279973.2,lincRNA,...,,,,,,,,,,
1,chr22,HAVANA,transcript,11066417,11068174,.,+,.,ENSG00000279973.2,lincRNA,...,lincRNA,SC22CB-56B3.1-001,1,basic,OTTHUMT00000491988.1,,,,,
2,chr22,HAVANA,exon,11066417,11066515,.,+,.,ENSG00000279973.2,lincRNA,...,lincRNA,SC22CB-56B3.1-001,1,basic,OTTHUMT00000491988.1,1,ENSE00003758096.2,,,
3,chr22,HAVANA,exon,11067984,11068174,.,+,.,ENSG00000279973.2,lincRNA,...,lincRNA,SC22CB-56B3.1-001,1,basic,OTTHUMT00000491988.1,2,ENSE00003758861.2,,,
4,chr22,HAVANA,gene,11124336,11125705,.,+,.,ENSG00000226444.2,processed_pseudogene,...,,,,pseudo_consens,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34767,chr22,HAVANA,exon,50771209,50773109,.,-,.,ENSG00000079974.17,protein_coding,...,protein_coding,RABL2B-004,2,basic,OTTHUMT00000316609.1,6,ENSE00001522193.1,,ENSP00000378955.1,
34768,chr22,HAVANA,stop_codon,50773106,50773109,.,-,0,ENSG00000079974.17,protein_coding,...,protein_coding,RABL2B-004,2,basic,OTTHUMT00000316609.1,6,ENSE00001522193.1,,ENSP00000378955.1,
34769,chr22,HAVANA,UTR,50783500,50783630,.,-,.,ENSG00000079974.17,protein_coding,...,protein_coding,RABL2B-004,2,basic,OTTHUMT00000316609.1,1,ENSE00001522195.1,,ENSP00000378955.1,
34770,chr22,HAVANA,UTR,50782294,50782347,.,-,.,ENSG00000079974.17,protein_coding,...,protein_coding,RABL2B-004,2,basic,OTTHUMT00000316609.1,2,ENSE00003497028.1,,ENSP00000378955.1,


In [11]:
assert len(gencode.gene_id.unique()) == len(gtex.gene_id.unique())

In [12]:
# write gencode to a gtf file
gencode.to_gtf("../resources/gencode.v26.adjusted.basic.annotation.chr22.gtf.gz")

# Test Script
## Test gtf files and compare genes in GTEx 

In [26]:
# create a dictionary mapping gene name to # of transcripts
gene_to_tx_df = pd.read_csv(
    "../resources/chr22_l1hs_txome_v26/txome_t2g.tsv", sep="\t", header=None
)

In [15]:
OLD_gene_to_tx_df = pd.read_csv(
    "../resources/chr22_l1hs_txome/txome_t2g.tsv", sep="\t", header=None
)

In [3]:
# count duplicates in the gene column
gene_to_tx_values = gene_to_tx_df[1].value_counts()

In [4]:
# Mapping of genes to list of transcripts
gene_to_tx = gene_to_tx_df.groupby(1).apply(lambda x: x[0].tolist()).to_dict()

In [7]:
file_path = "../resources/GTEX/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct"
df = pd.read_csv(file_path, sep="\t", skiprows=2)

In [33]:
df

Unnamed: 0_level_0,Description,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000223972.5,DDX11L1,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,1,1
ENSG00000227232.5,WASH7P,187,109,143,251,113,139,199,473,286,...,72,96,136,79,89,86,49,84,34,66
ENSG00000278267.1,MIR6859-1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000243485.5,MIR1302-2HG,1,0,0,1,0,0,0,0,0,...,0,0,1,0,2,2,0,1,0,0
ENSG00000237613.2,FAM138A,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000198695.2,MT-ND6,32048,164596,51756,7329,95273,18938,6831,15021,19433,...,463869,128711,140563,232890,268051,36256,24649,188858,303816,118025
ENSG00000210194.1,MT-TE,18,65,24,0,4,5,2,1,4,...,245,70,64,117,124,17,6,89,120,55
ENSG00000198727.2,MT-CYB,127194,638209,141359,171477,856541,402801,136828,169364,480357,...,1409155,1065891,728289,1000135,993209,378667,368432,485123,1231647,725879
ENSG00000210195.2,MT-TT,1,9,1,0,0,1,0,0,0,...,17,1,1,4,5,0,0,0,93,5


In [8]:
# Check out samples
metadata = pd.read_csv(
    "../resources/GTEX/annotations_v8_GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
    index_col=0,
    sep="\t",
)
metadata = metadata.loc[(list(df.columns)[2:]), "SMTS"]
samples = list(metadata.sample(frac=1, random_state=1).drop_duplicates().index)

In [18]:
# get overlap of genes in GTEx and our Txome
tx_genes = [gene.split(".")[0] for gene in set(OLD_gene_to_tx_df[1])]
df.reset_index(inplace=True)
df["Name_subset"] = df["Name"].apply(lambda x: x.split(".")[0])
genes_of_interest = set(df["Name_subset"]) & set(tx_genes)
df.set_index("Name_subset", inplace=True)  # CAN REPLACE THIS WITH "Name" but if so..
OTHER_chr22_df = df.loc[
    df.index.isin(genes_of_interest), samples
]  # replace with df["Name_subset"].isin.

In [27]:
genes_of_interest = set(df["Name"]) & set(gene_to_tx_df[1])
df.set_index("Name", inplace=True)
chr22_df = df.loc[df.index.isin(genes_of_interest), samples]

In [20]:
OTHER_chr22_df.index

Index(['ENSG00000277248', 'ENSG00000283047', 'ENSG00000279973',
       'ENSG00000226444', 'ENSG00000276871', 'ENSG00000283023',
       'ENSG00000276138', 'ENSG00000280341', 'ENSG00000236235',
       'ENSG00000279442',
       ...
       'ENSG00000100299', 'ENSG00000212569', 'ENSG00000251322',
       'ENSG00000206841', 'ENSG00000225929', 'ENSG00000100312',
       'ENSG00000254499', 'ENSG00000213683', 'ENSG00000184319',
       'ENSG00000079974'],
      dtype='object', name='Name_subset', length=1246)

In [24]:
len(set([gene.split(".")[0] for gene in chr22_df.index]) & set(OTHER_chr22_df.index))

1246

In [28]:
chr22_df

Unnamed: 0_level_0,GTEX-ZPCL-1026-SM-5GCOX,GTEX-ZXG5-0011-R7b-SM-57WCC,GTEX-14BIN-2126-SM-793AX,GTEX-1I4MK-1126-SM-B2LWS,GTEX-15RIF-1026-SM-7KUMZ,GTEX-ZVT4-0626-SM-5E45T,GTEX-1QCLZ-0326-SM-DTXG2,GTEX-132Q8-1426-SM-5EGK7,GTEX-1J1R8-1726-SM-ARL8M,GTEX-14BIN-3126-SM-664NJ,...,GTEX-WFJO-0926-SM-4LVM2,GTEX-15ER7-0926-SM-7KUMG,GTEX-12WSK-0226-SM-5BC62,GTEX-13OVI-1026-SM-5L3EM,GTEX-1K2DA-1226-SM-CGQGH,GTEX-11ONC-2226-SM-5HL6D,GTEX-1QW4Y-0126-SM-DPRZQ,GTEX-S32W-1626-SM-4AD6G,GTEX-S341-0826-SM-4AD73,GTEX-RN64-2426-SM-EZ6L2
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000277248.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000283047.1,1,0,0,2,1,0,0,2,2,2,...,1,2,2,4,7,3,0,2,1,2
ENSG00000279973.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,0,0,0,0
ENSG00000226444.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000276871.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000100312.10,89,12,16,21,23,18,43,252,109,30,...,11,246,10,166,77,1689,49,22,122,35
ENSG00000254499.1,0,0,1,0,0,0,0,2,2,0,...,0,3,0,0,0,1,0,0,0,0
ENSG00000213683.4,2,0,0,2,0,1,0,7,1,6,...,2,3,0,1,8,2,4,2,2,0
ENSG00000184319.15,438,314,271,211,973,58,89,151,195,344,...,138,231,102,143,835,439,326,355,217,936


In [32]:
len(set(gene_to_tx_df[1]))

1257

In [47]:
# dirichlet.rvs([1,1], size=1, random_state=1) # multiply this by counts
# get random int 0,1
cointoss = random.randint(0, 1)
if cointoss == 0:
    # use dirichlet to split counts between two isoforms
    # dirichlet.rvs([1,1], size=1, random_state=1) # multiply this by counts
    pass
else:
    # give all counts to a single isoform
    pass

0

In [None]:
counts = defaultdict(list)
# for a transcript
for tx in SeqIO.parse(txome.txome_fa, "fasta"):
    # make key that transcript
    counts["tx_id"].append(tx.id)
    for sample in range(0, 6):
        if "ENS" in tx.id:  # gene
            random_coeff = random.randint(0, 20)
            counts[sample].append(random_coeff * len(tx.seq) // 100)
        elif "chr" in tx.id:  # rmsk
            counts[sample].append(20 * len(tx.seq) // 100)
        else:
            counts[sample].append(0)

## Check if finished script works 


In [14]:
import os
import random
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
from Bio import SeqIO
from scipy.stats import dirichlet

from src.txome import Txome

In [15]:
OUTDIR = Path("../resources/chr22_l1hs_txome_v26")
GENOME_FA = Path("../resources/hg38.fa")
TX_GTF = Path("../resources/gencode.v26.adjusted.basic.annotation.chr22.gtf.gz")
RMSK_TSV = Path("../resources/hg38.rmsk.tsv")
GTEx_COUNTS_PATH = Path(
    "../resources/GTEX/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct"
)
GTEx_METADATA_PATH = Path(
    "../resources/GTEX/annotations_v8_GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
)

In [16]:
# create a dictionary mapping gene name to # of transcripts
gene_to_tx_df = pd.read_csv(
    "../resources/chr22_l1hs_txome_v26/txome_t2g.tsv", sep="\t", header=None
)
# Mapping of genes to list of transcripts
gene_to_tx = gene_to_tx_df.groupby(1).apply(lambda x: x[0].tolist()).to_dict()

In [17]:
# this is the longest to read in ~ 5min
full_GTEx = pd.read_csv(GTEx_COUNTS_PATH, sep="\t", skiprows=2)

# Randomly sample from each tissue
metadata = pd.read_csv(
    GTEx_METADATA_PATH,
    index_col=0,
    sep="\t",
)
metadata = metadata.loc[(list(full_GTEx.columns)[2:]), "SMTS"]
samples = list(metadata.sample(frac=1, random_state=1).drop_duplicates().index)

In [18]:
# Get overlaps of genes in GTEx and our Txome
genes_of_interest = set(full_GTEx["Name"]) & set(gene_to_tx_df[1])
full_GTEx.set_index("Name", inplace=True)
chr22_GTEx = full_GTEx.loc[full_GTEx.index.isin(genes_of_interest), samples]

In [22]:
chr22_GTEx.head()  # NOT PART OF SCRIPT

Unnamed: 0_level_0,GTEX-ZPCL-1026-SM-5GCOX,GTEX-ZXG5-0011-R7b-SM-57WCC,GTEX-14BIN-2126-SM-793AX,GTEX-1I4MK-1126-SM-B2LWS,GTEX-15RIF-1026-SM-7KUMZ,GTEX-ZVT4-0626-SM-5E45T,GTEX-1QCLZ-0326-SM-DTXG2,GTEX-132Q8-1426-SM-5EGK7,GTEX-1J1R8-1726-SM-ARL8M,GTEX-14BIN-3126-SM-664NJ,...,GTEX-WFJO-0926-SM-4LVM2,GTEX-15ER7-0926-SM-7KUMG,GTEX-12WSK-0226-SM-5BC62,GTEX-13OVI-1026-SM-5L3EM,GTEX-1K2DA-1226-SM-CGQGH,GTEX-11ONC-2226-SM-5HL6D,GTEX-1QW4Y-0126-SM-DPRZQ,GTEX-S32W-1626-SM-4AD6G,GTEX-S341-0826-SM-4AD73,GTEX-RN64-2426-SM-EZ6L2
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000277248.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000283047.1,1,0,0,2,1,0,0,2,2,2,...,1,2,2,4,7,3,0,2,1,2
ENSG00000279973.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,0,0,0,0
ENSG00000226444.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000276871.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
gene_to_tx_df[1].value_counts()  # NOT PART OF SCRIPT

ENSG00000229027.3     3
ENSG00000128285.4     3
ENSG00000278558.4     3
ENSG00000100170.9     3
ENSG00000161180.10    3
                     ..
ENSG00000217442.3     3
ENSG00000099957.16    3
ENSG00000075234.16    3
ENSG00000205704.6     3
ENSG00000100365.14    3
Name: 1, Length: 145, dtype: int64

In [23]:
chr22_GTEx.loc["ENSG00000280080.2", :]

GTEX-ZPCL-1026-SM-5GCOX          7
GTEX-ZXG5-0011-R7b-SM-57WCC      0
GTEX-14BIN-2126-SM-793AX         5
GTEX-1I4MK-1126-SM-B2LWS         4
GTEX-15RIF-1026-SM-7KUMZ         2
GTEX-ZVT4-0626-SM-5E45T          0
GTEX-1QCLZ-0326-SM-DTXG2         5
GTEX-132Q8-1426-SM-5EGK7         4
GTEX-1J1R8-1726-SM-ARL8M        15
GTEX-14BIN-3126-SM-664NJ         6
GTEX-WXYG-2626-SM-EVYAI         17
GTEX-1LNCM-1026-SM-E9U5D         4
GTEX-QESD-0006-SM-2I5G6          0
GTEX-1E1VI-1226-SM-7RHGD         4
GTEX-1IKOE-0926-SM-ARU7X         7
GTEX-1R9K4-0526-SM-E76P1        10
GTEX-15DCE-1626-SM-6LPJM         7
GTEX-13PVR-1226-SM-5RQJ2         6
GTEX-1GN73-1526-SM-9WPP1         3
GTEX-1E2YA-1626-SM-7MGXS         2
GTEX-WFJO-0926-SM-4LVM2         12
GTEX-15ER7-0926-SM-7KUMG        14
GTEX-12WSK-0226-SM-5BC62         2
GTEX-13OVI-1026-SM-5L3EM         8
GTEX-1K2DA-1226-SM-CGQGH         7
GTEX-11ONC-2226-SM-5HL6D       196
GTEX-1QW4Y-0126-SM-DPRZQ         6
GTEX-S32W-1626-SM-4AD6G          2
GTEX-S341-0826-SM-4A

In [100]:
# Voodoo: get realistic tx counts from GTEx genes
# ADJUSTED: manually set genes and sample to see if it works


counts = defaultdict(list)
gene = "ENSG00000280080.2"
counts["tx_id"].extend(gene_to_tx[gene])
sample = "GTEX-1R9K4-0526-SM-E76P1"


if len(gene_to_tx[gene]) == 1:
    counts[sample].append(chr22_GTEx.loc[gene, sample])
elif len(gene_to_tx[gene]) == 2:
    transcripts = gene_to_tx[gene]
    TPM = chr22_GTEx.loc[gene, sample]

    cointoss = random.randint(0, 1)
    if cointoss == 0:
        # use dirichlet to split counts between two isoforms
        distribution = (
            dirichlet.rvs([1, 1], size=1, random_state=1)[0] * TPM
        )  # multiply this by counts
    else:
        # give to one isoform
        distribution = [TPM, 0]

    np.random.shuffle(distribution)
    counts[sample].extend(list(distribution))

In [42]:
counts

defaultdict(list,
            {'tx_id': ['ENST00000640274.1', 'ENSG00000280080.2-I'],
             'GTEX-1R9K4-0526-SM-E76P1': [2.9751148897057096,
              7.024885110294291]})

In [46]:
gene_to_tx_df[1].value_counts()[
    gene_to_tx_df[1].value_counts() == 3
]  # NOT PART OF SCRIPT

ENSG00000229027.3     3
ENSG00000128285.4     3
ENSG00000278558.4     3
ENSG00000100170.9     3
ENSG00000161180.10    3
                     ..
ENSG00000217442.3     3
ENSG00000099957.16    3
ENSG00000075234.16    3
ENSG00000205704.6     3
ENSG00000100365.14    3
Name: 1, Length: 145, dtype: int64

In [47]:
chr22_GTEx.loc["ENSG00000100365.14", :]

GTEX-ZPCL-1026-SM-5GCOX         2638
GTEX-ZXG5-0011-R7b-SM-57WCC      147
GTEX-14BIN-2126-SM-793AX         172
GTEX-1I4MK-1126-SM-B2LWS        1444
GTEX-15RIF-1026-SM-7KUMZ         460
GTEX-ZVT4-0626-SM-5E45T          319
GTEX-1QCLZ-0326-SM-DTXG2          81
GTEX-132Q8-1426-SM-5EGK7         240
GTEX-1J1R8-1726-SM-ARL8M         221
GTEX-14BIN-3126-SM-664NJ         149
GTEX-WXYG-2626-SM-EVYAI          552
GTEX-1LNCM-1026-SM-E9U5D         669
GTEX-QESD-0006-SM-2I5G6        33800
GTEX-1E1VI-1226-SM-7RHGD         331
GTEX-1IKOE-0926-SM-ARU7X         842
GTEX-1R9K4-0526-SM-E76P1         396
GTEX-15DCE-1626-SM-6LPJM         173
GTEX-13PVR-1226-SM-5RQJ2         251
GTEX-1GN73-1526-SM-9WPP1         903
GTEX-1E2YA-1626-SM-7MGXS        1129
GTEX-WFJO-0926-SM-4LVM2         4572
GTEX-15ER7-0926-SM-7KUMG        1837
GTEX-12WSK-0226-SM-5BC62          62
GTEX-13OVI-1026-SM-5L3EM         326
GTEX-1K2DA-1226-SM-CGQGH         628
GTEX-11ONC-2226-SM-5HL6D         136
GTEX-1QW4Y-0126-SM-DPRZQ        6547
G

In [101]:
counts = defaultdict(list)
gene = "ENSG00000100365.14"
counts["tx_id"].extend(gene_to_tx[gene])
sample = "GTEX-12WSK-0226-SM-5BC62"


if len(gene_to_tx[gene]) > 2:
    # TPMs were either (i) split among three randomly chosen isoforms according to a flat Dirichlet distribution
    # (α = (1,1,1)) or (ii) attributed to a single isoform.
    transcripts = gene_to_tx[gene]
    TPM = chr22_GTEx.loc[gene, sample]

    cointoss = random.randint(0, 1)
    if cointoss == 0:
        # use dirichlet to split counts between two isoforms
        distribution = np.array(
            dirichlet.rvs([1, 1, 1], size=1, random_state=1)[0] * TPM
        )  # multiply this by counts
    else:
        # give to one isoform
        distribution = np.array([TPM, 0, 0])

    # if there is more than 3 transcripts, add zeros to the distribution
    if len(distribution) != len(gene_to_tx[gene]):
        np.pad(distribution, (len(gene_to_tx[gene]) - len(distribution)))

    # randomize
    np.random.shuffle(distribution)
    counts[sample].extend(list(distribution))
else:
    print("ERROR: gene has no transcripts")

In [71]:
counts

defaultdict(list,
            {'tx_id': ['ENST00000248899.10',
              'ENST00000397147.6',
              'ENSG00000100365.14-I'],
             'GTEX-12WSK-0226-SM-5BC62': [18.444549126880137,
              0.003909728996646027,
              43.55154114412322]})

In [86]:
gene_to_tx_df[1].value_counts()[gene_to_tx_df[1].value_counts() == 10]

ENSG00000184470.20    10
ENSG00000183597.15    10
ENSG00000196588.14    10
ENSG00000185133.13    10
Name: 1, dtype: int64

In [87]:
chr22_GTEx.loc["ENSG00000185133.13", :]

GTEX-ZPCL-1026-SM-5GCOX           39
GTEX-ZXG5-0011-R7b-SM-57WCC      219
GTEX-14BIN-2126-SM-793AX         783
GTEX-1I4MK-1126-SM-B2LWS        3734
GTEX-15RIF-1026-SM-7KUMZ         369
GTEX-ZVT4-0626-SM-5E45T           40
GTEX-1QCLZ-0326-SM-DTXG2         179
GTEX-132Q8-1426-SM-5EGK7        1355
GTEX-1J1R8-1726-SM-ARL8M         348
GTEX-14BIN-3126-SM-664NJ         987
GTEX-WXYG-2626-SM-EVYAI          133
GTEX-1LNCM-1026-SM-E9U5D         736
GTEX-QESD-0006-SM-2I5G6           27
GTEX-1E1VI-1226-SM-7RHGD         537
GTEX-1IKOE-0926-SM-ARU7X       59099
GTEX-1R9K4-0526-SM-E76P1        2517
GTEX-15DCE-1626-SM-6LPJM         911
GTEX-13PVR-1226-SM-5RQJ2          73
GTEX-1GN73-1526-SM-9WPP1         100
GTEX-1E2YA-1626-SM-7MGXS        1961
GTEX-WFJO-0926-SM-4LVM2         2090
GTEX-15ER7-0926-SM-7KUMG         253
GTEX-12WSK-0226-SM-5BC62         643
GTEX-13OVI-1026-SM-5L3EM         571
GTEX-1K2DA-1226-SM-CGQGH        4064
GTEX-11ONC-2226-SM-5HL6D         722
GTEX-1QW4Y-0126-SM-DPRZQ          36
G

In [102]:
counts = defaultdict(list)
gene = "ENSG00000185133.13"
counts["tx_id"].extend(gene_to_tx[gene])
sample = "GTEX-1GN73-1526-SM-9WPP1"


if len(gene_to_tx[gene]) > 2:
    # TPMs were either (i) split among three randomly chosen isoforms according to a flat Dirichlet distribution
    # (α = (1,1,1)) or (ii) attributed to a single isoform.
    transcripts = gene_to_tx[gene]
    TPM = chr22_GTEx.loc[gene, sample]

    cointoss = random.randint(0, 1)
    if cointoss == 0:
        # use dirichlet to split counts between two isoforms
        distribution = np.array(
            dirichlet.rvs([1, 1, 1], size=1, random_state=1)[0] * TPM
        )  # multiply this by counts
    else:
        # give to one isoform
        distribution = np.array([TPM, 0, 0])

    # if there is more than 3 transcripts, add zeros to the distribution
    if len(distribution) != len(gene_to_tx[gene]):
        distribution = np.pad(distribution, (len(gene_to_tx[gene]) - len(distribution)))

    # randomize
    np.random.shuffle(distribution)
    counts[sample].extend(list(distribution))
else:
    print("ERROR: gene has no transcripts")

In [103]:
counts

defaultdict(list,
            {'tx_id': ['ENST00000640274.1',
              'ENSG00000280080.2-I',
              'ENST00000248899.10',
              'ENST00000397147.6',
              'ENSG00000100365.14-I',
              'ENST00000331075.9',
              'ENST00000400294.6',
              'ENST00000401755.1',
              'ENST00000402238.5',
              'ENST00000404390.7',
              'ENST00000404453.5',
              'ENST00000405300.5',
              'ENST00000412277.6',
              'ENST00000620191.4',
              'ENSG00000185133.13-I'],
             'GTEX-1R9K4-0526-SM-E76P1': [10, 0],
             'GTEX-12WSK-0226-SM-5BC62': [0, 0, 62],
             'GTEX-1GN73-1526-SM-9WPP1': [0.0,
              0.0,
              0.0,
              0.0,
              0.0,
              0.0,
              70.24442120019874,
              29.749272785290543,
              0.0,
              0.0,
              0.0,
              0.0,
              0.0,
              0.006306014510719

In [130]:
# Voodoo: get realistic tx counts from GTEx genes
counts = defaultdict(list)
for gene in chr22_GTEx.index:
    counts["tx_id"].extend(gene_to_tx[gene])
    for sample in chr22_GTEx.columns:
        if len(gene_to_tx[gene]) == 1:
            counts[sample].append(chr22_GTEx.loc[gene, sample])
        elif len(gene_to_tx[gene]) == 2:
            transcripts = gene_to_tx[gene]
            TPM = chr22_GTEx.loc[gene, sample]

            cointoss = random.randint(0, 1)
            if cointoss == 0:
                # use dirichlet to split counts between two isoforms
                distribution = (
                    dirichlet.rvs([1, 1], size=1)[0] * TPM
                )  # multiply this by counts
            else:
                # give to one isoform
                distribution = [TPM, 0]

            np.random.shuffle(distribution)
            counts[sample].extend(distribution)

        elif len(gene_to_tx[gene]) > 2:
            # TPMs were either (i) split among three randomly chosen isoforms according to a flat Dirichlet distribution
            # (α = (1,1,1)) or (ii) attributed to a single isoform.
            transcripts = gene_to_tx[gene]
            TPM = chr22_GTEx.loc[gene, sample]

            cointoss = random.randint(0, 1)
            if cointoss == 0:
                # use dirichlet to split counts between two isoforms
                distribution = np.array(
                    dirichlet.rvs([1, 1, 1], size=1)[0] * TPM
                )  # multiply this by counts
            else:
                # give to one isoform
                distribution = np.array([TPM, 0, 0])

            # if there is more than 3 transcripts, add zeros to the distribution
            if len(distribution) != len(gene_to_tx[gene]):
                distribution = np.pad(
                    distribution, (0, (len(gene_to_tx[gene]) - len(distribution)))
                )

            # randomize
            np.random.shuffle(distribution)
            counts[sample].extend(distribution)
        else:
            print("ERROR: gene has no transcripts")


# save and simulate
counts = pd.DataFrame(counts).set_index("tx_id")

In [131]:
counts

Unnamed: 0_level_0,GTEX-ZPCL-1026-SM-5GCOX,GTEX-ZXG5-0011-R7b-SM-57WCC,GTEX-14BIN-2126-SM-793AX,GTEX-1I4MK-1126-SM-B2LWS,GTEX-15RIF-1026-SM-7KUMZ,GTEX-ZVT4-0626-SM-5E45T,GTEX-1QCLZ-0326-SM-DTXG2,GTEX-132Q8-1426-SM-5EGK7,GTEX-1J1R8-1726-SM-ARL8M,GTEX-14BIN-3126-SM-664NJ,...,GTEX-WFJO-0926-SM-4LVM2,GTEX-15ER7-0926-SM-7KUMG,GTEX-12WSK-0226-SM-5BC62,GTEX-13OVI-1026-SM-5L3EM,GTEX-1K2DA-1226-SM-CGQGH,GTEX-11ONC-2226-SM-5HL6D,GTEX-1QW4Y-0126-SM-DPRZQ,GTEX-S32W-1626-SM-4AD6G,GTEX-S341-0826-SM-4AD73,GTEX-RN64-2426-SM-EZ6L2
tx_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENST00000615943.1,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0
ENSG00000277248.1-I,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0
ENST00000635667.1,0.000000,0.0,0.000000,2.0,0.0,0.0,0.000000,0.000000,0.0,0.734351,...,0.714711,0.0,0.0,0.0,7.0,3.000000,0.0,0.000000,1.000000,0.0
ENSG00000283047.1-I,1.000000,0.0,0.000000,0.0,1.0,0.0,0.000000,2.000000,2.0,1.265649,...,0.285289,2.0,2.0,4.0,0.0,0.000000,0.0,2.000000,0.000000,2.0
ENST00000624155.2,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENST00000395593.7,0.000000,0.0,373.856157,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,802.024837,0.0,86.317377,590.726252,0.0
ENST00000395595.7,59.308932,0.0,381.950131,0.0,0.0,0.0,0.000000,0.000000,0.0,3814.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0
ENST00000395598.7,627.170804,0.0,0.000000,0.0,0.0,0.0,68.230725,632.220259,0.0,0.000000,...,0.000000,0.0,0.0,1601.0,0.0,0.000000,0.0,0.000000,254.342703,2584.0
ENST00000435118.5,0.000000,0.0,0.000000,0.0,0.0,0.0,210.586612,0.000000,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,1080.346179,0.000000,0.0


In [132]:
counts.shape[0]

3401

In [134]:
# create a dictionary mapping gene name to # of transcripts
gene_to_tx_df = pd.read_csv(
    "../resources/chr22_l1hs_txome_v26/txome_t2g.tsv", sep="\t", header=None
)