In [1]:
import pandas as pd
import argparse
from ast import literal_eval
import numpy as np
import itertools
from io import BytesIO
import tqdm
import dask.dataframe as dd
from dask import delayed
import matplotlib.pyplot as plt

In [2]:
# load the clustered data /gpfs/commons/groups/knowles_lab/Karin/data/GTEx/clustered_junctions.h5
clusts = pd.read_hdf("/gpfs/commons/groups/knowles_lab/Karin/data/GTEx/clustered_junctions.h5", key='df') # these have start-1 coordinates compared to original GTEx matrix

# make Name column to match GTEx file by first need to add "chr" before Chromosome column and subtract 1 from Start column 
clusts["Name"] = "chr" + clusts["Chromosome"].astype(str) + "_" + (clusts["Start"]+1).astype(str) + "_" + clusts["End"].astype(str)

In [3]:
# Remove singleton clusters where Count == 1
clusts = clusts[clusts["Count"] > 1]
len(clusts.Name.unique())

88308

In [4]:
# order clusts by descending count
clusts = clusts.sort_values(by="Count", ascending=False)
clusts.head()

Unnamed: 0,Chromosome,Start,End,Strand,gene_id,junction_id,gene_name,Cluster,Count,Name
50173,3,100840858,100841997,-,ENSG00000154175,3_100840859_100841997,ABI3BP,37158,44,chr3_100840859_100841997
50136,3,100810477,100811229,-,ENSG00000154175,3_100810478_100811229,ABI3BP,37158,44,chr3_100810478_100811229
50144,3,100818581,100820219,-,ENSG00000154175,3_100818582_100820219,ABI3BP,37158,44,chr3_100818582_100820219
50143,3,100817495,100818524,-,ENSG00000154175,3_100817496_100818524,ABI3BP,37158,44,chr3_100817496_100818524
50142,3,100816768,100817435,-,ENSG00000154175,3_100816769_100817435,ABI3BP,37158,44,chr3_100816769_100817435


In [5]:
# Tot junc counts 
junc_counts = pd.read_csv("/gpfs/commons/groups/knowles_lab/Karin/data/GTEx/GTEx_juncs_total_counts.txt", sep="\t")
junc_counts.columns = ["Name", "Junc_Counts"]
junc_counts = junc_counts.sort_values(by="Junc_Counts", ascending=False)
junc_counts.head()

Unnamed: 0,Name,Junc_Counts
188947,chr11_5225727_5226576,979442603
188948,chr11_5226800_5226929,666046850
122205,chr6_73517935_73518029,446680111
122206,chr6_73518265_73518353,366313147
195681,chr11_61965113_61965368,345749777


In [6]:
print(junc_counts[junc_counts["Name"] == "chr11_5226800_5226945"])
print(junc_counts[junc_counts["Name"] == "chr11_5226800_5226929"])

                         Name  Junc_Counts
188949  chr11_5226800_5226945      3004045
                         Name  Junc_Counts
188948  chr11_5226800_5226929    666046850


In [7]:
junc_counts[junc_counts["Junc_Counts"] > 15000000] #lots of highly expressed junctions ?

Unnamed: 0,Name,Junc_Counts
188947,chr11_5225727_5226576,979442603
188948,chr11_5226800_5226929,666046850
122205,chr6_73517935_73518029,446680111
122206,chr6_73518265_73518353,366313147
195681,chr11_61965113_61965368,345749777
...,...,...
235016,chr14_23418407_23419176,15009387
68492,chr3_50257091_50257499,15008536
266332,chr16_28538156_28538795,15003443
188301,chr11_1934656_1934828,15003373


In [8]:
junc_counts[junc_counts["Junc_Counts"]==0] #4400

Unnamed: 0,Name,Junc_Counts
71696,chr3_84802753_84868597,0
46385,chr2_108931033_108940210,0
355774,chrY_1196901_1198561,0
355780,chrY_1206600_1212555,0
355772,chrY_1191161_1193217,0
...,...,...
156272,chr8_102452013_102452431,0
156287,chr8_102688579_102688739,0
156353,chr8_103088956_103089422,0
156560,chr8_106216683_106217021,0


In [9]:
# gtex sample annotations 
samples = pd.read_csv("/gpfs/commons/groups/knowles_lab/Karin/data/GTEx/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt", sep="\t")
samples = samples[["SAMPID", "SMTS", "SMTSD"]].drop_duplicates()
samples.head()

Unnamed: 0,SAMPID,SMTS,SMTSD
0,GTEX-1117F-0003-SM-58Q7G,Blood,Whole Blood
1,GTEX-1117F-0003-SM-5DWSB,Blood,Whole Blood
2,GTEX-1117F-0003-SM-6WBT7,Blood,Whole Blood
3,GTEX-1117F-0011-R10a-SM-AHZ7F,Brain,Brain - Frontal Cortex (BA9)
4,GTEX-1117F-0011-R10b-SM-CYKQ8,Brain,Brain - Frontal Cortex (BA9)


In [10]:
# make a dataframe for each tissue type in SMTS column that has each sample ID and the tissue type with corresponding junctions and their counts 

In [34]:
clusts_simple = clusts[["Name", "Cluster", "gene_name"]].drop_duplicates()
# reset index in the dataframe
clusts_simple = clusts_simple.reset_index(drop=True)
clusts_simple.head()

Unnamed: 0,Name,Cluster,gene_name
0,chr3_100840859_100841997,37158,ABI3BP
1,chr3_100810478_100811229,37158,ABI3BP
2,chr3_100818582_100820219,37158,ABI3BP
3,chr3_100817496_100818524,37158,ABI3BP
4,chr3_100816769_100817435,37158,ABI3BP


In [35]:
import dask.dataframe as dd
gtex_juncs = '/gpfs/commons/groups/knowles_lab/Karin/data/GTEx/GTEx_Analysis_2017-06-05_v8_STARv2.5.3a_junctions.gct'

class MeltedJunctions:
    def __init__(self, file_name, clusts_names, clusts, samples):
        self.file_name = file_name
        self.clusts_names = clusts_names
        self.clusts = clusts
        self.samples = samples
        
    def melt_junctions(self):
        melted_dfs = []
        
        # Read in the file as a Dask DataFrame
        dask_df = dd.read_csv(self.file_name, sample=1000000, sep="\t")
        
        # Skip the first two rows
        with open(self.file_name) as f:
            #next(f)
            #next(f)
            header = f.readline().strip().split("\t")
        
        print("Number of samples in the file: ", len(header))
        # Group the samples by tissue
        samples_df = self.samples

        # Keep only samples that are found in the header 
        samples_df = samples_df[samples_df['SAMPID'].isin(header)]
        grouped_samples = samples_df.groupby('SMTS')['SAMPID'].apply(list)
        # Iterate over the tissues and split the count matrix
        print("Iterating over tissues...")

        # Let's also only keep the junctions in our clusts_names list
        dask_df = dask_df[dask_df['Name'].isin(self.clusts_names)] # does this work?

        for tissue, samples in grouped_samples.items():
            print("Processing tissue: ", tissue)
            # Get the column indices for the samples in the current tissue
            sample_indices = [header.index(sample) for sample in samples]
            print("Number of samples in the current tissue: ", str(len(sample_indices)))
            # Extract the columns for the current tissue
            tissue_df = dask_df.iloc[:, [0,1] + sample_indices]
            # Add the tissue name as a column
            tissue_df['Tissue'] = tissue
            # Merge with cluster info to get Cluster ID 
            tissue_df = tissue_df.merge(self.clusts, on="Name", how="left")
            # Melt the dataframe
            tissue_df = tissue_df.melt(id_vars=['Name', 'Description', 'Tissue', 'gene_name', 'Cluster'], var_name='Sample', value_name='Count')
            # Remove rows with zero counts
            tissue_df = tissue_df[tissue_df['Count'] > 0]
            melted_dfs.append(tissue_df)
        
        print("Concatenating melted dataframes...")
        return melted_dfs

In [36]:
# create an instance of the class with the file name and clusts names as arguments
melted_junctions = MeltedJunctions(gtex_juncs, clusts.Name, clusts_simple, samples)

In [37]:
# call the melt_junctions method
melted_df = melted_junctions.melt_junctions()

Number of samples in the file:  17384
Iterating over tissues...
Processing tissue:  Adipose Tissue
Number of samples in the current tissue:  1204
Processing tissue:  Adrenal Gland
Number of samples in the current tissue:  258
Processing tissue:  Bladder
Number of samples in the current tissue:  21
Processing tissue:  Blood
Number of samples in the current tissue:  929
Processing tissue:  Blood Vessel
Number of samples in the current tissue:  1335
Processing tissue:  Brain
Number of samples in the current tissue:  2642
Processing tissue:  Breast
Number of samples in the current tissue:  459
Processing tissue:  Cervix Uteri
Number of samples in the current tissue:  19
Processing tissue:  Colon
Number of samples in the current tissue:  779
Processing tissue:  Esophagus
Number of samples in the current tissue:  1445
Processing tissue:  Fallopian Tube
Number of samples in the current tissue:  9
Processing tissue:  Heart
Number of samples in the current tissue:  861
Processing tissue:  Kidne

In [39]:
# now we have a list of dask dataframes, one for each tissue 
# we still need to melt each one such that each row has a single patient sample, junction ID and corresponding junction count 
melted_df[0].head(10)

Unnamed: 0,Name,Description,Tissue,gene_name,Cluster,Sample,Count
2,chr1_827776_829002,ENSG00000228794.8,Adipose Tissue,LINC01128,8097,GTEX-1117F-0226-SM-5GZZ7,15
4,chr1_829105_841199,ENSG00000228794.8,Adipose Tissue,LINC01128,8098,GTEX-1117F-0226-SM-5GZZ7,5
5,chr1_829105_847653,ENSG00000228794.8,Adipose Tissue,LINC01128,8098,GTEX-1117F-0226-SM-5GZZ7,7
6,chr1_829105_851926,ENSG00000228794.8,Adipose Tissue,LINC01128,8098,GTEX-1117F-0226-SM-5GZZ7,8
7,chr1_847807_849483,ENSG00000228794.8,Adipose Tissue,LINC01128,8098,GTEX-1117F-0226-SM-5GZZ7,4
8,chr1_847807_851926,ENSG00000228794.8,Adipose Tissue,LINC01128,8098,GTEX-1117F-0226-SM-5GZZ7,3
9,chr1_849603_850177,ENSG00000228794.8,Adipose Tissue,LINC01128,8098,GTEX-1117F-0226-SM-5GZZ7,1
10,chr1_849603_850180,ENSG00000228794.8,Adipose Tissue,LINC01128,8098,GTEX-1117F-0226-SM-5GZZ7,3
11,chr1_849603_851926,ENSG00000228794.8,Adipose Tissue,LINC01128,8098,GTEX-1117F-0226-SM-5GZZ7,1
12,chr1_850352_851926,ENSG00000228794.8,Adipose Tissue,LINC01128,8098,GTEX-1117F-0226-SM-5GZZ7,6


In [27]:
melted_df[0].index.compute()[0]

KeyboardInterrupt: 

In [None]:
# merge with clusts to get cluster id
df = melted_df.merge(clusts, on="Name", how="left")
df.head()

In [None]:
print(len(clusts.Name.unique()))

In [None]:
clusts.head()

In [None]:
#need to get total cluster counts for each cell 
clust_counts= df.groupby(["SAMPID", "Cluster"])["junc_count"].sum().reset_index()
clust_counts.style.hide_index()
clust_counts.columns = ['SAMPID', 'Cluster', 'Cluster_Counts']    
clust_counts.head()

In [None]:
list(df)
df.drop(["Chromosome", "Start", "End", "junction_id"], axis=1, inplace=True)
#df.drop("Chromosome")
#df.drop("Start")
#df.drop("End")
#df.drop("junction_id")

In [None]:
df.head()

In [None]:
summarized_data = clust_counts.merge(df) #combine cluster counts with junction counts 

In [None]:
#save file and use as input for LDA script 
summarized_data["junc_ratio"] = summarized_data["junc_count"] / summarized_data["Cluster_Counts"]
summarized_data['sample_id_index'] = summarized_data.groupby('SAMPID').ngroup()
summarized_data['junction_id_index'] = summarized_data.groupby('Name').ngroup()

In [None]:
summarized_data.head()

In [None]:
summarized_data.to_hdf("/gpfs/commons/groups/knowles_lab/Karin/data/GTEx/GTEx_junction_cluster_counts" + ".h5", key='df', mode='w', format="table")

In [None]:
import os 
os.getcwd()