In [1]:
import pandas as pd
import argparse
from ast import literal_eval
import numpy as np
import itertools
from io import BytesIO
import tqdm
import dask.dataframe as dd

In [2]:
# load the clustered data /gpfs/commons/groups/knowles_lab/Karin/data/GTEx/clustered_junctions.h5
clusts = pd.read_hdf("/gpfs/commons/groups/knowles_lab/Karin/data/GTEx/clustered_junctions.h5", key='df') # these have start-1 coordinates compared to original GTEx matrix

# make Name column to match GTEx file by first need to add "chr" before Chromosome column and subtract 1 from Start column 
clusts["Name"] = "chr" + clusts["Chromosome"].astype(str) + "_" + (clusts["Start"]+1).astype(str) + "_" + clusts["End"].astype(str)

clusts.head()

Unnamed: 0,Chromosome,Start,End,Strand,gene_id,junction_id,gene_name,Cluster,Count,Name
0,1,169683625,169683755,+,ENSG00000000460,1_169683626_169683755,C1orf112,1,1,chr1_169683626_169683755
1,1,169798958,169800882,+,ENSG00000000460,1_169798959_169800882,C1orf112,5,3,chr1_169798959_169800882
2,1,169798958,169802620,+,ENSG00000000460,1_169798959_169802620,C1orf112,5,3,chr1_169798959_169802620
3,1,169800971,169802620,+,ENSG00000000460,1_169800972_169802620,C1orf112,5,3,chr1_169800972_169802620
4,1,169804240,169806003,+,ENSG00000000460,1_169804241_169806003,C1orf112,8,1,chr1_169804241_169806003


In [3]:
# Remove singleton clusters where Count == 1
clusts = clusts[clusts["Count"] > 1]
len(clusts.Name.unique())

141034

In [4]:
# gtex sample annotations 
samples = pd.read_csv("/gpfs/commons/groups/knowles_lab/Karin/data/GTEx/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt", sep="\t")
samples = samples[["SAMPID", "SMTS", "SMTSD"]].drop_duplicates()
samples.head()

Unnamed: 0,SAMPID,SMTS,SMTSD
0,GTEX-1117F-0003-SM-58Q7G,Blood,Whole Blood
1,GTEX-1117F-0003-SM-5DWSB,Blood,Whole Blood
2,GTEX-1117F-0003-SM-6WBT7,Blood,Whole Blood
3,GTEX-1117F-0011-R10a-SM-AHZ7F,Brain,Brain - Frontal Cortex (BA9)
4,GTEX-1117F-0011-R10b-SM-CYKQ8,Brain,Brain - Frontal Cortex (BA9)


In [37]:
from dataclasses import dataclass
from typing import List
import pandas as pd
gtex_juncs = '/gpfs/commons/groups/knowles_lab/Karin/data/GTEx/GTEx_Analysis_2017-06-05_v8_STARv2.5.3a_junctions.gct'

class MeltedJunctions:
    def __init__(self, file_name, clusts_names):
        self.file_name = file_name
        self.clusts_names = clusts_names
        
    def melt_junctions(self, id_vars=['Name', 'Description']):
        melted_dfs = []
        
        print("hello 1")

        # extract the header from the file
        with open(self.file_name) as f:
            # Skip the first two rows
            next(f)
            next(f)
            header = f.readline().strip().split("\t")

        # Read in the file in chunks and melt each chunk individually
        for chunk in pd.read_csv(self.file_name, sep='\t', chunksize=2048, skiprows=[0,1], header=2):
            print("hello 2")
            # Ensure the chunk has the same header as the file
            chunk.columns = header
            # only keep the junctions that are in the clustered junctions
            chunk = chunk[chunk["Name"].isin(self.clusts_names)]
            melted_chunk = pd.melt(chunk, id_vars=id_vars, var_name='SAMPID', value_name='junc_count')
            melted_chunk['junc_count'] = melted_chunk['junc_count'].astype(int)
            melted_chunk = melted_chunk[melted_chunk['junc_count'] > 1]
            melted_dfs.append(melted_chunk)
            print(melted_chunk.head())
        
        # Concatenate all of the melted chunks into a single dataframe
        melted_df = pd.concat(melted_dfs, ignore_index=True)
        
        return melted_df

In [38]:
# create an instance of the class with the file name and clusts names as arguments
melted_junctions = MeltedJunctions(gtex_juncs, clusts.Name)

# call the melt_junctions method
melted_df = melted_junctions.melt_junctions()

hello 1
hello 2
                  Name        Description                    SAMPID  \
29  chr1_827776_829002  ENSG00000228794.8  GTEX-1117F-0226-SM-5GZZ7   
31  chr1_829105_841199  ENSG00000228794.8  GTEX-1117F-0226-SM-5GZZ7   
32  chr1_829105_847653  ENSG00000228794.8  GTEX-1117F-0226-SM-5GZZ7   
33  chr1_829105_851926  ENSG00000228794.8  GTEX-1117F-0226-SM-5GZZ7   
36  chr1_847807_849483  ENSG00000228794.8  GTEX-1117F-0226-SM-5GZZ7   

    junc_count  
29          15  
31           5  
32           7  
33           8  
36           4  
hello 2
                   Name         Description                    SAMPID  \
0  chr1_8015385_8015494  ENSG00000116285.12  GTEX-1117F-0226-SM-5GZZ7   
1  chr1_8015401_8015494  ENSG00000116285.12  GTEX-1117F-0226-SM-5GZZ7   
6  chr1_8335591_8337815  ENSG00000162426.14  GTEX-1117F-0226-SM-5GZZ7   
7  chr1_8355600_8356099  ENSG00000142599.17  GTEX-1117F-0226-SM-5GZZ7   
9  chr1_8356247_8358195  ENSG00000142599.17  GTEX-1117F-0226-SM-5GZZ7   

   junc_

In [None]:
clusts.head()

In [None]:
clusts[clusts["Name"] == "chr1_44777790_44777999"]

In [None]:
len(df.merge(clusts, on="Name", how="left").Name.unique())

In [None]:
# merge with clusts to get cluster id
df = df.merge(clusts, on="Name", how="left")
df.head()

In [None]:
#need to get total cluster counts for each cell 
clust_counts= df.groupby(["SAMPID", "Cluster"])["junc_count"].sum().reset_index()
clust_counts.style.hide_index()
clust_counts.columns = ['SAMPID', 'Cluster', 'Cluster_Counts']    
clust_counts.head()

In [None]:
list(df)
df.drop(["Chromosome", "Start", "End", "junction_id"], axis=1, inplace=True)
#df.drop("Chromosome")
#df.drop("Start")
#df.drop("End")
#df.drop("junction_id")

In [None]:
df.head()

In [None]:
summarized_data = clust_counts.merge(df)

#save file and use as input for LDA script 
summarized_data["junc_ratio"] = summarized_data["junc_count"] / summarized_data["Cluster_Counts"]
summarized_data['sample_id_index'] = summarized_data.groupby('SAMPID').ngroup()
summarized_data['junction_id_index'] = summarized_data.groupby('Name').ngroup()

In [None]:
summarized_data.head()

In [None]:
summarized_data.to_hdf("/gpfs/commons/groups/knowles_lab/Karin/data/GTEx/GTEx_junction_cluster_counts" + ".h5", key='df', mode='w', format="table")

In [None]:
import os 
os.getcwd()