In [1]:
from pathlib import Path
import pandas as pd

In [2]:
results_path = Path('/external/rprshnas01/netdata_kcni/stlab/he_human_processed/RSEM_results')

In [3]:
rsem_result_dirs = list(results_path.glob('*'))

In [4]:
len(rsem_result_dirs)

102

In [5]:
sample_dfs = [] 
for i, result in enumerate(rsem_result_dirs):
    sample_counts = result.glob('.genes.results')
    counts_path = list(sample_counts)
    
    if len(counts_path) > 0:
        #print(f'SampleID: {result.stem}, {counts_path}')
        try:
            df = pd.read_csv(counts_path[0], sep='\t')
            df = df.loc[:, ['gene_id', 'expected_count']]
            df.rename(columns={"expected_count": result.stem}, inplace=True)
            sample_dfs.append(df.set_index('gene_id'))
        except pd.io.common.EmptyDataError:
            print(f'EmptyDataError for file: {counts_path[0]}')
    else:
        print(result.stem)
    #print('**'*50)


In [7]:
concat = pd.concat(sample_dfs, axis=1)

In [8]:
# use this table to convert ENSG to gene_symbols
convert_genes = pd.read_table('../ENSG_to_gene_name.tsv', sep='\t', header=None, names=['gene_id', 'gene_symbol'])

In [9]:
convert_genes.head()

Unnamed: 0,gene_id,gene_symbol
0,ENSG00000000003,TSPAN6
1,ENSG00000000005,TNMD
2,ENSG00000000419,DPM1
3,ENSG00000000457,SCYL3
4,ENSG00000000460,C1orf112


In [10]:
counts_matrix = concat.merge(convert_genes, left_index=True, right_on=['gene_id']).set_index('gene_symbol').drop('gene_id', axis=1)

In [11]:
counts_matrix

Unnamed: 0_level_0,SRR2815999_RNA-Seq_of_Human_PFC_section_DS1-Human3-S12,SRR2815975_RNA-Seq_of_Human_PFC_section_DS1-Human2-S6,SRR2815970_RNA-Seq_of_Human_PFC_section_DS1-Human2-S1,SRR2816098_RNA-Seq_of_Human_PFC_section_DS2-HumanA-Rep1-S6,SRR2815998_RNA-Seq_of_Human_PFC_section_DS1-Human3-S11,SRR2815958_RNA-Seq_of_Human_PFC_section_DS1-Human1-S7,SRR2815980_RNA-Seq_of_Human_PFC_section_DS1-Human2-S11,SRR2816122_RNA-Seq_of_Human_PFC_section_DS2-HumanB-S10,SRR2816095_RNA-Seq_of_Human_PFC_section_DS2-HumanA-Rep1-S3,SRR2816110_RNA-Seq_of_Human_PFC_section_DS2-HumanA-Rep2-S8,...,SRR2815981_RNA-Seq_of_Human_PFC_section_DS1-Human2-S12,SRR2816020_RNA-Seq_of_Human_PFC_section_DS1-Human4-S15,SRR2816004_RNA-Seq_of_Human_PFC_section_DS1-Human3-S17,SRR2816016_RNA-Seq_of_Human_PFC_section_DS1-Human4-S11,SRR2816001_RNA-Seq_of_Human_PFC_section_DS1-Human3-S14,SRR2815984_RNA-Seq_of_Human_PFC_section_DS1-Human2-S15,SRR2815994_RNA-Seq_of_Human_PFC_section_DS1-Human3-S7,SRR2815991_RNA-Seq_of_Human_PFC_section_DS1-Human3-S4,SRR2816094_RNA-Seq_of_Human_PFC_section_DS2-HumanA-Rep1-S2,SRR2815985_RNA-Seq_of_Human_PFC_section_DS1-Human2-S16
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,58.00,68.00,314.00,43.00,79.0,47.00,57.00,63.00,82.00,73.00,...,58.00,52.00,43.00,52.00,68.00,0.00,66.00,68.00,47.00,92.00
TNMD,0.00,2.00,2.00,3.00,2.0,1.00,2.00,0.00,2.00,1.00,...,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0.00,1.00,0.00
DPM1,145.00,216.00,98.00,166.00,205.0,189.00,150.00,85.00,236.00,162.00,...,122.00,129.00,132.00,153.00,136.00,0.00,168.00,169.00,127.00,147.00
SCYL3,60.51,112.42,113.96,111.44,109.4,125.11,75.88,71.65,198.27,117.78,...,98.30,114.76,94.52,94.85,90.49,77.78,99.51,93.43,95.58,76.90
C1orf112,12.49,12.58,39.04,21.56,7.6,10.89,9.12,33.35,48.73,34.22,...,11.70,27.24,32.48,10.15,18.51,20.22,6.49,11.57,23.42,17.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC064824.1,26.08,22.67,20.37,24.77,21.8,31.16,27.83,25.44,29.89,26.48,...,23.06,31.90,61.55,31.55,47.92,0.00,4.82,20.00,13.85,20.69
AL136225.2,0.00,2.00,0.00,0.00,0.0,0.00,0.00,1.00,0.00,0.00,...,0.00,4.00,3.00,0.00,1.00,0.00,0.00,1.00,1.00,0.00
AC004636.1,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
AC007687.1,6.84,0.00,0.00,0.08,0.0,10.87,0.00,0.00,0.00,0.00,...,0.00,0.00,0.19,0.00,0.00,0.00,0.11,0.56,0.00,0.00


In [12]:
counts_matrix.to_csv('results/102_counts_matrix.csv')