## Matrix 합치기
- FASTQ로부터 align 한 다음 htseq count 한 raw count를 받아보자
- Single end : Fibula, Mandible/Maxilla/Tibia, ENCODE Osteocyte
- 우리 데이터 : "/data/project/OPLL/0.rnaraw/Analysis/~230316/00.PCA&Demographics/df/df_coladd.txt"


In [None]:
from Bio import SeqIO
from biomart import BiomartServer
import pandas as pd
import numpy as np
import os
import palettable
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

# Import R packages in Python.
# Each importr() call comes from rpy2.robjects.packages and is given the name
# of an R package; the returned handle is bound to a module-level name.
base = importr('base')
utils = importr('utils')
deseq2 = importr('DESeq2')
# sva = importr ("sva")    # for Combat_seq

#from reComBat import reComBat

# Tableau_20 palette from palettable, read through its `mpl_colors` attribute
# (presumably matplotlib-ready colour tuples used for plotting -- TODO confirm).
tabl = palettable.tableau.Tableau_20.mpl_colors

- GSE175236 (ENCODE Osteocyte),   GSE78608 (ENCODE Osteoblast), GSE227994 (Oxford_Ob)
- GSE220630 (Fibula, 5개), GSE149167 (maxilla, mandible, tibia)
- GSE188760 (OPLL, CSM, 2023, 4개), GSE69787 (OPLL, Lig, 2016, 4개), GSE186542 (NP, 6개)
- GSE197172 (BM-MSC, 3개),  GSE102312 (BM-MSC_sb, 6개)

In [32]:
# Merge the per-sample raw count tables of every public project into a single
# matrix and remember which project each sample column came from.
#
# Results (module-level names read by the following cells):
#   df_concat   - DataFrame with a GENE_SYMBOL column plus one count column per
#                 sample; genes missing from a sample are filled with 0.
#                 Stays None when no sample file was found.
#   batch_list  - one label per sample column, in column order; the label is
#                 the position of the project in the list below.
#   batch_index - the next unused label after the loop.
batch_list = []
batch_index = 0
df_concat = None  # reset so a re-run never merges into a matrix left over from an earlier run

for project in ["GSE175236", "GSE220630", "GSE149167", "GSE188760", "GSE69787", "GSE186542", "GSE197172"]:
    DIR = "/data/project/OPLL/0.rnaraw/99.public/" + project + "/03.matrix"
    for filename in sorted(os.listdir(DIR)):
        INPUT_TSV = os.path.join(DIR, filename)
        if not os.path.isfile(INPUT_TSV):
            continue

        # Same filter as before: the name must contain "tsv" and must not be a
        # "norm_counts" table. Checked first so unrelated files never reach the
        # name parsing below.
        if "tsv" not in filename or "norm_counts" in filename:
            continue

        if project == "GSE149167":
            # Samples whose second "_"-separated name field is 4, 5 or 6 are
            # left out. The length guard avoids an IndexError for file names
            # that contain no "_" at all.
            name_fields = filename.split("_")
            if len(name_fields) > 1 and name_fields[1].split(".tsv")[0] in ("4", "5", "6"):
                continue

        # The sample ID is the file name up to its first "."
        SAMPLE_ID = filename.split(".")[0]

        # The files are read as header-less two-column tables: gene symbol, count.
        df = pd.read_csv(INPUT_TSV, sep="\t", names=["GENE_SYMBOL", SAMPLE_ID])

        batch_list.append(batch_index)

        if df_concat is None:  # very first sample
            df_concat = df
        else:
            # Join explicitly on GENE_SYMBOL. Without `on=`, pandas joins on every
            # shared column name, so a sample ID repeated across projects would
            # silently become part of the join key instead of a separate column.
            df_concat = pd.merge(df_concat, df, on="GENE_SYMBOL", how="outer").fillna(0)

    # One label per project, advanced even if the project contributed no sample.
    batch_index += 1

In [33]:
# # GSE220630 (Fibula) : single end 라서 어쩔 수 없이 그냥 matrix를 합쳐주자

# import pandas as pd
# import numpy as np
# import pyensembl

# pyensemble = pyensembl.EnsemblRelease(77)   # release 77 uses human reference genome GRCh38

# INPUT_TSV = "/data/project/OPLL/0.rnaraw/99.public/GSE220630/03.matrix/GSE220630_norm_counts.tsv"
# df = pd.read_csv (INPUT_TSV, sep = "\t").iloc[ : , [0, 1, 6, 13, 19] ]  # 4개만 골라주자
# df.columns = ["ENSEMBL_ID", "Fibula_1", "Fibula_2", "Fibula_3", "Fibula_4"]
# df ["GENE_SYMBOL"] = ""

# for i in range (df.shape[0]):
#     E_ID = df.iloc [i, 0]
#     try:
#         #print ( "{}\t{}".format(E_ID, pyensemble.gene_name_of_gene_id( E_ID ) ) )
#         df.iloc[i, -1] = pyensemble.gene_name_of_gene_id( E_ID )
#     except:
#         #print ( "{} : No Gene Symbol".format (E_ID))
#         print ( "", end = "")
    
#     if i % 5000 == 0:
#         print ("", end = " ")


# df_concat = pd.merge (df_concat, df ,   left_on = "GENE_SYMBOL", right_on = "GENE_SYMBOL", how = "left").fillna (0)
# # Reorder DataFrame
# cols = df_concat.columns.tolist()
# cols.insert(1, cols.pop(cols.index('ENSEMBL_ID')))
# df_concat = df_concat[cols] 
# df_concat = df_concat.iloc[:-5, ]

# # Update batch_list
# batch_list = batch_list + [batch_index] * 4
# batch_index = batch_index + 1


### GTEX의 whole blood data 4개를 합쳐주자

In [34]:
# Append a few GTEx whole-blood samples to the merged matrix as one extra batch.
df = pd.read_csv("/data/project/OPLL/0.rnaraw/99.public/GTEX/gene_reads_2017-06-05_v8_whole_blood.txt", sep="\t")

NUM_SELECT_GTEX = 4
# Column 2 is used as the gene symbol; the next NUM_SELECT_GTEX columns are the
# samples that get picked.
df = df.iloc[:, [2] + list(range(3, 3 + NUM_SELECT_GTEX))]
df.columns = ["GENE_SYMBOL"] + ["WB_{}".format(i) for i in range(1, df.shape[1])]
#df.head()

# A left merge repeats a left-hand row once per matching right-hand row, so a
# gene symbol listed more than once in this table would duplicate rows of
# df_concat. Sum the counts of repeated symbols first so that each symbol
# matches at most one row.
# NOTE(review): assumes the sample columns are numeric read counts and that
# summing repeated symbols is the intended way to collapse them -- confirm.
df = df.groupby("GENE_SYMBOL", as_index=False, sort=False).sum()

# Keep only genes already present in df_concat; genes absent from GTEx get 0.
df_concat = pd.merge(df_concat, df, on="GENE_SYMBOL", how="left").fillna(0)

# One batch label per sample column that was actually added.
batch_list = batch_list + [batch_index] * (df.shape[1] - 1)
batch_index = batch_index + 1

## Our data

In [35]:
# Append our own samples to the merged matrix and extend the batch labels.
df = pd.read_csv("/data/project/OPLL/0.rnaraw/Analysis/~230316/00.PCA&Demographics/df/df_coladd.txt", sep="\t")

# group.txt holds one integer label per line. The context manager closes the
# file, and blank lines (e.g. a trailing newline at the end) are ignored
# instead of crashing int().
with open("/data/project/OPLL/0.rnaraw/Analysis/~230316/00.PCA&Demographics/coldata/group.txt", 'r') as group_file:
    coldata = [int(line.rstrip('\n')) for line in group_file if line.strip()]

# Shift our labels past the largest label used so far. Done element by element
# so batch_list keeps plain Python ints rather than NumPy scalars.
label_offset = int(np.max(batch_list))
batch_list = batch_list + [label + label_offset for label in coldata]

# NOTE(review): "Out data" below is presumably a typo for "Our data"; the text
# is left unchanged so the printed output stays the same.
print ("Out data 개수 : {}\nbatch_list : {} ({}개)".format (len(coldata) , batch_list, len(batch_list)))

# Keep only genes already present in df_concat; genes absent from our table get 0.
df_concat = pd.merge(df_concat, df, on="GENE_SYMBOL", how="left").fillna(0)

# fillna(0) after a merge leaves float columns. Rebuild the frame from the first
# column plus the integer-cast count columns: assigning the cast values back
# through `.iloc[:, 1:] = ...` writes in place under pandas 2.x and would keep
# the float dtype.
df_concat = pd.concat([df_concat.iloc[:, :1], df_concat.iloc[:, 1:].astype(int)], axis=1)

Out data 개수 : 30
batch_list : [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 12, 8, 9, 10, 12, 8, 9, 10, 11, 8, 9] (71개)


In [36]:
# Write the merged count matrix (with a header row) and the per-sample batch
# labels (one per line, no header) as tab-separated files.
# An earlier variant of this cell dropped the "ENSEMBL_ID" column before saving.
df_concat.to_csv(
    "/data/project/OPLL/0.rnaraw/99.public/df_concat.tsv",
    sep="\t",
    header=True,
    index=False,
)
pd.DataFrame(batch_list).to_csv(
    "/data/project/OPLL/0.rnaraw/99.public/batch_list.tsv",
    sep="\t",
    header=False,
    index=False,
)

In [37]:
# Display the merged matrix as the cell output.
df_concat

Unnamed: 0,GENE_SYMBOL,ENCODE_Os_1,ENCODE_Os_2,ENCODE_Os_3,ENCODE_Os_4,fibula_1,fibula_2,fibula_3,fibula_4,fibula_5,...,230207-NORMAL,230207-OLF,230207-SPINOUS,230222-LF,230222-NORMAL,230222-OLF,230222-SPINOUS,230316-DISH,230316-NORMAL,230316-OPLL
0,A1BG,0,102,91,90,30,21,41,17,80,...,298,812,388,173,393,771,105,1151,642,480
1,A1BG-AS1,1,41,28,50,13,3,17,3,14,...,0,11,0,60,54,6,0,1,55,12
2,A1CF,23,0,0,0,5,3,1,3,5,...,16,27,0,0,0,4,0,3,77,2
3,A2M,6,18,12,14,7667,7103,5452,2909,16884,...,15229,31998,20449,77759,14516,21324,6991,36400,52678,66246
4,A2M-AS1,4,18,26,17,39,43,23,16,66,...,0,0,0,82,10,11,0,26,68,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26861,__no_feature,6557435,5231787,5195437,5114512,6921560,5789783,6671101,5483469,7671085,...,0,0,0,0,0,0,0,0,0,0
26862,__ambiguous,1564,139460,139337,131957,437417,373195,458426,281404,499253,...,0,0,0,0,0,0,0,0,0,0
26863,__too_low_aQual,397138,2113523,1645713,2013244,4980010,5489078,5347630,3620553,6400390,...,0,0,0,0,0,0,0,0,0,0
26864,__not_aligned,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Column-wise total of the raw read counts (overall amount of data per sample).
# Every row of df_concat is included in the sum; the first column (gene symbol)
# is left out.
t = df_concat.iloc[:, 1:].sum(axis=0).to_frame()
t.astype("int").T

Unnamed: 0,CSM_1,CSM_2,OPLL_1,OPLL_2,BM_1,BM_2,BM_3,NP_con_1,NP_con_2,NP_con_3,...,230207-NORMAL,230207-OLF,230207-SPINOUS,230222-LF,230222-NORMAL,230222-OLF,230222-SPINOUS,230316-DISH,230316-NORMAL,230316-OPLL
0,26461572,35110775,35277362,38755909,94826138,99398957,97591620,51135360,79018430,61898471,...,177248543,198142318,139643424,264841262,107705516,235989838,90419268,212121495,202178723,250859904
