In [1]:
import math

import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
#### build the de-duplicated array of gene names

# read the tab-separated gene list; the file has no header row
duplicated_genes = pd.read_csv('all_genes_duplicated.tsv', sep='\t', header=None)

# the gene names live in the first column; keep one copy of each,
# in order of first appearance
gene_column = duplicated_genes.iloc[:, 0]
genes = pd.unique(gene_column)

In [122]:
### formatting gene expression matrix for plotting
#
# Produces two frames used by later cells:
#   gene_expression_matrix          - wide (one row per gene_id, one column per
#                                     sample), values NOT log-transformed
#   gene_expression_matrix_reshaped - long (gene_id index; columns "sample" and
#                                     "gene_expression"), values log2(x + 1)

# load in gene expression matrix (tab-separated, must have a 'gene_id' column)
gene_expression_raw = pd.read_csv('gene_expression_matrix.tsv', sep='\t')

# flag gene identifiers containing a literal "?". regex=False matches the
# character itself, which avoids the invalid "\?" escape sequence that a
# plain (non-raw) string literal would need for a regex.
has_question_mark = gene_expression_raw['gene_id'].str.contains("?", regex=False)

# drop the flagged rows, keep only the part of gene_id before the first "|",
# and index by gene_id. Building a new frame in one chain (rather than
# assigning into a filtered slice and using inplace=True) avoids
# SettingWithCopyWarning and leaves the raw frame untouched on re-runs.
gene_expression_matrix = (
    gene_expression_raw.loc[~has_question_mark]
    .assign(gene_id=lambda frame: frame['gene_id'].str.split("|").str[0])
    .set_index("gene_id")
)

# subset the dataframe to contain only genes I'm interested in
# NOTE(review): missing_genes presumably lists genes of interest that are
# absent from the matrix (.loc raises KeyError on unknown labels) - confirm.
missing_genes = ['AC097634.4', 'NSD2', 'KMT2D', 'AC127029.3', 'SRSF2', 'BORCS8-MEF2B', 'PAK5', 'U2AF1L5', 'KMT2C', 'ELOC']

# keep the order of `genes` instead of going through a set difference: set
# iteration order for strings changes between kernel sessions, which made the
# row order of the output (and the saved CSV) non-reproducible.
excluded_genes = set(missing_genes)
genes_to_keep = [gene for gene in genes if gene not in excluded_genes]
gene_expression_matrix = gene_expression_matrix.loc[genes_to_keep]

# reshape to long format (gene_id stays as the index) and log-transform the
# expression values as log2(x + 1). np.log2 is vectorised over the whole
# column and more accurate than math.log(x + 1, 2) applied per element.
# Unlike math.log, it yields NaN/-inf with a RuntimeWarning (instead of
# raising ValueError) if x + 1 is not positive.
gene_expression_matrix_reshaped = (
    gene_expression_matrix
    .melt(var_name="sample", value_name="gene_expression", ignore_index=False)
    .assign(gene_expression=lambda frame: np.log2(frame["gene_expression"] + 1))
)

In [133]:
### annotate every row with its cancer type

# read the code table (no header row) and name its four columns.
# na_filter=False keeps every field as the literal text found in the file
# (presumably so that a code such as "NA" is not parsed as missing - confirm).
tss_codes = pd.read_csv('TSS_codes.tsv', sep='\t', header=None, na_filter=False)
tss_codes.columns = [
    "cancer_id",
    'group_name',
    "cancer_name",
    "group_id",
]

# lookup table: cancer_id -> cancer_name
code_dict = dict(zip(tss_codes["cancer_id"], tss_codes["cancer_name"]))

# the second '-'-separated field of each sample name is the lookup key;
# an unknown key raises KeyError
tumors_list = []
for sample_name in gene_expression_matrix_reshaped['sample']:
    tss_code = sample_name.split('-')[1]
    tumors_list.append(code_dict[tss_code])

# attach the tumor names as a new column of the long-format frame
gene_expression_matrix_reshaped["tumor"] = tumors_list

In [157]:
### violin plot of one gene's expression across tumor types
# (plotly.express is imported as px in the import cell at the top)

gene_to_plot = "BRCA2"

# select with a list so the result is always a DataFrame, even if only one
# row matches (a scalar label would collapse a single match to a Series)
fig = px.violin(
    gene_expression_matrix_reshaped.loc[[gene_to_plot]],
    y="gene_expression",
    x="tumor",
    box=True,
    title=f"{gene_to_plot} expression by tumor type",
    labels={"gene_expression": "log2(expression + 1)", "tumor": "tumor type"},
)
fig.show()

In [159]:
# save the long-format frame (gene_id index, sample, gene_expression, tumor) as CSV
output_csv_path = 'formatted_gene_expression_matrix.csv'
gene_expression_matrix_reshaped.to_csv(output_csv_path)

In [160]:
# display the final long-format frame: gene_id index with sample, gene_expression and tumor columns
gene_expression_matrix_reshaped

Unnamed: 0_level_0,sample,gene_expression,tumor
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BRD4,TCGA-OR-A5J1-01A-11R-A29S-07,10.062316,Adrenocortical carcinoma
HIST1H3H,TCGA-OR-A5J1-01A-11R-A29S-07,6.867328,Adrenocortical carcinoma
TET1,TCGA-OR-A5J1-01A-11R-A29S-07,5.737511,Adrenocortical carcinoma
VHL,TCGA-OR-A5J1-01A-11R-A29S-07,9.100744,Adrenocortical carcinoma
ESR1,TCGA-OR-A5J1-01A-11R-A29S-07,7.477621,Adrenocortical carcinoma
...,...,...,...
ARID2,TCGA-CG-4477-01A-01R-1157-13,9.888939,Stomach adenocarcinoma
AKT1,TCGA-CG-4477-01A-01R-1157-13,10.619889,Stomach adenocarcinoma
PIK3R2,TCGA-CG-4477-01A-01R-1157-13,9.783744,Stomach adenocarcinoma
XPO1,TCGA-CG-4477-01A-01R-1157-13,12.503094,Stomach adenocarcinoma


In [161]:
# spot-check a single value in the wide matrix (not log-transformed): gene PRDM14 in one sample
gene_expression_matrix.loc["PRDM14", "TCGA-OR-A5J1-01A-11R-A29S-07"]

0.0