# **Gene Expression Analysis**

In this ipython notebook, cancer gene expression is analyzed from the CCLE database in order to identify the relative expression of CKLF and other chemokine markers associated with cancer.

Data was obtained from the [DepMap Public 23Q2 dataset](https://depmap.org/portal/download/all/).

The following input files were used:
1.   Model.csv - demographic information and cell line details
2.   OmicsExpressionProteinCodingGenesTPMLogp1.csv - tpm gene expression values for variety of genes
3.   Gene list.xlsx - list of desired chemokine markers



In [1]:
#mount drive
from google.colab import drive
drive.mount('/content/drive')

#import statements
import pandas as pd, statistics, matplotlib.pyplot as plt, math, numpy as np
from scipy import stats
import glob
import os
import numpy as np
import warnings
warnings.filterwarnings("ignore")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#read input files
path = '/content/drive/MyDrive/Colab Notebooks/Feng lab/CCLE Database Gene Expression/'
CCLE_df =pd.read_csv(path + 'Input/OmicsExpressionProteinCodingGenesTPMLogp1.csv', header=0)
gene_df = pd.read_excel(path + 'Input/Gene list.xlsx', header=0)
gene_list = gene_df.columns.tolist()
sample_info =pd.read_csv(path +'Input/Model.csv', header=0)

# Create Excel Sheets for Relative Chemokine Expression of Each Cancer Type

In [3]:
#helper functions

#gets list of all cell lines for specified cancer type sorted alphabetically
def get_cell_lines(cancer_type):
  cancer_df = sample_info[sample_info['OncotreeSubtype'] == cancer_type]
  cell_lines = cancer_df.StrippedCellLineName.tolist()
  cell_lines.sort()
  return cell_lines

#gets list of corresponding gene names in CCLE_df based on gene_list
def get_correlated_gene_columns(gene_list):
  gene_list_formatted = [g + ' ' for g in gene_list]
  CCLE_genes = []
  for g in gene_list_formatted:
    for c in CCLE_df.columns[1:].tolist():
      if g in c:
        CCLE_genes += [c]
  return CCLE_genes

#pre-process CCLE_df to only have data for desired genes
def clean_CCLE_df(df, gene_list):
  CCLE_df = df.rename(columns={"Unnamed: 0": "Cell Line"})
  CCLE_genes = get_correlated_gene_columns(gene_list)
  CCLE_cols = ['Cell Line'] + CCLE_genes
  cleaned_CCLE = CCLE_df[CCLE_cols]
  return cleaned_CCLE

#returns full names of cell lines
def get_fullname_cell_lines(cancer_cell_lines):
  cell_line_pairs = dict(zip(sample_info['StrippedCellLineName'], sample_info['ModelID'],))
  fullname_cell_lines = [cell_line_pairs[c] for c in cancer_cell_lines]
  pairs_inverted = {y: x for x, y in cell_line_pairs.items()}
  return fullname_cell_lines, pairs_inverted

In [4]:
#main helper function to create an excel sheet from dataframe final_df2 with all desired chemokine markers for all cell lines within a specific cancer category CANCER_TYPE
def get_cancer_type(CANCER_TYPE, sample_info):
  sample_info = sample_info[['StrippedCellLineName', 'ModelID', 'PrimaryOrMetastasis', 'LegacyMolecularSubtype', 'OncotreeSubtype']]
  cancer_cell_lines = get_cell_lines(CANCER_TYPE)
  fullname_cell_lines, pairs_inverted = get_fullname_cell_lines(cancer_cell_lines)
  CCLE_genes = get_correlated_gene_columns(gene_list)
  cleaned_CCLE = clean_CCLE_df(CCLE_df, gene_list)
  cleaned_CCLE = cleaned_CCLE[cleaned_CCLE['Cell Line'].isin(fullname_cell_lines)]
  cancer_lines_in_df = cleaned_CCLE['Cell Line'].tolist()
  abbrev_cancer_cells = [pairs_inverted[c] for c in cancer_lines_in_df]

  cleaned_CCLE.drop(columns=['Cell Line'])
  cleaned_CCLE['Cell Line'] = abbrev_cancer_cells
  first_column = cleaned_CCLE.pop('Cell Line')
  cleaned_CCLE.insert(0, 'Cell Line', first_column)
  cleaned_CCLE = cleaned_CCLE.set_index('Cell Line')
  final_df = cleaned_CCLE.sort_index()

  sample_info2 = sample_info.set_index('StrippedCellLineName')
  sample_info2 = sample_info2.drop(columns=['OncotreeSubtype'])
  final_df2 = final_df.join(sample_info2)

  first_column = final_df2.pop('ModelID')
  final_df2.insert(0, 'ID', first_column)

  second_column = final_df2.pop('PrimaryOrMetastasis')
  final_df2.insert(1, 'Primary or Metastasis?', second_column)

  third_column = final_df2.pop('LegacyMolecularSubtype')
  final_df2.insert(2, 'MYCN Amp?', third_column)
  final_df2.to_excel(path+ '/Output/Cancer by Cell Lines 2023/' + CANCER_TYPE + ".xlsx")

  return final_df2


In [5]:
cancer_list = [x for x in sample_info.OncotreeSubtype.unique() if pd.isnull(x) == False] #list of all cancers in dataset

result_df = pd.DataFrame()

#for each cancer in datset, extract all associated cell lines and the corresponding gene expressions into result_df
for c in cancer_list:

  CANCER_TYPE = c
  sample_info = sample_info[['StrippedCellLineName', 'ModelID', 'PrimaryOrMetastasis', 'LegacyMolecularSubtype', 'OncotreeSubtype']]
  cancer_cell_lines = get_cell_lines(CANCER_TYPE)
  fullname_cell_lines, pairs_inverted = get_fullname_cell_lines(cancer_cell_lines)
  CCLE_genes = get_correlated_gene_columns(gene_list)
  cleaned_CCLE = clean_CCLE_df(CCLE_df, gene_list)
  cleaned_CCLE = cleaned_CCLE[cleaned_CCLE['Cell Line'].isin(fullname_cell_lines)]
  cancer_lines_in_df = cleaned_CCLE['Cell Line'].tolist()
  abbrev_cancer_cells = [pairs_inverted[c] for c in cancer_lines_in_df]

  cleaned_CCLE.drop(columns=['Cell Line'])
  cleaned_CCLE['Cell Line'] = abbrev_cancer_cells
  first_column = cleaned_CCLE.pop('Cell Line')
  cleaned_CCLE.insert(0, 'Cell Line', first_column)
  cleaned_CCLE = cleaned_CCLE.set_index('Cell Line')
  final_df = cleaned_CCLE.sort_index()

  sample_info2 = sample_info.set_index('StrippedCellLineName')
  final_df2 = final_df.join(sample_info2)

  first_column = final_df2.pop('ModelID')
  final_df2.insert(0, 'ID', first_column)
  result_df = result_df.append(final_df2)

In [6]:
#remove Na from cancer_list
cancer_list = [x for x in sample_info.OncotreeSubtype.unique() if pd.isnull(x) == False]

#for each cancer in cancer_list, create dataframe results_df storing gene expression data for all corresponding cell lines
for cancer in cancer_list:
  results_df = get_cancer_type(cancer.replace('/', '_'), sample_info)

# Create Summary Excel Sheet with Relative Chemokine Expressions for All Cancers Averaged over Cell Lines

In [7]:
#import all input excel sheets with gene expression for all cell lines corresponding to a cancer
out_path = path + 'Output/Cancer by Cell Lines 2023/'
files = os.listdir(out_path)
excel_files = [f for f in files if f[-4:] == 'xlsx']
excel_files.sort(key=str.lower)

In [8]:
#import a template excel sheet and write line by line the average expression for each chemokine marker over the cell lines per cancer
summary_df = pd.read_excel(path + 'Input/Summary_template.xlsx', header=0)
final_df = summary_df.copy(deep=True)

for file_name in excel_files:
  cancer_name = file_name[:file_name.index('.xlsx')]
  df =pd.read_excel(out_path + file_name, header=0)
  column_averages = df[df.columns[4:]].mean()

  specific_cancer_df = pd.DataFrame(column_averages).T
  specific_cancer_df.insert(0, 'Cancer Type', cancer_name)
  final_df = final_df.append(specific_cancer_df)
#final_df.to_excel(path + 'Output/Cancer Summary/Summary_Chemokines_2023.xlsx')

In [9]:
final_df

Unnamed: 0,Cancer Type,CXCL1 (2919),CXCL2 (2920),CXCL3 (2921),CXCL5 (6374),CXCL6 (6372),CXCL8 (3576),CXCL9 (4283),CXCL10 (3627),CXCL11 (6373),...,CCL25 (6370),CCL26 (10344),CCL27 (10850),CCL28 (56477),XCL1 (6375),CXCL1 (2919).1,CXCL2 (2920).1,XCL2 (6846),CX3CL1 (6376),CKLF (51192)
0,Acral Melanoma,2.593003,0.371519,0.683923,0.021186,0.017597,4.031780,0.091003,0.330517,0.381520,...,0.000000,0.824158,0.164767,2.512168,0.010661,2.593003,0.371519,0.068752,2.012378,5.901667
0,Activated B-cell Type,0.377169,0.061097,0.019000,0.169680,0.146963,0.221485,0.476476,3.025878,0.212429,...,0.028384,0.000000,0.070346,0.571865,0.000000,0.377169,0.061097,0.018861,0.993210,6.060300
0,Acute Leukemias of Ambiguous Lineage,,,,,,,,,,...,,,,,,,,,,
0,Acute Megakaryoblastic Leukemia,0.238320,0.392713,0.545122,0.084000,0.055516,1.177268,0.194783,0.405694,0.591092,...,0.315110,0.000000,0.062164,2.721691,0.063090,0.238320,0.392713,0.107062,0.035469,6.020178
0,Acute Monoblastic_Monocytic Leukemia,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Uterine Clear Cell Carcinoma,9.912485,5.915521,3.035624,9.537743,4.259272,6.090853,0.000000,0.028569,0.000000,...,0.000000,0.214125,0.028569,5.314697,0.704872,9.912485,5.915521,0.000000,3.969933,7.292230
0,Uterine Leiomyosarcoma,3.033341,2.437428,2.160733,1.834786,0.035195,3.920532,0.000000,0.028569,0.070128,...,0.231288,1.003598,0.229978,0.755025,0.000000,3.033341,2.437428,0.081749,0.219998,6.774027
0,Uterine Sarcoma_Mesenchymal,,,,,,,,,,...,,,,,,,,,,
0,Uveal Melanoma,0.464868,0.157041,0.646939,0.000000,0.001436,0.512663,0.023855,0.004264,0.012780,...,0.000000,0.370242,0.064510,2.870383,0.086711,0.464868,0.157041,0.029819,0.723781,5.790892
