# Finding Genetic Outliers in Copy Number Variation



In [1]:
# Install necessary packages
!pip install cptac
!pip install xlsxwriter

Collecting cptac
[?25l  Downloading https://files.pythonhosted.org/packages/19/71/1db2ae1e64d165e3bd30643d3490e95780fd0d0306232c5a367d647495ab/cptac-0.7.5-py3-none-any.whl (5.0MB)
[K     |████████████████████████████████| 5.0MB 4.3MB/s 
[?25hCollecting beautifulsoup4>=4.7.1
[?25l  Downloading https://files.pythonhosted.org/packages/e8/b5/7bb03a696f2c9b7af792a8f51b82974e51c268f15e925fc834876a4efa0b/beautifulsoup4-4.9.0-py3-none-any.whl (109kB)
[K     |████████████████████████████████| 112kB 38.6MB/s 
Collecting xlrd>=1.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/b0/16/63576a1a001752e34bf8ea62e367997530dc553b689356b9879339cf45a4/xlrd-1.2.0-py2.py3-none-any.whl (103kB)
[K     |████████████████████████████████| 112kB 32.8MB/s 
Collecting soupsieve>1.2
  Downloading https://files.pythonhosted.org/packages/05/cf/ea245e52f55823f19992447b008bcbb7f78efc5960d77f6c34b5b45b36dd/soupsieve-2.0-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4, xl

In [2]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import xlsxwriter
import cptac

  import pandas.util.testing as tm


## Download CNV data

In [3]:
# Fetch the endometrial data files, instantiate the dataset object and pull out
# its CNV table (consumed further down as a DataFrame with one column per gene).
cptac.download(dataset="endometrial")
en = cptac.Endometrial()
en_cnv = en.get_CNV()



In [4]:
# Fetch the GBM data files, instantiate the dataset object and pull out its CNV table.
cptac.download(dataset="gbm")
gbm = cptac.Gbm()
gbm_cnv = gbm.get_CNV()





In [5]:
# Fetch the BRCA data files, instantiate the dataset object and pull out its CNV table.
cptac.download(dataset="brca")
brca = cptac.Brca()
brca_cnv = brca.get_CNV()



In [6]:
# Fetch the HNSCC data files, instantiate the dataset object and pull out its CNV table.
# NOTE: in the recorded run this download prompted interactively for a dataset
# password, so this cell cannot run unattended.
cptac.download(dataset="hnscc")
hnscc = cptac.Hnscc()
hnscc_cnv = hnscc.get_CNV()

Password for hnscc dataset: ··········


In [7]:
# Fetch the LUAD data files, instantiate the dataset object and pull out its CNV table.
cptac.download(dataset="luad")
luad = cptac.Luad()
luad_cnv = luad.get_CNV()





In [8]:
# Fetch the ovarian data files, instantiate the dataset object and pull out its CNV table.
cptac.download(dataset="ovarian")
ovarian = cptac.Ovarian()
ovarian_cnv = ovarian.get_CNV()



In [9]:
# Fetch the CCRCC data files, instantiate the dataset object and pull out its CNV table.
cptac.download(dataset="ccrcc")
ccrcc = cptac.Ccrcc()
ccrcc_cnv = ccrcc.get_CNV()



## Number of Patients with Significant Copy Number Variants per Gene per Cancer Type

In [0]:
def separate(hiCut, lowCut, data, skipna=False):
  """
  Counts the number of patients with a high, neutral or low value for each gene.

  Parameters
  ----------
  hiCut : float
    Values strictly greater than this cutoff are counted as "high".
  lowCut : float
    Values strictly less than this cutoff (and not already "high") are
    counted as "low".
  data : DataFrame or mapping of column -> sequence of numbers
    Iterated column by column; ``data[column]`` must yield the numeric values
    of that column.
  skipna : bool, default False
    NaN compares False against both cutoffs, so by default (False) missing
    values end up in the "neutral" count. Pass True to leave NaN values out of
    all three counts.

  Returns
  -------
  dict
    ``{column: [numHi, numNeu, numLow]}`` with plain Python ints.
  """
  sepDict = dict()
  for column in data:
    # One vectorized pass per column instead of a Python-level loop per value.
    values = np.asarray(data[column], dtype=float)
    isHi = values > hiCut
    numHi = int(np.count_nonzero(isHi))
    # "high" takes precedence over "low", mirroring an if/elif chain.
    numLow = int(np.count_nonzero((values < lowCut) & ~isHi))
    total = int(values.size)
    if skipna:
      total -= int(np.count_nonzero(np.isnan(values)))
    sepDict[column] = [numHi, total - numHi - numLow, numLow]
  return sepDict

In [0]:
def get_counts_and_average(data, hiCut=.3, lowCut=-.2):
  """
  Takes a CNV dataframe and generates a dataframe with the counts of high,
  neutral and low values plus the per-column average.

  Parameters
  ----------
  data : DataFrame
    CNV values; each column is summarised independently.
  hiCut : float, default 0.3
    Cutoff above which a value is counted as "high" (passed to ``separate``).
  lowCut : float, default -0.2
    Cutoff below which a value is counted as "low" (passed to ``separate``).

  Returns
  -------
  DataFrame
    Rows "high", "neutral", "low" and "average"; one column per column of
    ``data``, with the columns sorted.
  """
  counts = pd.DataFrame(data=separate(hiCut, lowCut, data), index=["high", "neutral", "low"])
  average = pd.DataFrame(data.mean(0), columns=['average']).transpose()
  # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
  # (sort=True keeps the column sorting that append(..., sort=True) performed).
  return pd.concat([counts, average], sort=True)

In [0]:
# Summarise every cancer type's CNV table: per gene, the number of patients with
# high / neutral / low values and the average value.
en_counts = get_counts_and_average(en_cnv)
gbm_counts = get_counts_and_average(gbm_cnv)
brca_counts = get_counts_and_average(brca_cnv)
hnscc_counts = get_counts_and_average(hnscc_cnv)
luad_counts = get_counts_and_average(luad_cnv)
ovarian_counts = get_counts_and_average(ovarian_cnv)
ccrcc_counts = get_counts_and_average(ccrcc_cnv)

In [0]:
# Save the BRCA dataframe for Figure 1

In [0]:
# Writes the high / neutral / low / average rows (one column per gene) to the
# working directory; the row labels are written as the first CSV column.
brca_counts.to_csv('brca_counts.csv')

## Finding Significant Genes in Each Cancer Type

In [0]:
def get_sig_genes(data):
  """
  Flags the rows whose 'high' count is an upper outlier.

  The cutoff is the upper Tukey fence of the 'high' column:
  Q3 + 1.5 * (Q3 - Q1). Rows with a 'high' value strictly above the fence
  are kept.

  Parameters
  ----------
  data : DataFrame
    Must contain a 'high' column (one row per gene).

  Returns
  -------
  tuple
    ``(cutoff, outlier_rows)``: the fence value and the subset of ``data``
    whose 'high' value exceeds it.
  """
  high_counts = data['high']
  lower_quartile, upper_quartile = np.quantile(high_counts, [0.25, 0.75])
  interquartile_range = upper_quartile - lower_quartile
  upper_fence = upper_quartile + (interquartile_range * 1.5)
  is_outlier = high_counts > upper_fence
  return upper_fence, data[is_outlier]

In [0]:
# The counts tables have one column per gene and the rows high/neutral/low/average,
# so they are transposed first: get_sig_genes expects 'high' to be a column.
# Each call yields the outlier cutoff and the rows (genes) above it.
en_sig_genes_cutoff, en_sig_genes = get_sig_genes(en_counts.transpose())
gbm_sig_genes_cutoff, gbm_sig_genes = get_sig_genes(gbm_counts.transpose())
brca_sig_genes_cutoff, brca_sig_genes = get_sig_genes(brca_counts.transpose())
hnscc_sig_genes_cutoff, hnscc_sig_genes = get_sig_genes(hnscc_counts.transpose())
luad_sig_genes_cutoff, luad_sig_genes = get_sig_genes(luad_counts.transpose())
ovarian_sig_genes_cutoff, ovarian_sig_genes = get_sig_genes(ovarian_counts.transpose())
ccrcc_sig_genes_cutoff, ccrcc_sig_genes = get_sig_genes(ccrcc_counts.transpose())

In [18]:
# Sanity check: the BRCA outlier cutoff, i.e. the number of patients with a
# "high" value a gene must exceed to be reported as significant.
print(brca_sig_genes_cutoff)

41.0


In [0]:
# Create Cutoff DataFrame: a single row holding the outlier cutoff of every cancer type.
# Every value is wrapped in a one-element list so all columns are built the same
# way (previously 'ENDO' was a bare scalar that only worked because the other
# columns supplied the row length).
cutoff = pd.DataFrame(data={'ENDO': [en_sig_genes_cutoff],
             'GBM': [gbm_sig_genes_cutoff],
             'BRCA': [brca_sig_genes_cutoff],
             'HNSCC': [hnscc_sig_genes_cutoff],
             'LUAD': [luad_sig_genes_cutoff],
             'OVARIAN': [ovarian_sig_genes_cutoff],
             'CCRCC': [ccrcc_sig_genes_cutoff]})
cutoff.to_csv('cutoffs.csv')

In [0]:
# Create DataFrame of significant gene lists: one column per cancer type, padded
# with NaN where a cancer type has fewer significant genes than the longest list.
# get_level_values(0) is applied to every index, not just BRCA and CCRCC: on a
# flat index it returns the same labels, and on a MultiIndex it yields the
# first-level gene name instead of a tuple.
sig_genes_by_cancer = pd.concat([pd.DataFrame({'ENDO': list(en_sig_genes.index.get_level_values(0))}),
             pd.DataFrame({'GBM': list(gbm_sig_genes.index.get_level_values(0))}),
             pd.DataFrame({'BRCA': list(brca_sig_genes.index.get_level_values(0))}),
             pd.DataFrame({'HNSCC': list(hnscc_sig_genes.index.get_level_values(0))}),
             pd.DataFrame({'LUAD': list(luad_sig_genes.index.get_level_values(0))}),
             pd.DataFrame({'OVARIAN': list(ovarian_sig_genes.index.get_level_values(0))}),
             pd.DataFrame({'CCRCC': list(ccrcc_sig_genes.index.get_level_values(0))})], axis=1)
sig_genes_by_cancer.to_csv("sig_genes_by_cancer.csv")

## Finding Common Significant Genes

In [0]:
def CountFrequency(my_list, min_occurrences=4):
  """
  Counts how often each gene appears in the given list.

  Parameters
  ----------
  my_list : iterable of hashable
    Gene identifiers, typically the concatenated per-cancer gene lists.
  min_occurrences : int, default 4
    Minimum number of appearances required for a gene to be returned.
    The default reproduces the original ``value > 3`` filter.
    NOTE(review): the earlier docstring said "3 or more times" while the code
    kept only genes seen more than 3 times; pass ``min_occurrences=3`` if
    "3 or more" is what is actually wanted.

  Returns
  -------
  list
    Genes that appear at least ``min_occurrences`` times, in order of first
    appearance.
  """
  from collections import Counter  # local import keeps this cell self-contained

  # Counter tallies in a single pass (calling list.count per element is quadratic)
  # and preserves first-appearance order.
  freq = Counter(my_list)
  return [gene for gene, count in freq.items() if count >= min_occurrences]

In [0]:
# Pool the significant genes of all seven cancer types and keep the ones that
# pass CountFrequency's frequency filter.
# get_level_values(0) is applied to every index so each entry is a plain
# first-level label; a tuple from a MultiIndex would never match the same gene
# coming from another cancer type. On a flat index it returns the labels unchanged.
sig_genes = CountFrequency(
    list(gbm_sig_genes.index.get_level_values(0))
    + list(hnscc_sig_genes.index.get_level_values(0))
    + list(luad_sig_genes.index.get_level_values(0))
    + list(ovarian_sig_genes.index.get_level_values(0))
    + list(ccrcc_sig_genes.index.get_level_values(0))
    + list(brca_sig_genes.index.get_level_values(0))
    + list(en_sig_genes.index.get_level_values(0))
)

In [0]:
# Store the recurring genes as a single-column table ('gene') and write it to
# the working directory; the default integer index is written as the first CSV column.
sig_genes_df = pd.DataFrame(data={'gene': sig_genes})
sig_genes_df.to_csv("sig_genes.csv")