In [1]:
import numpy as np
import pandas as pd

## Data input - Option 1 - Generate random data

In [2]:
# Generate data frame of random numbers with artificial ENSEMBL gene ID
np.seed = 1
number_of_rows = 30000
expression_df = pd.DataFrame({
    "ENSEMBL_Gene_ID": [f"ENSG{x:010}" for x in range(1, number_of_rows + 1)],
    "fold_change": np.random.randn(number_of_rows)* 5,
    "adjusted_p_value": np.random.randint(0, 10000, number_of_rows)/10000,
    "gene_length": np.abs(np.round((np.random.randn(number_of_rows)* 50000).astype(int), 0)),
    "chromosome": [f"Chr{chr_number}" for chr_number in np.random.choice(range(1, 24), size=number_of_rows)]})

In [3]:
expression_df.head(10)

Unnamed: 0,ENSEMBL_Gene_ID,fold_change,adjusted_p_value,gene_length,chromosome
0,ENSG0000000001,-3.093222,0.9598,109109,Chr5
1,ENSG0000000002,5.079444,0.0641,42231,Chr3
2,ENSG0000000003,0.361165,0.0244,47592,Chr7
3,ENSG0000000004,-6.597946,0.8434,94922,Chr16
4,ENSG0000000005,-0.775996,0.0339,53489,Chr8
5,ENSG0000000006,0.164095,0.8092,17486,Chr9
6,ENSG0000000007,9.507698,0.7899,28534,Chr8
7,ENSG0000000008,-0.229587,0.9606,12830,Chr6
8,ENSG0000000009,15.503495,0.3538,52764,Chr10
9,ENSG0000000010,-2.061291,0.8111,23617,Chr4


In [4]:
output_fh = open("Expression_data.csv", "w")
output_fh.write("# Randomly generate gene expression data\n")
expression_df.to_csv(output_fh, sep="\t", index=False)

## Data input - Option 2 - Download (randomly generated) data 

In [5]:
import urllib.request
source_url = "https://github.com/foerstner-lab/Bits_and_pieces_for_the_carpentries_workshops/blob/main/python/Expression_data.csv?raw=true"
expression_data_file = "Expression_data.csv"

In [6]:
expression_df = pd.read_csv(expression_data_file, sep="\t", comment="#")

## The actual filtering

In [7]:
expression_df

Unnamed: 0,ENSEMBL_Gene_ID,fold_change,adjusted_p_value,gene_length,chromosome
0,ENSG0000000001,-3.093222,0.9598,109109,Chr5
1,ENSG0000000002,5.079444,0.0641,42231,Chr3
2,ENSG0000000003,0.361165,0.0244,47592,Chr7
3,ENSG0000000004,-6.597946,0.8434,94922,Chr16
4,ENSG0000000005,-0.775996,0.0339,53489,Chr8
...,...,...,...,...,...
29939,ENSG0000029940,-1.977256,0.6678,959,Chr7
29940,ENSG0000029941,1.229374,0.1838,123072,Chr4
29941,ENSG0000029942,2.232070,0.7865,49962,Chr3
29942,ENSG0000029943,3.716701,0.1706,39327,Chr16


In [8]:
expression_df.describe()

Unnamed: 0,fold_change,adjusted_p_value,gene_length
count,29944.0,29944.0,29944.0
mean,0.004812,0.50208,40053.986942
std,5.009114,0.289636,30281.730134
min,-18.546675,0.0,0.0
25%,-3.377455,0.248875,15990.5
50%,0.018451,0.5035,33806.5
75%,3.426894,0.7562,57709.5
max,19.80534,0.9999,199697.0


In [9]:
gene_ids = ["ENSG0000000005","ENSG0000000006","ENSG0000000007","ENSG0000000008","ENSG0000000009"]
selection_by_gene_name = expression_df[expression_df["ENSEMBL_Gene_ID"].isin(gene_ids)]
selection_by_gene_name

Unnamed: 0,ENSEMBL_Gene_ID,fold_change,adjusted_p_value,gene_length,chromosome
4,ENSG0000000005,-0.775996,0.0339,53489,Chr8
5,ENSG0000000006,0.164095,0.8092,17486,Chr9
6,ENSG0000000007,9.507698,0.7899,28534,Chr8
7,ENSG0000000008,-0.229587,0.9606,12830,Chr6
8,ENSG0000000009,15.503495,0.3538,52764,Chr10


In [10]:
# Write to file
selection_by_gene_name.to_csv("Expression_data_selection.csv", sep="\t", index=False)

In [11]:
selection_by_adj_p_value = expression_df[expression_df["adjusted_p_values"] < 0.05]

KeyError: 'adjusted_p_values'

In [None]:
selection_by_adj_p_value

In [None]:
expression_df.fold_change.hist()