In [62]:
import numpy as np
import pandas as pd

## Data input - Option 1 - Generate random data

In [63]:
# Generate a data frame of random numbers keyed by artificial ENSEMBL gene IDs.
# Seed NumPy's global random generator so that every run produces the same
# table. (Assigning `np.seed = 1` would only create an unused attribute on the
# numpy module and seed nothing.)
np.random.seed(1)
number_of_rows = 30000
expression_df = pd.DataFrame({
    # Zero-padded running IDs: ENSG0000000001, ENSG0000000002, ...
    "ENSEMBL_Gene_ID": [f"ENSG{x:010}" for x in range(1, number_of_rows + 1)],
    # Standard-normal draws scaled by 5
    "fold_changes": np.random.randn(number_of_rows) * 5,
    # Integers 0..9999 divided by 10000, i.e. values in [0, 1) with 4 decimals
    "adjusted_p_values": np.random.randint(0, 10000, number_of_rows) / 10000,
    # Non-negative integers; the int cast already drops the fractional part,
    # so no additional rounding is required
    "gene_length": np.abs((np.random.randn(number_of_rows) * 50000).astype(int)),
    # One of Chr1 ... Chr23 per row
    "chromosome": [f"Chr{chr_number}" for chr_number in np.random.choice(range(1, 24), size=number_of_rows)]})

In [64]:
# Preview the first ten rows of the generated table
expression_df.head(n=10)

Unnamed: 0,ENSEMBL_Gene_ID,fold_changes,adjusted_p_values,gene_length,chromosome
0,ENSG0000000001,-3.132966,0.0665,12682,Chr15
1,ENSG0000000002,2.924621,0.929,6862,Chr1
2,ENSG0000000003,3.212987,0.268,1499,Chr17
3,ENSG0000000004,8.438955,0.2617,41620,Chr7
4,ENSG0000000005,-4.502669,0.3798,93979,Chr16
5,ENSG0000000006,-0.236593,0.1638,4670,Chr14
6,ENSG0000000007,-6.255791,0.823,94159,Chr22
7,ENSG0000000008,0.343153,0.1177,41263,Chr12
8,ENSG0000000009,-8.103449,0.6204,81,Chr21
9,ENSG0000000010,2.576576,0.9971,52731,Chr17


In [65]:
# Write the table to disk behind a leading "#" comment line. The context
# manager closes (and thereby flushes) the file when the block ends, so the
# complete content is on disk before the file is read again.
with open("Expression_data.csv", "w") as output_fh:
    output_fh.write("# Randomly generate gene expression data\n")
    expression_df.to_csv(output_fh, sep="\t", index=False)

## Data input - Option 2 - Download (randomly generated) data 

In [66]:
import urllib.request
source_url = "https://github.com/foerstner-lab/Bits_and_pieces_for_the_carpentries_workshops/blob/master/python/Expression_data.csv?raw=true"
expression_data_file = "Expression_data.csv"
# Fetch the file and store it under the local name. Without this call the URL
# is never used and no download takes place. An existing file with the same
# name is overwritten. The trailing ";" keeps the returned
# (filename, headers) tuple out of the cell output.
urllib.request.urlretrieve(source_url, expression_data_file);

In [67]:
# Load the tab-separated table; text following a "#" is treated as a comment
# and not parsed.
expression_df = pd.read_csv(
    expression_data_file,
    sep="\t",
    comment="#",
)

## The actual filtering

In [68]:
# Show a bounded preview of the loaded table instead of the whole frame
expression_df.head(10)

Unnamed: 0,ENSEMBL_Gene_ID,fold_changes,adjusted_p_values,gene_length,chromosome
0,ENSG0000000001,-3.132966,0.0665,12682.0,Chr15
1,ENSG0000000002,2.924621,0.929,6862.0,Chr1
2,ENSG0000000003,3.212987,0.268,1499.0,Chr17
3,ENSG0000000004,8.438955,0.2617,41620.0,Chr7
4,ENSG0000000005,-4.502669,0.3798,93979.0,Chr16
5,ENSG0000000006,-0.236593,0.1638,4670.0,Chr14
6,ENSG0000000007,-6.255791,0.823,94159.0,Chr22
7,ENSG0000000008,0.343153,0.1177,41263.0,Chr12
8,ENSG0000000009,-8.103449,0.6204,81.0,Chr21
9,ENSG0000000010,2.576576,0.9971,52731.0,Chr17


In [69]:
# Gene IDs whose rows should be kept
gene_ids = [
    "ENSG0000000005",
    "ENSG0000000006",
    "ENSG0000000007",
    "ENSG0000000008",
    "ENSG0000000009",
]
# Boolean mask: True for every row whose gene ID is in the list above
is_selected = expression_df["ENSEMBL_Gene_ID"].isin(gene_ids)
selection = expression_df.loc[is_selected]
selection

Unnamed: 0,ENSEMBL_Gene_ID,fold_changes,adjusted_p_values,gene_length,chromosome
4,ENSG0000000005,-4.502669,0.3798,93979.0,Chr16
5,ENSG0000000006,-0.236593,0.1638,4670.0,Chr14
6,ENSG0000000007,-6.255791,0.823,94159.0,Chr22
7,ENSG0000000008,0.343153,0.1177,41263.0,Chr12
8,ENSG0000000009,-8.103449,0.6204,81.0,Chr21


In [73]:
# Save the selected rows as a tab-separated file, leaving out the index column
selection.to_csv(
    "Expression_data_selection.csv",
    sep="\t",
    index=False,
)