In [39]:
import numpy as np 
import pandas as pd 
import os
import string
import itertools
import matplotlib.pyplot as plt
%matplotlib inline

# Setup data for deep learning
## Create the y target

Metadata on the experiments includes the treament, but not the mechanism of action for the treatment. I extracted a list of all the unique treatments and hand annotated their mechanism of action based on domain knowledge and research. The additional feature of mechanism of action was then merge back into the metadata df. 

In [42]:
#Read in metadata
metadata = pd.read_csv('/Users/emilylorenzen/Downloads/rxrx2/metadata.csv')
print(metadata.head())

# Determined each unique treatment
treatment = pd.DataFrame(metadata.treatment.unique())

# Read treatment to new file to more easily annotate mechanisms - mechanisms were annotated via personal domain knowledge and information from databases such as UniProt
treatment.to_csv('treatment_MoA.csv')

# Merge annotated mechanisms with metadata
treatment_MoA = pd.read_csv('treatment_MoA - treatment_MoA (1).csv')
image_MoA = pd.merge(metadata, treatment_MoA, on = 'treatment')
# print(image_MoA.head())
# print(image_MoA.mechanism.value_counts())

# Save metadata df with mechanism of action to a file 
image_MoA.to_csv('image_MoA.csv')


# image_MoA_1_3 = image_MoA[(image_MoA.experiment == 'HUVEC-1') & (image_MoA.plate == 3)]

# metadata_1 = metadata[metadata.experiment == 'HUVEC-1']
# metadata_1[metadata_1.plate == 3]

            site_id         well_id cell_type experiment  plate  well  site  \
0  HUVEC-1_1_AA03_1  HUVEC-1_1_AA03     HUVEC    HUVEC-1      1  AA03     1   
1  HUVEC-1_1_AA03_2  HUVEC-1_1_AA03     HUVEC    HUVEC-1      1  AA03     2   
2  HUVEC-1_1_AA03_3  HUVEC-1_1_AA03     HUVEC    HUVEC-1      1  AA03     3   
3  HUVEC-1_1_AA03_4  HUVEC-1_1_AA03     HUVEC    HUVEC-1      1  AA03     4   
4  HUVEC-1_1_AA04_1  HUVEC-1_1_AA04     HUVEC    HUVEC-1      1  AA04     1   

        treatment  treatment_conc  
0  Sonic-Hedgehog             0.1  
1  Sonic-Hedgehog             0.1  
2  Sonic-Hedgehog             0.1  
3  Sonic-Hedgehog             0.1  
4           GDF-1             0.1  


In [132]:
image_MoA = pd.read_csv('image_MoA.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


## Get all images for particular experiment and plate, organize them, and get corresponding metadata

PNG files represent a particular well, site, and channel. I need to organize the channels so I can combine them to represent the complete image of a well and site combination when feeding the information to the deep learning algorithm. 


In [115]:
def image_paths(image_MoA, imgs_directory):
    
    # Rewrite 
    metadata = image_MoA
    metadata_plate = metadata[(metadata.experiment == 'HUVEC-1') & (metadata.plate == 9)]
    image_list = os.listdir(imgs_directory)
    return image_list, metadata_plate                              

In [116]:
image_list, metadata_plate = image_paths(image_MoA, '/Users/emilylorenzen/Downloads/rxrx2-2/images/HUVEC-1/Plate9')

In [117]:
def get_well_names(rows, columns, site_num, image_list):
    """ Takes in number of rows, columns, and images per well and returns a list of lists 
    with all channels of a particular image grouped together in an embedded list"""
        
    alphabet_string = string.ascii_uppercase
    alphabet_list = list(alphabet_string)
    alphabet_list.extend(['AA', 'AB', 'AC', 'AD', 'AE', 'AF'])
    alphabet_list = alphabet_list[0:rows]
    
    #48 columns, with 0 padding for digits under 10
    number_list = list(np.arange(1,49,1))
    number_list = [str(number_).zfill(2) for number_ in number_list]
    number_list = [str(number_) for number_ in number_list]
    number_list = number_list[0:columns]
    # Make tuples of all row and column combinations
    all_wells = [each_letter + each_number for each_letter in alphabet_list for each_number in number_list]
    
    # Get a list of all sites imaged per well 
    site_list = ['s' + str(i) for i in range (1, site_num+1)]
    
    # Create a list of lists that to combine all channels for a single image into an embedded list
    six_chan_list = []
    to_match_list = [well+ "_" + site for well in all_wells for site in site_list]
    for to_match in to_match_list:
#     print(to_match)
        result = [x for x in image_list if x.startswith(to_match)]
        six_chan_list.append(result)

    # Wells on the outer rim of the plate don't have data, so remove any empty lists in the six_chan_list
    six_chan_list = [sorted(x) for x in six_chan_list if x != []]  
    return six_chan_list


In [118]:
six_chan_list = get_well_names(32, 48, 4, image_list)

In [119]:
metadata_plate.mechanism.value_counts()

Growth factor                       708
Interleukin                         561
GPCR agonist                        464
Immunoglobulin                      311
Virulence factor                    167
TNF ligand                          155
Wnt agonist                         141
Complement protein                  108
Wnt antagonist                       72
Lectin                               72
B7 ligand                            71
STING ligand                         60
Tyrosine kinase receptor agonist     48
Antimicrobial peptide                24
protein kinase activator             24
Growth factor inhibitor              24
RLR ligand                           24
Immunoadhesion protein               24
Adhesion protein                     24
Wnt activator                        24
Fc receptor                          24
Granin protein                       12
Nod receptor ligand                  12
GPR                                  12
Cytokine receptor agonist            12


In [99]:
six_chan_list[0:5]

[['B02_s1_w1.png',
  'B02_s1_w2.png',
  'B02_s1_w3.png',
  'B02_s1_w4.png',
  'B02_s1_w5.png',
  'B02_s1_w6.png'],
 ['B02_s2_w1.png',
  'B02_s2_w2.png',
  'B02_s2_w3.png',
  'B02_s2_w4.png',
  'B02_s2_w5.png',
  'B02_s2_w6.png'],
 ['B02_s3_w1.png',
  'B02_s3_w2.png',
  'B02_s3_w3.png',
  'B02_s3_w4.png',
  'B02_s3_w5.png',
  'B02_s3_w6.png'],
 ['B02_s4_w1.png',
  'B02_s4_w2.png',
  'B02_s4_w3.png',
  'B02_s4_w4.png',
  'B02_s4_w5.png',
  'B02_s4_w6.png'],
 ['B03_s1_w1.png',
  'B03_s1_w2.png',
  'B03_s1_w3.png',
  'B03_s1_w4.png',
  'B03_s1_w5.png',
  'B03_s1_w6.png']]

Explore and process the cell images

In [100]:
from keras.preprocessing import image
plate3_HV2_dir = '/Users/emilylorenzen/Downloads/rxrx2-2/images/HUVEC-1/Plate9/'

# six_chan_array = np.empty((1024, 1024, 6))

final_array = np.empty((0, 100, 100, 6))
for complete_image in six_chan_list:
    six_chan_array = np.empty((100, 100, 0))
    for chan_ in complete_image:
        path_ = plate3_HV2_dir + chan_ 
        img_ = image.load_img(path_, color_mode = 'grayscale', target_size = (100, 100))
        img_array = image.img_to_array(img_)
#         print(img_array.shape)
        img_.close()


        six_chan_array = np.append(six_chan_array, img_array, axis = 2)
        final_six_chan_array = np.expand_dims(six_chan_array, axis=0)
    final_array = np.append(final_array, final_six_chan_array, axis = 0)
    print(final_array.shape)

(1, 100, 100, 6)
(2, 100, 100, 6)
(3, 100, 100, 6)
(4, 100, 100, 6)
(5, 100, 100, 6)
(6, 100, 100, 6)
(7, 100, 100, 6)
(8, 100, 100, 6)
(9, 100, 100, 6)
(10, 100, 100, 6)
(11, 100, 100, 6)
(12, 100, 100, 6)
(13, 100, 100, 6)
(14, 100, 100, 6)
(15, 100, 100, 6)
(16, 100, 100, 6)
(17, 100, 100, 6)
(18, 100, 100, 6)
(19, 100, 100, 6)
(20, 100, 100, 6)
(21, 100, 100, 6)
(22, 100, 100, 6)
(23, 100, 100, 6)
(24, 100, 100, 6)
(25, 100, 100, 6)
(26, 100, 100, 6)
(27, 100, 100, 6)
(28, 100, 100, 6)
(29, 100, 100, 6)
(30, 100, 100, 6)
(31, 100, 100, 6)
(32, 100, 100, 6)
(33, 100, 100, 6)
(34, 100, 100, 6)
(35, 100, 100, 6)
(36, 100, 100, 6)
(37, 100, 100, 6)
(38, 100, 100, 6)
(39, 100, 100, 6)
(40, 100, 100, 6)
(41, 100, 100, 6)
(42, 100, 100, 6)
(43, 100, 100, 6)
(44, 100, 100, 6)
(45, 100, 100, 6)
(46, 100, 100, 6)
(47, 100, 100, 6)
(48, 100, 100, 6)
(49, 100, 100, 6)
(50, 100, 100, 6)
(51, 100, 100, 6)
(52, 100, 100, 6)
(53, 100, 100, 6)
(54, 100, 100, 6)
(55, 100, 100, 6)
(56, 100, 100, 6)
(

In [13]:
final_array.shape

(5492, 100, 100, 6)

In [101]:
with open('plate_1_9.npy', 'wb') as f:
    np.save(f, final_array)