In [9]:
import torch
import numpy as np
import pandas as pd
import os
import shutil
from PIL import Image
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder

In [89]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
i = 550
train_df[i:i+10]

Unnamed: 0,id_code,experiment,plate,well,sirna
550,HEPG2-01_2_O21,HEPG2-01,2,O21,sirna_1097
551,HEPG2-01_2_O22,HEPG2-01,2,O22,sirna_109
552,HEPG2-01_2_O23,HEPG2-01,2,O23,sirna_59
553,HEPG2-01_3_B03,HEPG2-01,3,B03,sirna_848
554,HEPG2-01_3_B04,HEPG2-01,3,B04,sirna_337
555,HEPG2-01_3_B05,HEPG2-01,3,B05,sirna_467
556,HEPG2-01_3_B06,HEPG2-01,3,B06,sirna_432
557,HEPG2-01_3_B07,HEPG2-01,3,B07,sirna_1079
558,HEPG2-01_3_B08,HEPG2-01,3,B08,sirna_518
559,HEPG2-01_3_B09,HEPG2-01,3,B09,sirna_805


In [74]:
# Given id_code
given_id_code = 'HEPG2-01_3_B02'

# Find the corresponding row
selected_row = train_df.loc[train_df['id_code'] == given_id_code]
print(selected_row['sirna'].values)

[]


In [34]:
#this function moves all of the images inside train or test to within the main folder, 
# but renames the image to be the same format so that we can look it up in the csv
def move_and_rename_images(root_folder, destination_folder):
    for foldername, subfolders, filenames in os.walk(root_folder):
        for filename in filenames:
            original_path = os.path.join(foldername, filename)
            if filename.startswith('.'):
                pass

            try:
                # Extract information from the original path
                _, _, cell_line, plate = foldername.split(os.path.sep)

                # Extract well information from the filename
                well_info, extension = os.path.splitext(filename)
                well, channel, rest = well_info.split('_')

                # Remove 'Plate' from the plate information
                plate_number = plate.replace('Plate', '')

                # Construct the new filename
                new_filename = f"{cell_line}_{plate_number}_{well}_{rest}{extension}"

                # Construct the new destination path
                new_destination_path = os.path.join(destination_folder, new_filename)

                # Move and rename the file
                shutil.move(original_path, new_destination_path)
            except Exception as e:
                print(f'{filename} not moved because of {e}')

# Example usage
# root_folder = 'data/train' #or test
# destination_folder = 'data/train' #or test
# move_and_rename_images(root_folder, destination_folder)


In [57]:
class ImageDataset(Dataset):
    def __init__(self, train=True, transform=None):
        postfix = 'train' if train else 'test'
        self.folder_path = f'data/{postfix}'
        self.df = pd.read_csv(f'data/train_with_target_id.csv') if train else pd.read_csv(f'data/{postfix}.csv')
        self.train = train
        if self.train:
            self.y = self.df['target_id'].values
        self.transform = transform
        self.file_list = self._get_file_list()

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        img_name = self.file_list[idx]
        img_path = os.path.join(self.folder_path, img_name)
        image = Image.open(img_path)
        if self.transform:
            image = self.transform(image)

        if self.train:
            print(f'Idx: {idx}')
            print(f'self.y[idx]: {self.y[idx]}')
            label = torch.tensor(self.y[idx])
            print(label.shape)
            return image, label
        
        return image

    def _get_file_list(self):
        try:
            files = [file for file in os.listdir(self.folder_path) if file.endswith('.png')]
            return files
        except FileNotFoundError:
            print(f"The folder '{self.folder_path}' does not exist.")
            return []

    def _extract_label(self, filename):
        # Implement logic to extract label from filename or path
        # For example, if filenames are in the format "class_label_image.png"
        file_name = filename.split('_')
        label = file_name[0] + '_' + file_name[1] + '_' + file_name[2]
        # selected_row = self.df.loc[self.df['experiment'] == label]
        # return selected_row['sirna'].values[0]
        return label
    
# Example usage:
# Assuming you have a 'train' folder with PNG files in it
custom_dataset = ImageDataset()

# Accessing dataset elements
for i in range(len(custom_dataset)):
    sample = custom_dataset[i]
    print(f"Image: {sample[0]}, Label: {sample[1]}")


Image: <PIL.PngImagePlugin.PngImageFile image mode=L size=512x512 at 0x7FA9DA70EB50>, Label: U2OS-01_1_B02
Image: <PIL.PngImagePlugin.PngImageFile image mode=L size=512x512 at 0x7FA9DA70ED90>, Label: U2OS-01_4_B02
Image: <PIL.PngImagePlugin.PngImageFile image mode=L size=512x512 at 0x7FA9DA70E550>, Label: HUVEC-01_3_B02
Image: <PIL.PngImagePlugin.PngImageFile image mode=L size=512x512 at 0x7FA9DA70EB50>, Label: HUVEC-01_3_B02
Image: <PIL.PngImagePlugin.PngImageFile image mode=L size=512x512 at 0x7FA9DA70ED90>, Label: U2OS-01_1_B02
Image: <PIL.PngImagePlugin.PngImageFile image mode=L size=512x512 at 0x7FA9DA70E550>, Label: U2OS-01_4_B02
Image: <PIL.PngImagePlugin.PngImageFile image mode=L size=512x512 at 0x7FA9DA70EB50>, Label: HUVEC-01_3_B02
Image: <PIL.PngImagePlugin.PngImageFile image mode=L size=512x512 at 0x7FA9DA70ED90>, Label: U2OS-01_4_B02
Image: <PIL.PngImagePlugin.PngImageFile image mode=L size=512x512 at 0x7FA9DA70E550>, Label: U2OS-01_1_B02
Image: <PIL.PngImagePlugin.PngImag

In [8]:
controls = pd.read_csv('data/train_controls.csv')
train = pd.read_csv('data/train.csv')
train_with_target_id = pd.concat([train, controls], axis=0).reset_index(drop=True)

In [10]:
#label encode the sirna column
label_encoder = LabelEncoder()
train_with_target_id['target_id'] = label_encoder.fit_transform(train_with_target_id['sirna'])

In [12]:
train_with_target_id.to_csv('data/train_with_target_id.csv', index=False)