In [6]:
import gzip
import os
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

In [7]:
sample_data = pd.read_csv('../data/OSD-466-samples.csv')
sample_name_list = np.unique(sample_data['Sample Name'])
sample_data.shape

(62, 29)

In [12]:
sample_data.columns

Index(['Source Name', 'Sample Name', 'Characteristics: Organism',
       'Characteristics: Host Organism', 'Characteristics: Host Strain',
       'Comment: Animal Source', 'Factor Value: Genotype',
       'Characteristics: Age at Launch', 'Characteristics: Age at Euthanasia',
       'Characteristics: sex', 'Characteristics: diet',
       'Comment: Feeding Schedule', 'Factor Value: Spaceflight',
       'Characteristics: Material Type', 'Protocol REF',
       'Parameter Value: habitat', 'Parameter Value: Enrichment Material',
       'Parameter Value: duration', 'Parameter Value: light cycle',
       'Parameter Value: Euthanasia Method',
       'Parameter Value: Carcass Preservation Method', 'Protocol REF.1',
       'Parameter Value: Sample Preservation Method',
       'Parameter Value: Sample Storage Temperature', 'Comment: RFID',
       'Comment: ALSDA Subject ID', 'Comment: Euthanasia Date',
       'Comment: BSP Dissection Date', 'Comment: Source Description'],
      dtype='object')

In [13]:
directory = '../data/gz'
notation_list = []
dna_list = []
mass_list = []
filename_list = []
label_list = []
LIMIT = 3000
label_name = 'Factor Value: Spaceflight'

if LIMIT:
    LIMIT = LIMIT - LIMIT % 4 - 1

for filename in tqdm(os.listdir(directory)):
    if filename.endswith('.gz'):
        for name in sample_name_list:
            if name in filename:
                label = sample_data[sample_data['Sample Name'] == name][label_name].values[0]
                break
        with gzip.open(os.path.join(directory, filename), 'rb') as file:
            count = 0
            c = 0
            for line in file:
                line = line.decode('utf-8')
                line = line.replace('\n', '')
                if count % 4 == 0:
                    notation_list.append(line[1:])
                if count % 4 == 1:
                    dna_list.append(line)
                if count % 4 == 3:
                    mass_list.append(line[1:])
                    filename_list.append(filename)
                    label_list.append(label)
                    c += 1
                count += 1
                if LIMIT and c > LIMIT:
                    break

print(len(notation_list), len(dna_list), len(mass_list), len(filename_list))

  0%|          | 0/12 [00:00<?, ?it/s]

36000 36000 36000 36000


In [14]:
data = pd.DataFrame({
    'notation': notation_list,
    'DNA': dna_list,
    'mass': mass_list,
    'filename': filename_list,
    'label': label_list
})
print(data.shape)
data.to_csv('../data/dna_data.csv', index=False)

(36000, 5)


In [15]:
data.head(5)

Unnamed: 0,notation,DNA,mass,filename,label
0,A00654:48:HN52TDRXY:1:2101:5990:1000 2:N:0:AGT...,ATATTTATGGCTGGACTTGAACTTACTAAGTAGACCATGCTGGCCT...,FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF...,GLDS-466_metagenomics_RR10_FCS_VIV_WT_V1_R2_HR...,Vivarium Control
1,A00654:48:HN52TDRXY:1:2101:6840:1000 2:N:0:AGT...,CCTACGCTCAGCGAGGCGACTTTGAGAGATGCGCCGAAGAATCTTT...,"FFFF:F,F:FFFFFFFFFFF,FFF::FFFFFFFFF,,F,,FFF,F:...",GLDS-466_metagenomics_RR10_FCS_VIV_WT_V1_R2_HR...,Vivarium Control
2,A00654:48:HN52TDRXY:1:2101:9498:1000 2:N:0:AGT...,GCCTTGACCCATGCCTGATAAGGGAGGGCCCGGTCGACGCCCAGGA...,":FFFFFFFFFFFFFFFF:FFFFFFFFFF,FF:FFFFF:FFFFFFFF...",GLDS-466_metagenomics_RR10_FCS_VIV_WT_V1_R2_HR...,Vivarium Control
3,A00654:48:HN52TDRXY:1:2101:15067:1000 2:N:0:AG...,GGACAGGGCCGCAGCATATTCTCATTAAACGGCTGGCCGTCATGGT...,FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF...,GLDS-466_metagenomics_RR10_FCS_VIV_WT_V1_R2_HR...,Vivarium Control
4,A00654:48:HN52TDRXY:1:2101:1298:1016 2:N:0:AGT...,TCCTCCATCCATGCTTTGAATGTCTGGAACCCTGCCTCTGTGTTAC...,FFFFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFF:F...,GLDS-466_metagenomics_RR10_FCS_VIV_WT_V1_R2_HR...,Vivarium Control


In [17]:
# Label proportion
from collections import Counter
my_list = list(data['label'])
element_count = Counter(my_list)
total_elements = len(my_list)
element_percentages = {key: (count / total_elements) * 100 for key, count in element_count.items()}
for key, percentage in element_percentages.items():
    print(f"{key}: {percentage:.2f}%")

Vivarium Control: 33.33%
Ground Control: 33.33%
Space Flight: 33.33%


In [None]:
# # Check
# count = 5
# with gzip.open('../data/GLDS-524_wgbs_GSM2684058_R1_raw.fastq.gz', 'rb') as file:
#     for line in file:
#         line = line.decode('utf-8')
#         print(line)
#         count -= 1
#         if count < 1:
#             break