## Create Individual Count Files For Galaxy DESeq2 Input

In [19]:
# Confirm file location: list the unnormalized counts file that the
# extraction and splitting cells below read from. `ls` echoes the path when
# the file exists and reports an error otherwise.
!ls ../GLDS-202_Unnormalized_Counts.txt

../GLDS-202_Unnormalized_Counts.txt


In [20]:
# Create the output directory that will hold one count file per sample.
# -p makes the cell safe to re-run: no error if the directory already exists.
!mkdir -p ../IndividualCounts

In [21]:
# Extract Gene ID Column: field 1 of the counts file (cut splits on tabs by
# default), written to GeneID_Column.txt in the current working directory.
!cut -f 1 ../GLDS-202_Unnormalized_Counts.txt > GeneID_Column.txt

# Manually added "GeneID" Column Header
# NOTE(review): no command in this notebook performs that manual edit, so it is
# not reproduced on a fresh run -- confirm whether it is still needed, given that
# the rounding step below writes its own "GeneID" header into each output file.

In [22]:
# Confirm Extraction: print the first lines of GeneID_Column.txt (head shows
# 10 lines by default). The first line is the header row's entry for this
# column; the remaining lines are gene IDs.
!head GeneID_Column.txt


ENSMUSG00000000001.4
ENSMUSG00000000003.15
ENSMUSG00000000028.15
ENSMUSG00000000031.16
ENSMUSG00000000037.16
ENSMUSG00000000049.11
ENSMUSG00000000056.7
ENSMUSG00000000058.6
ENSMUSG00000000078.7


In [23]:
# Split columns in counts file
# Output will be the gene id column and a single count column
# Perform for each count column

filename = "../GLDS-202_Unnormalized_Counts.txt"
inputfile = open(filename, "r")

# Get column IDs to name new files
num_samples = 0
samples = []
for line in inputfile: 
    
    fields = line.strip().split()
    num_samples = len(fields)
    samples = fields
    break

i = 2
while i < num_samples+2: 

    !cut -f $i $filename > tempfile
    
    outputfile = "../IndividualCounts/" + fields[i-2]
    
    !paste GeneID_Column.txt tempfile > $outputfile
    
    i += 1

In [24]:
# DESeq2 needs integers. There are decimals in the read counts.
# Round to the nearest integer.
# For every per-sample file a "<file>.rounded" copy is written next to it,
# with the header line replaced by "GeneID<TAB><sample file name>".
# Note: under Python 3, round() resolves exact .5 ties to the nearest even integer.

import glob
import os

# Ignore ".rounded" files left over from an earlier run so that re-running
# this cell never produces "<sample>.rounded.rounded" files.
all_files = [
    path
    for path in glob.glob("../IndividualCounts/CFG*")
    if not path.endswith(".rounded")
]

for filex in all_files:

    sample_name = os.path.basename(filex)

    # The context managers close both files even if a line fails to parse.
    with open(filex, "r") as inputfile, open(filex + ".rounded", "w") as outputfile:

        # Skip Header Line and write the replacement header instead
        # (nothing is written for an empty input file).
        if next(inputfile, None) is not None:
            outputfile.write("GeneID" + "\t" + sample_name + "\n")

        for line in inputfile:

            fields = line.strip().split()

            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not fields:
                continue

            gene_id = fields[0]
            read_count = int(round(float(fields[1])))

            outputfile.write(gene_id + "\t" + str(read_count) + "\n")
    

In [25]:
# Create a directory for the rounded count files that will be used as DESeq2
# input (-p: no error if it already exists, so the cell can be re-run).
!mkdir -p ../IndividualCounts/DESeq2_input

In [26]:
# Move files into that directory: every "*.rounded" file in
# ../IndividualCounts is relocated to ../IndividualCounts/DESeq2_input/.
!mv ../IndividualCounts/*.rounded ../IndividualCounts/DESeq2_input/

In [27]:
# Rename files with grouping info

inputfile = open("../GLDS-202_203/GLDS-202_SampleTable.csv", "r")

header = True
for line in inputfile: 
    
    # Skip Header Line
    if header == True: 
        header = False
        continue
    
    fields = line.strip().split(",")
    
    sample_name = fields[0]
    radiation = fields[1]
    unloaded = fields[2]
    tissue = fields[3]
    timepoint = fields[4]
    
    if tissue == "Retina":
        
        condition = ""
        if radiation == "0.4gray" and unloaded == "Unloaded":
            condition = "Combination."
        elif radiation == "0.4gray":
            condition = "Radiation."
        elif unloaded == "Unloaded": 
            condition = "Unloaded."
        else:
            condition = "Control."
        
        duration=""
        if timepoint == "7day":
            duration = "7days."
        elif timepoint == "1month":
            duration = "1month."
        elif timepoint == "4month":
            duration = "4month."
        
        filepath = "../IndividualCounts/DESeq2_input/"
        old_filename = sample_name + ".rounded"
        new_filename = condition + duration + sample_name
        
        !mv ../IndividualCounts/DESeq2_input/$old_filename ../IndividualCounts/DESeq2_input/$new_filename
        
inputfile.close()    
    