This notebook aggregates dysfunctional T Cell data for lab 4.

This data was from the paper https://www.nature.com/articles/nature22367.
The data can be accessed at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE89309.

Data was first downloaded, and the following files were selected and unzipped for use:

Healthy T Cells:
- GSM2365799_ATAC_N1_normalizedCounts.txt
- GSM2365802_ATAC_E5_1_normalizedCounts.txt	
- GSM2365805_ATAC_E7_2_normalizedCounts.txt	
- GSM2365807_ATAC_M1_normalizedCounts.txt		

Dysfunctional T Cells (after LX days)
- GSM2365817_ATAC_L14_1_normalizedCounts.txt	
- GSM2365820_ATAC_L21_2_normalizedCounts.txt	
- GSM2365823_ATAC_L28_1_normalizedCounts.txt
- GSM2365826_ATAC_L35_1_normalizedCounts.txt
- GSM2365810_ATAC_L5_1_normalizedCounts.txt
- GSM2365829_ATAC_L60_1_normalizedCounts.txt
- GSM2365814_ATAC_L7_1_normalizedCounts.txt	

Dysfunctional Memory T Cells (After MLX days)
- GSM2365832_ATAC_ML7_2_normalizedCounts.txt
- GSM2365835_ATAC_ML14_1_normalizedCounts.txt
- GSM2365837_ATAC_ML35_1_normalizedCounts.txt



In [2]:
# imports
from datascience import Table
import numpy as np
import os
from os import listdir
from os.path import isfile, join

In [41]:
# Collect the normalized-count files that sit in the working directory.
# os.listdir returns names in arbitrary order, so sort them: the first file
# seeds the table and the remaining files determine the column order, and both
# should be the same on every run.
cwd = os.getcwd()

files = [f for f in listdir(cwd) if isfile(join(cwd, f))]
# match the full data-file suffix so unrelated .txt files are not picked up
txtFiles = sorted(f for f in files if f.endswith('_normalizedCounts.txt'))
txtFiles

['GSM2365799_ATAC_N1_normalizedCounts.txt',
 'GSM2365802_ATAC_E5_1_normalizedCounts.txt',
 'GSM2365805_ATAC_E7_2_normalizedCounts.txt',
 'GSM2365807_ATAC_M1_normalizedCounts.txt',
 'GSM2365810_ATAC_L5_1_normalizedCounts.txt',
 'GSM2365814_ATAC_L7_1_normalizedCounts.txt',
 'GSM2365817_ATAC_L14_1_normalizedCounts.txt',
 'GSM2365820_ATAC_L21_2_normalizedCounts.txt',
 'GSM2365823_ATAC_L28_1_normalizedCounts.txt',
 'GSM2365826_ATAC_L35_1_normalizedCounts.txt',
 'GSM2365829_ATAC_L60_1_normalizedCounts.txt',
 'GSM2365832_ATAC_ML7_2_normalizedCounts.txt',
 'GSM2365835_ATAC_ML14_1_normalizedCounts.txt',
 'GSM2365837_ATAC_ML35_1_normalizedCounts.txt']

In [42]:
# Seed the combined table with the contents of the first data file; the
# remaining files only contribute their sample column later on.
first_path = txtFiles[0]
table = Table.read_table(first_path)
table

chr,start,end,symbol,refseqID,peak_annotation,N1
chr1,9772732,9773228,1700034P13Rik,NR_040462,intergenic,209.153
chr1,9797805,9798616,Sgk3,NM_133220,promoter,538.113
chr1,9859046,9859750,Sgk3,NM_133220,intron,178.694
chr1,10089499,10089955,Cspp1,NM_026493,intron,251.796
chr1,12875425,12875883,Slco5a1,NM_172841,intron,185.801
chr1,13293331,13293833,Ncoa2,NM_008678,intron,305.607
chr1,20933432,20933875,Paqr8,NM_028829,intron,480.24
chr1,23255935,23256632,Mir30a,NR_029533,intergenic,681.271
chr1,23762738,23763139,B3gat2,NM_172124,promoter,102.546
chr1,33739113,33739764,Rab23,NM_008999,exon,248.75


In [43]:
# File names look like "<GSM id>_ATAC_<sample>_normalizedCounts.txt".
# Measure the fixed-width prefix and suffix so the <sample> part can be
# sliced out of a file name as name[start:len(name)-end].
start, end = len('GSM2365802_ATAC_'), len('_normalizedCounts.txt')

In [45]:
# For each remaining file, read it and add its sample column to the table.
#
# Columns are combined purely by row position, so before adding a column
# confirm that the file lists the same peaks (chr/start/end) in the same order
# as the table; otherwise counts would silently end up on the wrong peak.

for file in txtFiles[1:]:
    # sample label is the part of the file name between the prefix and suffix
    columnName = file[start:len(file)-end]

    tmp = Table.read_table(file)

    if tmp.num_rows != table.num_rows:
        raise ValueError(
            '{} has {} rows but the table has {}'.format(
                file, tmp.num_rows, table.num_rows))
    for key in ('chr', 'start', 'end'):
        if not np.array_equal(tmp[key], table[key]):
            raise ValueError(
                "{}: column '{}' does not match the table row-for-row".format(
                    file, key))

    # append the sample's counts as a new column
    table = table.with_column(columnName, tmp[columnName])

table

In [46]:
# display the combined table to check that each sample column was added
table

chr,start,end,symbol,refseqID,peak_annotation,N1,E5_1,E7_2,M1,L5_1,L7_1,L14_1,L21_2,L28_1,L35_1,L60_1,ML7_2,ML14_1,ML35_1
chr1,9772732,9773228,1700034P13Rik,NR_040462,intergenic,209.153,129.291,150.143,255.555,113.379,113.542,125.268,205.532,243.36,167.201,147.345,109.24,133.588,202.068
chr1,9797805,9798616,Sgk3,NM_133220,promoter,538.113,355.311,390.926,542.902,249.435,299.414,291.065,275.05,274.863,214.586,216.862,245.79,174.482,218.579
chr1,9859046,9859750,Sgk3,NM_133220,intron,178.694,702.002,671.84,666.4,231.992,432.299,257.905,384.616,224.458,286.34,163.213,294.473,196.293,366.395
chr1,10089499,10089955,Cspp1,NM_026493,intron,251.796,161.853,170.901,235.991,97.6808,117.747,127.11,161.705,191.38,150.955,148.101,116.364,124.046,224.083
chr1,12875425,12875883,Slco5a1,NM_172841,intron,185.801,131.206,143.916,176.076,125.59,203.534,198.034,281.095,353.621,326.278,272.777,118.739,140.404,283.052
chr1,13293331,13293833,Ncoa2,NM_008678,intron,305.607,203.035,282.989,289.792,172.686,264.089,183.297,315.854,367.009,373.663,334.738,212.543,164.94,247.671
chr1,20933432,20933875,Paqr8,NM_028829,intron,480.24,267.201,208.263,326.475,176.174,237.176,227.509,247.847,215.795,200.37,173.036,172.172,148.583,283.052
chr1,23255935,23256632,Mir30a,NR_029533,intergenic,681.271,398.408,455.965,627.272,378.513,555.934,233.036,205.532,165.39,158.401,281.845,298.035,184.025,158.038
chr1,23762738,23763139,B3gat2,NM_172124,promoter,102.546,168.557,152.219,397.394,231.992,300.255,136.321,144.326,129.162,111.693,151.879,163.86,95.4201,158.038
chr1,33739113,33739764,Rab23,NM_008999,exon,248.75,239.428,199.269,231.1,160.476,204.375,228.43,221.4,176.417,194.955,163.969,176.921,196.293,194.205


In [48]:
# Remove the trailing "_<n>" mouse number from the sample column labels
# (e.g. 'E5_1' -> 'E5', 'L21_2' -> 'L21').
#
# The renames are derived from the table's own labels rather than listed by
# hand, so they cannot drift from the files that were actually loaded, and
# re-running this cell after the labels are already clean is a no-op.

def strip_replicate_suffix(label):
    """Return `label` without a trailing '_<digits>' part.

    Labels that do not end in an underscore followed only by digits
    (e.g. 'N1', 'peak_annotation') are returned unchanged.
    """
    base, sep, suffix = label.rpartition('_')
    if base and sep and suffix.isdigit():
        return base
    return label


renames = {old: strip_replicate_suffix(old) for old in table.labels}

# two replicates of one time point would collapse onto the same label
new_labels = list(renames.values())
if len(set(new_labels)) != len(new_labels):
    raise ValueError(
        'stripping mouse numbers would create duplicate column labels: '
        + ', '.join(sorted(l for l in set(new_labels) if new_labels.count(l) > 1)))

for old, new in renames.items():
    if old != new:
        table = table.relabeled(old, new)

In [50]:
# save table as csv
# Write through saved_filename (instead of repeating the literal) so the file
# that is written is the same one later cells read back via this variable.
saved_filename = 'dysfunctional_ATACseq_timeseries.csv'
table.to_csv(saved_filename)

In [51]:
# Test that the saved CSV loads back, and that it has the same column labels
# and number of rows as the table that was written.
reloaded = Table.read_table(saved_filename)
assert tuple(reloaded.labels) == tuple(table.labels), 'column labels changed on reload'
assert reloaded.num_rows == table.num_rows, 'row count changed on reload'
reloaded