In [9]:
#!/usr/bin/python3
# First, lets set up a dictionary mapping each cell to it's cluster
# assuming the default value of k.  This will greatly speed up the
# next step when we're creating the averages.

clusterMap = {}
cellList = None
selectedK = None
for line in open("/Users/student/Documents/fundamentals/programming/data/E-MTAB-7365.clusters.csv"):
    # If this is the first line, we need to get
    # the list of cells.

    line = line.rstrip() # Strip off the training newline
    if len(line) <= 1:
        continue  # Blank line?

    if line.startswith('#'):
        continue # support comment lines

    if cellList is None:
        # This is the first line
        cellList = line.split(',') [2:] # Skip over the first two fields, 
        continue

    (sel, k, rest) = line.split(',',2)
    if sel == 'True':
        selectedK = int(k)
        clusters = rest.split(',')
        for index in range(0,len(cellList)): ## iterate over cells, append them to the right cluster.
            cluster = int(clusters[index]) # We want to make sure this is an int
            if not cluster in clusterMap: ## useful, key lookup, fast
                clusterMap[cluster] = []
            clusterMap[cluster].append(cellList[index])


In [11]:
# Now, we have two maps, one indexed by the cluster number (we'll use that to
# calculate the averages), and one indexed by the cell name (we'll use that to
# figure out which cells have assigned values)

geneMap = {}
cellColumnMap = None
for line in open("/Users/student/Documents/fundamentals/programming/data/E-MTAB-7365.csv"):

    # Note: when you see repetitive code like this, we might want
    # to be thinking about a function
    line = line.rstrip() # Strip off the trailing newline
    if len(line) <= 1:
        continue  # Blank line?

    if line.startswith('#'):
        continue # support comment lines

    if cellColumnMap is None:
        cellColumnMap = {}
        # This is the first line
        cellColumns = line.split(',') [1:] # Skip over the first field
        for column in range(0, len(cellColumns)):
            cellColumnMap[cellColumns[column]] = column
        continue

    values = line.split(",")  # Get all of the data values

    geneMap[values[0]] = [] # Create the map entry
    average = 0.0
    for cluster in range(1,selectedK+1):
        cellList = clusterMap[cluster]
        for cell in cellList:

            # NOTE: this is not efficient
            if not cell in cellColumnMap: ## not in vs not .. in ..
                continue # Not all cells are assigned to clusters

            column = cellColumnMap[cell]
            # Handle missing values (very common in scRNASeq)
            if len(values[column+1]) == 0:
                continue

            average += float(values[column+1])

        average = average / len(cellList)
        geneMap[values[0]].append(average)



In [13]:
# OK, all done.  Our answers are in geneMap
header = "Gene"
for cluster in range(1,selectedK):
    header += "\tCluster "+str(cluster)
print (header)
for gene in geneMap.keys():
    averages = geneMap[gene]
    line = gene
    for average in averages:
        line += '\t'+str(average)
    print (line)

Gene	Cluster 1	Cluster 2	Cluster 3	Cluster 4
"ENSMUSG00000000001"	103.51885492971431	50.763067957990486	91.08202141631963	356.5120091244894	54.57341202817781
"ENSMUSG00000000028"	382.48543147828576	237.35525061394284	197.90897338455773	391.02470902262104	404.63956685875866
"ENSMUSG00000000037"	0.0	11.213994333333334	0.44855977333333336	4.304924398431373	2.5732065498879555
"ENSMUSG00000000056"	1.5881816708571428	26.84962111569524	23.76577754182781	2.6400810306957534	10.899896002192554
"ENSMUSG00000000058"	82.31163145257143	13.510415124419048	12.390687636576763	0.8073757897986331	0.6541501992713309
"ENSMUSG00000000078"	385.96201426914286	378.6979016693047	284.49486819077214	477.5515150523984	94.60637334231417
"ENSMUSG00000000085"	2.872413458285714	10.51513179327706	32.53303408653108	17.651480479795943	5.256864387128282
"ENSMUSG00000000088"	44.304915488000006	115.65323067793334	104.29171612271733	43.349245621336316	28.684526840095447
"ENSMUSG00000000093"	1.5809565714285714	0.1365165190476

"ENSMUSG00000024037"	0.019721522571428573	3.368201184752381	21.744853518190098	7.55948703048177	4.580817145034412
"ENSMUSG00000024038"	84.10491576542857	141.12377143718095	165.22254102148725	184.84788743891104	68.37454240277937
"ENSMUSG00000024042"	9.102261572000003	2.6443761600666664	37.174464930402664	10.487671819435452	3.8094797842453896
"ENSMUSG00000024043"	24.64942056914286	0.8439707716380952	7.027087830865525	38.814599331227384	3.347461737944813
"ENSMUSG00000024044"	21.643797995999996	9.6457611512	47.51213318795969	16.690973969291743	2.4962192435208386
"ENSMUSG00000024045"	68.60824259885715	58.94537176562858	30.930322104225148	112.20101766083677	23.08893311148834
"ENSMUSG00000024048"	405.23026454057145	719.0461496880191	929.4791409235208	664.3937818190307	245.6528898227879
"ENSMUSG00000024050"	7.905164860857142	0.468046236695238	6.61633874186781	0.5586058259922241	0.36824148757087316
"ENSMUSG00000024052"	9.279553723714285	74.66540092979047	19.45467759599162	80.12087522035243	14.3

"ENSMUSG00000032657"	5.018402687428572	12.038856119580952	13.248701388783239	0.8304003546343082	0.059314311045307724
"ENSMUSG00000032661"	1.8245790514285714	0.21073300471428572	0.09163241418857142	0.026680934952268906	0.054117281068019205
"ENSMUSG00000032666"	15.611529248000002	31.637980285599998	58.948270475423996	6.647286083848471	18.244906455989174
"ENSMUSG00000032667"	100.93721342342857	220.07784887744762	122.03717587269792	66.60594789898222	31.366003730641584
"ENSMUSG00000032673"	6.150292080857142	30.874744874028572	35.60991557936114	57.59438123149183	6.839825623677989
"ENSMUSG00000032679"	5.9195818085714285	26.47209131861905	29.917427496744764	1.8244466351026334	2.7199815503644738
"ENSMUSG00000032688"	3.0934128742857148	1.7343592771428573	1.3752363010857145	21.392804600063865	1.7016776285759903
"ENSMUSG00000032698"	1.6889848685714286	9.149384306619048	67.94454086146476	7.357277885968516	2.920227788997751
"ENSMUSG00000032702"	1.4124206434285713	10.121671214780951	15.49269984859124

"ENSMUSG00000045671"	47.51988258571429	31.573592382190476	1.5396780576876188	37.394788386922805	51.543369084780196
"ENSMUSG00000045672"	26.827987958571427	15.688719225285714	14.641428239011429	47.50815437641244	80.43593817402947
"ENSMUSG00000045678"	2.4895194791428565	2.7318588219714286	1.495488850878857	5.240154077698756	8.596550791264196
"ENSMUSG00000045679"	212.24949168428563	148.93566656614288	212.77667434464573	478.17498617086153	81.29834262934726
"ENSMUSG00000045680"	340.29866768314275	287.4476953917714	401.5342595236709	230.24407274845123	201.74010683703224
"ENSMUSG00000045690"	24.781007155428576	3.8238047071809533	0.6001130706872381	1.2371924665110139	2.5742979761793583
"ENSMUSG00000045691"	0.021166477142857142	0.024165789238095238	10.120099431569523	0.5952999665629131	0.26759146189735095
"ENSMUSG00000045708"	0.15713730114285715	0.06804741003809524	0.0027218964015238096	0.00016011155303081233	7.461005378791515e-05
"ENSMUSG00000045725"	1.8327699782857143	0.16165067594285715	0.56

"ENSMUSG00000074978"	0.8075575574689429	1.000571757582298	0.5714279419032919	0.960902327170782	0.8418894733693415
"ENSMUSG00000074981"	0.16299304857142857	0.33365038161904764	0.013346015264761906	0.0007850597214565827	0.031090861408675473
"ENSMUSG00000074994"	18.394569937142858	26.10944376457143	9.542170388182857	22.47729988754017	16.207607849110012
"ENSMUSG00000074995"	0.1598702	0.19306004000000002	0.0077224016000000005	0.01657171068235294	0.05250481290588236
"ENSMUSG00000075000"	2.3143822885714287	20.34945740961905	6.651026445584762	0.3912368497402801	3.935546203552877
"ENSMUSG00000075014"	80.31394540000001	59.46681900033333	30.186222711613333	55.01722652421254	157.24066009458656
"ENSMUSG00000075015"	5.353305371714285	2.6445579307238094	1.1627744484289524	2.7802401940252324	14.62153428171609
"ENSMUSG00000075028"	2.0507481917142854	5.797947340390476	1.0138606216156192	5.231631620095037	3.4442536371496457
"ENSMUSG00000075033"	0.009723302285714285	0.0003241100761904762	12.19386296440304

"ENSMUSG00000116118"	0.06739548548285486	0.03325937818276183	0.08972467112731047	0.029615460654547676	0.0021153900467534054
"ENSMUSG00000116127"	0.05638811828571428	0.023782637276190476	0.022479501491047618	0.001322323617120448	9.445168693717486e-05
"ENSMUSG00000116136"	3.1439254857142855	0.10479751619047618	0.004191900647619047	0.04201173650868347	0.0030008383220488195
"ENSMUSG00000116168"	0.0	0.04036203333333333	0.3475377929333333	0.0365608513490196	0.002611489382072829
"ENSMUSG00000116174"	18.37992495914286	0.6386056999714287	0.02554422799885715	0.03313548811757983	0.6121608920083986
"ENSMUSG00000116184"	1.089771830857143	4.624134487695239	0.4738941035078096	2.002944551971048	1.4189976444265036
"ENSMUSG00000116234"	0.32086451314285713	0.01069548377142857	0.00042781935085714284	2.5165844168067226e-05	1.7975602977190876e-06
"ENSMUSG00000116242"	0.2955108022857143	0.18975865007619047	0.007590346003047619	0.0004464909413557423	0.019250988638668264
"ENSMUSG00000116244"	0.3960361697142857

In [15]:
## Function verision

def main():
    clusterMap, cellList, selectedK = readClusters("E-MTAB-7365.clusters.csv")
    genes = readGenes("E-MTAB-7365.csv", clusterMap, cellList, selectedK)
    printAverages(genes, selectedK)


def readClusters(filename):
    # First, lets set up a dictionary mapping each cell to it's cluster
    # assuming the default value of k.  This will greatly speed up the
    # next step when we're creating the averages.
    clusterMap = {}
    cellList = None
    selectedK = None
    with open(filename) as f:
        for line in f:
            # If this is the first line, we need to get
            # the list of cells.

            line = line.rstrip() # Strip off the training newline
            if len(line) <= 1 or line.startswith('#'):
                continue  # Skip blank and comment lines

            if cellList is None:
                # This is the first line
                cellList = getCellList(line)
                continue

            (sel, k, rest) = line.split(',',2)
            if sel == 'True':
                selectedK = int(k)
                clusters = rest.split(',')
                for index in range(0,len(cellList)):
                    cluster = int(clusters[index]) # We want to make sure this is an int
                    if not cluster in clusterMap:
                        clusterMap[cluster] = []
                    clusterMap[cluster].append(cellList[index])
    return clusterMap, cellList, selectedK


def getCellList(line):
    return line.split(',') [2:] # Skip over the first two fields


def readGenes(filename, clusterMap, cellList, selectedK):
    # Now, we have two maps, one indexed by the cluster number (we'll use that to
    # calculate the averages), and one indexed by the cell name (we'll use that to
    # figure out which cells have assigned values)
    geneMap = {}
    cellColumnMap = None
    for line in open(filename):

        # Note: when you see repetitive code like this, we might want
        # to be thinking about a function
        line = line.rstrip() # Strip off the trailing newline
        if len(line) <= 1 or line.startswith('#'):
            continue  # Skip blank and comment lines

        if cellColumnMap is None:
            # This is the first line
            cellColumnMap = getColumnMap(line) ## map of gene names to column number
            continue

        values = line.split(",")  # Get all of the data values

        geneMap[values[0]] = [] # Create the map entry
        average = 0.0
        for cluster in range(1,selectedK+1):
            cellList = clusterMap[cluster]
            for cell in cellList:

                # NOTE: this is not efficient
                if not cell in cellColumnMap:
                    continue # Not all cells are assigned to clusters

                column = cellColumnMap[cell]
                # Handle missing values (very common in scRNASeq)
                if len(values[column+1]) == 0:
                    continue

                average += float(values[column+1])
                # Do we need to maintain a count of non-missing values?

            average = average / len(cellList)
            geneMap[values[0]].append(average)
    return geneMap


def getColumnMap(line):
    columnMap = {}
    columns = line.split(',') [1:] # Skip over the first field
    for column, columnName in enumerate(columns):
        columnMap[columnName] = column
    return columnMap


def printAverages(geneMap, selectedK):
    # OK, all done.  Our answers are in geneMap
    header = "Gene"
    for cluster in range(1,selectedK+1):
        header += "\tCluster "+str(cluster)
    print(header)
    for gene in geneMap.keys():
        averages = geneMap[gene]
        line = gene
        for average in averages:
            line += '\t'+str(average)
        print(line)


if __name__ == "__main__":
    main()

FileNotFoundError: [Errno 2] No such file or directory: 'E-MTAB-7365.clusters.csv'