In [1]:
import pandas as pd
import numpy as np
import json

# Path to the training expression matrix (one row per gene).
file = 'data_set_ALL_AML_train.csv'
# Use the second CSV column (position 1) as the row index.
df = pd.read_csv(
    file,
    index_col=1,
)

In [2]:
# Summarise the raw frame. `DataFrame.info()` prints its report itself and
# returns None, so it is called directly instead of being wrapped in print().
df.info()
print(df.shape)

# Drop the 'call', 'call.1', 'call.2', ... columns.
# Select them by name pattern instead of a hard-coded range so the cell works
# regardless of how many samples the input file contains.
call_columns = df.filter(regex=r'^call(\.\d+)?$').columns
data = df.drop(columns=call_columns)
print(data.shape)

<class 'pandas.core.frame.DataFrame'>
Index: 7129 entries, AFFX-BioB-5_at to Z78285_f_at
Data columns (total 77 columns):
Gene Description    7129 non-null object
1                   7129 non-null int64
call                7129 non-null object
2                   7129 non-null int64
call.1              7129 non-null object
3                   7129 non-null int64
call.2              7129 non-null object
4                   7129 non-null int64
call.3              7129 non-null object
5                   7129 non-null int64
call.4              7129 non-null object
6                   7129 non-null int64
call.5              7129 non-null object
7                   7129 non-null int64
call.6              7129 non-null object
8                   7129 non-null int64
call.7              7129 non-null object
9                   7129 non-null int64
call.8              7129 non-null object
10                  7129 non-null int64
call.9              7129 non-null object
11                  7129 no

In [3]:
# Load the patient -> cancer-type lookup table, indexed by its first column.
key = pd.read_csv(
    'actual.csv',
    index_col=0,
)
# Preview the first five rows and confirm the index type.
print(key.head(5))
print(key.index.__class__)

        cancer
patient       
1          ALL
2          ALL
3          ALL
4          ALL
5          ALL
<class 'pandas.core.indexes.numeric.Int64Index'>


I want to merge the gene expression dataframe ('data') with the key to create a dataframe that includes the type of cancer for each patient. I can then group by the 'cancer' column to perform aggregations, such as computing the mean and standard deviation of the expression level for each gene in the dataset.

In order to merge the 'data' and 'key' dataframes, however, I first need to reshape the data.

In [4]:
# Transpose so that each row is a sample and each column is a gene.
transposed_data = data.transpose()

# After the transpose, 'Gene Description' is a row of text labels; drop it
# because it is not useful here.
cleaned_data = transposed_data.drop(['Gene Description'])

# The remaining row labels are sample numbers stored as strings; convert them
# to int so they can be aligned with the integer index of `key`.
cleaned_data.index = cleaned_data.index.map(int)

# `DataFrame.info()` prints its report itself and returns None, so call it
# directly; wrapping it in print() adds a stray "None" to the output.
cleaned_data.info()

# The transposed columns are object dtype; convert them to numeric dtypes.
# errors='coerce' turns any unparseable value into NaN.
cleaned_data = cleaned_data.apply(pd.to_numeric, errors='coerce')

# Print the first column of data as a check.
print(cleaned_data.iloc[:, 0:1])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38 entries, 1 to 33
Columns: 7129 entries, AFFX-BioB-5_at to Z78285_f_at
dtypes: object(7129)
memory usage: 2.1+ MB
None
Gene Accession Number  AFFX-BioB-5_at
1                                -214
2                                -139
3                                 -76
4                                -135
5                                -106
6                                -138
7                                 -72
8                                -413
9                                   5
10                                -88
11                               -165
12                                -67
13                                -92
14                               -113
15                               -107
16                               -117
17                               -476
18                                -81
19                                -44
20                                 17
21                             

In [5]:
# Attach the cancer type to every patient sample by joining the key and the
# expression data column-wise on their shared index (inner join).
exp_data = pd.concat(
    [key, cleaned_data],
    axis='columns',
    join='inner',
)

# Sanity checks: expected shape, and the first two columns compared with the
# earlier printout to confirm the data is intact.
print(exp_data.shape)
print(exp_data.iloc[:, :2])

# Per-cancer-type summary statistics for every gene via describe().
stats = exp_data.groupby(by='cancer').describe()
print(stats)

(38, 7130)
   cancer  AFFX-BioB-5_at
1     ALL            -214
2     ALL            -139
3     ALL             -76
4     ALL            -135
5     ALL            -106
6     ALL            -138
7     ALL             -72
8     ALL            -413
9     ALL               5
10    ALL             -88
11    ALL            -165
12    ALL             -67
13    ALL             -92
14    ALL            -113
15    ALL            -107
16    ALL            -117
17    ALL            -476
18    ALL             -81
19    ALL             -44
20    ALL              17
21    ALL            -144
22    ALL            -247
23    ALL             -74
24    ALL            -120
25    ALL             -81
26    ALL            -112
27    ALL            -273
28    AML              -4
29    AML              15
30    AML            -318
31    AML             -32
32    AML            -124
33    AML            -135
34    AML             -20
35    AML               7
36    AML            -213
37    AML             -25
3

In [11]:
# Compute the ALL/AML ratio of mean expression for every gene.
#   gene_ratios: gene label -> ratio
#   ratios:      the same ratios as a flat list, in iteration order
gene_ratios = {}
ratios = []

# The columns of `stats` are (gene, statistic) pairs and each column holds one
# value per cancer type. The loop variables are deliberately NOT called `key`,
# so the `key` DataFrame loaded from 'actual.csv' is not overwritten.
# `.items()` is used because `DataFrame.iteritems()` was removed in pandas 2.0.
for (gene, stat_name), by_cancer in stats.items():
    if stat_name != 'mean':
        continue
    all_mean = by_cancer['ALL']
    aml_mean = by_cancer['AML']
    if aml_mean != 0:
        ratio = all_mean / aml_mean
        gene_ratios[gene] = ratio
        ratios.append(ratio)
    else:
        # The ratio is undefined when the AML mean is zero; report the gene instead.
        print('Gene:' + gene + ' ALL mean: ' + str(all_mean) + ' AML mean: ' + str(aml_mean))

print(ratios)

#for gene, ratio in gene_ratios.items():


Gene:D78333_at ALL mean: 7.962962962962963 AML mean: 0.0
Gene:HG3748-HT4018_at ALL mean: 194.74074074074073 AML mean: 0.0
[0.8179784616367878, 6.615809839690437, 0.9773134958320144, 0.8527551942186088, 0.8013569124680235, 0.13727858293075684, 0.6985223565534446, 2.3206043588715066, 1.1291486291486292, 0.9867291880328326, 0.7933968253968254, 1.1016753026861, 0.7271489016402944, 0.7471814352156233, 2.188415238250343, -0.9324695065435807, 1.0368050722956776, 0.8417409515261545, 1.1736612773764477, 0.7977166747065445, 1.0738095238095238, 1.08090902999148, 0.9403293944385727, -0.386944021647218, 0.6731078904991948, 0.9659204778721459, 0.6087020002920134, 0.8117719553281518, 1.8388092393468736, -1.175438596491228, 1.11238198983297, 0.8894210835731303, -0.48148148148148145, 1.0334497371494746, 0.6151199565142194, 1.0189277264407837, 1.98978469275499, 1.0414888246362843, 2.1451459992415622, 1.0190758191568812, 1.0646728500220835, 0.9804489428297033, 0.8443596268023749, 0.9036309924132478, 1.22

In [7]:
# Transpose the summary statistics so that genes form the rows.
# NOTE: this rebinds `df`, which previously held the raw CSV contents.
df = stats.transpose()
print(df.head(8))
print(df.shape)
# `DataFrame.info()` prints its report itself and returns None, so call it
# directly; wrapping it in print() adds a stray "None" to the output.
df.info()

cancer                  ALL         AML
A28102_at count   27.000000   11.000000
          mean   258.555556  316.090909
          std    203.127723  126.132830
          min     20.000000   50.000000
          25%    112.500000  254.500000
          50%    216.000000  324.000000
          75%    385.500000  385.000000
          max    872.000000  486.000000
(57032, 2)
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 57032 entries, (A28102_at, count) to (hum_alu_at, max)
Data columns (total 2 columns):
ALL    57032 non-null float64
AML    57032 non-null float64
dtypes: float64(2)
memory usage: 3.9+ MB
None


In [8]:
# Print the column labels of the transposed statistics frame.
# Iterating `df.columns` yields the same labels as iterating the frame's
# (label, column) pairs, without using `DataFrame.iteritems()` (removed in
# pandas 2.0) and without rebinding `key`, which holds the patient/cancer
# lookup table.
for column_label in df.columns:
    print(column_label)

ALL
AML


In [9]:
# Build the flattened statistics table from `stats` with non-mutating calls, so
# the cell can be re-run safely. An in-place reset_index(level=1) only works
# the first time, while the index still has two levels.
df = (
    stats.transpose()
    # Move the statistic name (index level 1) out of the MultiIndex into a column.
    .reset_index(level=1)
    .rename(columns={'level_1': 'stats'})
    # Name the row index and clear the column-axis name for clarity.
    .rename_axis('Genes')
    .rename_axis(None, axis=1)
)
print(df.head(16))
# `DataFrame.info()` prints its report itself and returns None, so call it
# directly rather than wrapping it in print().
df.info()
print(df.index)
print(df.columns)

             stats         ALL         AML
Genes                                     
A28102_at    count   27.000000   11.000000
A28102_at     mean  258.555556  316.090909
A28102_at      std  203.127723  126.132830
A28102_at      min   20.000000   50.000000
A28102_at      25%  112.500000  254.500000
A28102_at      50%  216.000000  324.000000
A28102_at      75%  385.500000  385.000000
A28102_at      max  872.000000  486.000000
AB000114_at  count   27.000000   11.000000
AB000114_at   mean   40.296296    6.090909
AB000114_at    std   57.974673   31.007917
AB000114_at    min -111.000000  -35.000000
AB000114_at    25%   17.000000  -20.000000
AB000114_at    50%   41.000000    1.000000
AB000114_at    75%   70.000000   36.000000
AB000114_at    max  174.000000   56.000000
<class 'pandas.core.frame.DataFrame'>
Index: 57032 entries, A28102_at to hum_alu_at
Data columns (total 3 columns):
stats    57032 non-null object
ALL      57032 non-null float64
AML      57032 non-null float64
dtypes: float64

In [10]:
#normalized = pd.Series(data = (stats.loc['ALL'] / stats.loc['AML']), name='ALL/AML ratio')
#stats = stats.append(normalized)
#print(stats)

I need a dataframe where each row is a gene and the columns are ALL mean, ALL st. dev., AML mean, AML st. dev., and ALL/AML ratio. How can I do this?