In [1]:
import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt

#Load the expression matrix (indexed by its second column, 'Gene Accession Number')
#and the key table that carries the 'cancer' label of each patient sample
file = 'data_set_ALL_AML_train.csv'
data = pd.read_csv(file, index_col=1)
key = pd.read_csv('actual.csv', index_col=0)

#Drop every 'call' column ('call', 'call.1', 'call.2', ...). The list is derived from the
#columns actually present instead of assuming exactly 38 of them, so the cell does not
#raise a KeyError on a file with a different number of samples.
drop_list = [col for col in data.columns if str(col).startswith('call')]
data = data.drop(columns=drop_list)

#Drop the 'Gene Description' column as it is not useful here
cleaned_data = data.drop(columns=['Gene Description'])
#The remaining column labels are the patient sample ids; make them integers
cleaned_data.columns = cleaned_data.columns.map(int)
print(cleaned_data.head())

#Transpose rows and columns so that each column is a different gene
transposed_data = cleaned_data.transpose()

#Merge the key dataframe with the trimmed dataframe to assign the cancer type to each patient sample
#(inner join: only samples present in both tables are kept)
exp_data = pd.concat([key, transposed_data], axis=1, join='inner')

#Group by cancer type to aggregate and calculate useful statistical information with describe()
df_agg_trans = exp_data.groupby('cancer').describe()
print(df_agg_trans.head())

                        1    2    3    4    5    6    7    8    9    10 ...   \
Gene Accession Number                                                   ...    
AFFX-BioB-5_at        -214 -139  -76 -135 -106 -138  -72 -413    5  -88 ...    
AFFX-BioB-M_at        -153  -73  -49 -114 -125  -85 -144 -260 -127 -105 ...    
AFFX-BioB-3_at         -58   -1 -307  265  -76  215  238    7  106   42 ...    
AFFX-BioC-5_at          88  283  309   12  168   71   55   -2  268  219 ...    
AFFX-BioC-3_at        -295 -264 -376 -419 -230 -272 -399 -541 -210 -178 ...    

                        35   36   37   38   28   29   30   31   32   33  
Gene Accession Number                                                    
AFFX-BioB-5_at           7 -213  -25  -72   -4   15 -318  -32 -124 -135  
AFFX-BioB-M_at        -100 -252  -20 -139 -116 -114 -192  -49  -79 -186  
AFFX-BioB-3_at         -57  136  124   -1 -125    2  -95   49  -37  -70  
AFFX-BioC-5_at         132  318  325  392  241  193  312  230  330  3

In [2]:
#Transpose dataframe so that genes form the rows
#(the index becomes a two-level (gene, statistic) MultiIndex, one column per cancer type)
df_agg = df_agg_trans.transpose()

def _quartile_table(label):
    """Return the rows of df_agg for one describe() statistic, indexed by gene only.

    label is the statistic name as produced by describe() ('25%' or '75%').
    xs() selects on the second index level and drops it in one step, so nothing
    depends on how an unnamed level would be auto-named by reset_index().
    The two cancer-type columns are renamed 'ALL_<label>' and 'AML_<label>'.
    """
    table = df_agg.xs(label, level=1).copy()
    table.columns = ['ALL_' + label, 'AML_' + label]
    return table

#Extract 25% percentile
per_25 = _quartile_table('25%')

#Extract 75% percentile
per_75 = _quartile_table('75%')

#Side-by-side quartiles plus the interquartile range of each cancer type, one row per gene
IQR_df = pd.concat([per_25, per_75], axis=1)
IQR_df['ALL_IQR'] = IQR_df['ALL_75%'] - IQR_df['ALL_25%']
IQR_df['AML_IQR'] = IQR_df['AML_75%'] - IQR_df['AML_25%']
#print(IQR_df.head(10))
#print((IQR_df.loc['A28102_at', 'ALL_25%'] - 1.5*IQR_df.loc['A28102_at', 'ALL_IQR']))
#print((IQR_df.loc['A28102_at', 'ALL_75%'] + 1.5*IQR_df.loc['A28102_at', 'ALL_IQR']))
#print((IQR_df.loc['A28102_at', 'AML_25%'] - 1.5*IQR_df.loc['A28102_at', 'AML_IQR']))
#print((IQR_df.loc['A28102_at', 'AML_75%'] + 1.5*IQR_df.loc['A28102_at', 'AML_IQR']))

#Check every gene for outliers against the IQR fences of the sample's cancer type
def outliers(df, iqr_df=None, whisker=3, all_samples=range(1, 28)):
    """Return the outlying expression values of ``df``; all other cells are NaN.

    A value is an outlier when it lies strictly below ``Q1 - whisker * IQR`` or
    strictly above ``Q3 + whisker * IQR`` of its gene, using the ALL statistics
    for samples listed in ``all_samples`` and the AML statistics for every
    other sample.

    Parameters
    ----------
    df : DataFrame
        Genes as rows, sample ids as columns.
    iqr_df : DataFrame, optional
        Per-gene table with the columns 'ALL_25%', 'ALL_75%', 'ALL_IQR',
        'AML_25%', 'AML_75%' and 'AML_IQR'. Defaults to the notebook-level
        ``IQR_df``.
    whisker : number, optional
        IQR multiplier used for the fences (default 3).
    all_samples : iterable, optional
        Sample ids treated as ALL (default: ids 1 to 27).

    Returns
    -------
    DataFrame
        Indexed like ``df``. Only samples with at least one outlier get a
        column; columns are ordered by the first gene (row) in which the sample
        is flagged, ties broken by the column order of ``df``.

    Raises
    ------
    KeyError
        If a gene of ``df`` is missing from ``iqr_df``.
    """
    if iqr_df is None:
        iqr_df = IQR_df  # quartile table built at notebook level

    stats = iqr_df.loc[df.index]
    is_all = np.asarray(df.columns.isin(list(all_samples)), dtype=bool)

    def fences(prefix):
        # Lower/upper fence of every gene as column vectors, ready to broadcast over samples
        q1 = stats[prefix + '_25%'].to_numpy(dtype=float)
        q3 = stats[prefix + '_75%'].to_numpy(dtype=float)
        reach = whisker * stats[prefix + '_IQR'].to_numpy(dtype=float)
        return (q1 - reach)[:, None], (q3 + reach)[:, None]

    low_all, high_all = fences('ALL')
    low_aml, high_aml = fences('AML')
    # Pick, for each column, the fences of the cancer type that sample belongs to
    low = np.where(is_all, low_all, low_aml)
    high = np.where(is_all, high_all, high_aml)

    values = df.to_numpy(dtype=float)
    mask = (values < low) | (values > high)

    # Keep only the samples that were flagged at least once, ordered by first hit
    hit_cols = np.flatnonzero(mask.any(axis=0))
    if hit_cols.size:
        first_row = mask[:, hit_cols].argmax(axis=0)
        hit_cols = hit_cols[np.lexsort((hit_cols, first_row))]

    flagged = pd.DataFrame(np.where(mask, values, np.nan),
                           index=df.index, columns=df.columns)
    return flagged.iloc[:, hit_cols]

#Flag outliers for every gene of the cleaned expression matrix (outliers() reads the IQR_df table built above)
outliers_all = outliers(cleaned_data)

#test = cleaned_data.loc[['A28102_at']]
#print(test)
#test.loc['A28102_at', 1] = -314
#print(test)
#outliers(test)

In [3]:
#Gene accession numbers selected for a closer look
top_genes_list = [
    'AF009426_at', 'D49950_at', 'L08246_at', 'L13278_at', 'L47738_at',
    'M11147_at', 'M16038_at', 'M21551_rna1_at', 'M23197_at', 'M27891_at',
    'M55150_at', 'M57710_at', 'M62762_at', 'M63138_at', 'M69043_at',
    'M80254_at', 'M81933_at', 'M84526_at', 'M91432_at', 'M92287_at',
    'M96326_rna1_at', 'S50223_at', 'U05259_rna1_at', 'U12471_cds1_at', 'U32944_at',
    'U46751_at', 'U50136_rna1_at', 'U82759_at', 'X04085_rna1_at', 'X15949_at',
    'X17042_at', 'X52142_at', 'X59417_at', 'X63469_at', 'X74262_at',
    'X95735_at', 'Y12670_at', 'D38073_at', 'Z15115_at', 'D26156_s_at',
    'U22376_cds2_s_at', 'M28130_rna1_s_at', 'Y00787_s_at', 'M31211_s_at', 'M81695_s_at',
    'M83652_s_at', 'U09087_s_at', 'X85116_rna1_s_at', 'X58431_rna2_s_at', 'J03801_f_at',
    'M19045_f_at', 'X14008_rna1_f_at', 'M31523_at', 'M28170_at', 'U29175_at',
]

#Keep only the rows of the selected genes (same order as the list)
top_genes = cleaned_data.loc[top_genes_list]
print(top_genes.head())

#Flag the outliers of the selected genes and order the sample columns by patient id
top_outliers = outliers(top_genes).sort_index(axis=1)

#info() writes its report itself and returns None; wrapping it in print() only adds a stray "None"
top_outliers.info()

                        1     2     3    4     5     6    7     8     9   \
Gene Accession Number                                                      
AF009426_at             36    58    63   38   120    92   16   169    43   
D49950_at               75   129    44  218   110    33  115    32     9   
L08246_at              543  2972   485  740  2453   375  961   404   783   
L13278_at              193    31   198   91   194    96  178   112   198   
L47738_at              571  2893  2723  731   649  1858  280  1716  2388   

                         10  ...    35    36    37    38    28    29    30  \
Gene Accession Number        ...                                             
AF009426_at             -18  ...   -67   -43   -45   -42   -44   -21   -33   
D49950_at                54  ...   295   283   311    70   297   190   326   
L08246_at              1288  ...   853  6159  5503  2083  6718  1766  4915   
L13278_at                 9  ...    -7    22   -32   -16    16   -17   -16   