In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# --- Input files -----------------------------------------------------------
train_file = 'ALL_AML_grow.train.orig.txt'
test_file = 'ALL_AML_grow.test.orig.txt'
sample_file = 'table_ALL_AML_samples.txt'

# --- Sample id / class label tables extracted from sample_file --------------
train_idclass_file = 'ALL_AML_idclass.train.txt'
test_idclass_file = 'ALL_AML_idclass.test.txt'

# --- Intermediate files, one pair per cleaning step --------------------------
# Rows containing `removable_word` removed (written by remove_rows).
# The train name previously lacked its leading "A" ('LL_AML_...'), unlike
# every other path in this cell.
train_tmp_file = 'ALL_AML_grow.train.noaffy.tmp'
test_tmp_file = 'ALL_AML_grow.test.noaffy.tmp'
# Values outside the allowed range blanked out (written by normalize_data).
norm_train_file = 'ALL_AML_grow.train.norm.tmp'
norm_test_file = 'ALL_AML_grow.test.norm.tmp'
# Transposed tables (written by tansposing_matrix).
gcol_train_file = 'ALL_AML_gcol.train.tmp'
gcol_test_file = 'ALL_AML_gcol.test.tmp'

# Substring of 'Gene Description' that marks a row for removal.
removable_word = 'endogenous control'

## Microarray Data Cleaning Steps

Removing the control rows (Gene Description containing "endogenous control") and replacing commas with semicolons:

In [24]:
def remove_rows(file_path,tmp_path,removable):
    """Drop every row whose 'Gene Description' contains ``removable``.

    The tab-separated table at ``file_path`` is read, rows whose
    'Gene Description' contains ``removable`` as a literal, case-sensitive
    substring are discarded, the number of discarded rows is printed, and the
    remaining rows are written to ``tmp_path`` as comma-separated values
    without the index.

    Parameters
    ----------
    file_path : str
        Path of the tab-separated input table; it must have a
        'Gene Description' column.
    tmp_path : str
        Path the filtered table is written to.
    removable : str
        Substring that marks a row for removal.

    Returns
    -------
    None
    """
    data = pd.read_csv(file_path, sep='\t', index_col=False)
    # regex=False keeps the plain-substring semantics of str.find.
    # na=False treats a missing description as "no match", so such rows are
    # kept instead of raising AttributeError on a float NaN.
    is_removable = data['Gene Description'].str.contains(removable, regex=False, na=False)
    print('Number of occurrences of {} in {} : {}'.format(removable, file_path, int(is_removable.sum())))
    data[~is_removable].to_csv(tmp_path, index=False)
# Filter the training table first, then the test table, each into its own temporary file.
for raw_path, filtered_path in ((train_file, train_tmp_file), (test_file, test_tmp_file)):
    remove_rows(raw_path, filtered_path, removable_word)

Number of occurrences of endogenous control in ALL_AML_grow.train.orig.txt : 58
Number of occurrences of endogenous control in ALL_AML_grow.test.orig.txt : 58


Removing unnecessary columns and renaming the "Gene Accession Number" column to "ID":

In [25]:
def remove_unnecessary_columns(file_path) :
    """Load a CSV and keep only its identifier and every-other-value columns.

    Keeps the column at position 1 plus every second column from position 2
    onwards (2, 4, 6, ...); the first kept column is renamed to 'ID'.
    Presumably the skipped columns are the description and the per-sample
    "call" columns -- TODO confirm against the input layout.

    Parameters
    ----------
    file_path : str
        Path of the comma-separated table to load.

    Returns
    -------
    pandas.DataFrame
        The reduced table, with 'ID' as its first column.
    """
    table = pd.read_csv(file_path)
    names = table.columns
    # Position 1 is the identifier; positions 2, 4, 6, ... are kept as values.
    wanted = [names[1]] + list(names[2::2])
    trimmed = table[wanted]
    trimmed.columns = ['ID'] + wanted[1:]
    return trimmed
# Reduce both filtered tables (training first, then test) to the ID column plus the kept value columns.
train_data, test_data = map(remove_unnecessary_columns, (train_tmp_file, test_tmp_file))

Limiting attribute values to the range between 20 and 16000:

In [26]:
def normalize_data(data,save_path,lower=20,upper=16000) :
    """Blank out numeric cells outside ``(lower, upper)`` and save the table.

    Every string cell is kept unchanged. Every other cell is kept only when it
    lies strictly between ``lower`` and ``upper``; otherwise it is replaced by
    a missing value. The result is written to ``save_path`` as CSV without the
    index. The frame passed in is not modified.

    NOTE(review): out-of-range values (including values exactly equal to a
    bound) become missing rather than being clipped to the bound -- confirm
    this is the intended meaning of "limiting" the values.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to filter.
    save_path : str
        Path the filtered table is written to.
    lower, upper : number, optional
        Exclusive bounds for the values that are kept (default 20 and 16000).

    Returns
    -------
    None
    """
    def _keep_in_range(x):
        # Identifier/text cells pass through untouched.
        if isinstance(x, str):
            return x
        return x if lower < x < upper else None

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; look the method up on the class and fall back to
    # applymap on older pandas versions.
    frame_type = type(data)
    elementwise = getattr(frame_type, 'map', None) or frame_type.applymap
    data = elementwise(data, _keep_in_range)
    data.to_csv(save_path,index=False)
# Write the range-filtered version of each table (training first, then test).
for table, out_path in ((train_data, norm_train_file), (test_data, norm_test_file)):
    normalize_data(table, out_path)

Transposing the matrix:

In [27]:
def tansposing_matrix(data_path,save_path) :
    """Read a CSV table, transpose it, report its shape and save it.

    The table at ``data_path`` is loaded with default settings, so all of its
    columns (including the first one) become rows of the transposed table and
    the new column labels are the original integer row positions. The
    transposed shape is printed and the table is written to ``save_path`` as
    CSV with its index.

    NOTE(review): because the first column is not used as the index, its
    values end up as an ordinary data row of the output rather than as column
    headers -- confirm downstream code expects this layout.

    Parameters
    ----------
    data_path : str
        Path of the comma-separated table to load.
    save_path : str
        Path the transposed table is written to.

    Returns
    -------
    None
    """
    table = pd.read_csv(data_path)
    flipped = table.T
    print('Shape of {} : {}'.format(data_path, flipped.shape))
    flipped.to_csv(save_path)
# Transpose the range-filtered training table first, then the test table.
for norm_path, gcol_path in ((norm_train_file, gcol_train_file), (norm_test_file, gcol_test_file)):
    tansposing_matrix(norm_path, gcol_path)

Shape of ALL_AML_grow.train.norm.tmp : (39, 7071)
Shape of ALL_AML_grow.test.norm.tmp : (35, 7071)


From the file table_ALL_AML_samples.txt, extract the tables
ALL_AML_idclass.train.txt and ALL_AML_idclass.test.txt, each containing the sample id and the sample label, space separated.

In [28]:
# Reload the transposed training table from disk to inspect the layout it was written with.
pd.read_csv(gcol_train_file)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,7061,7062,7063,7064,7065,7066,7067,7068,7069,7070
0,ID,hum_alu_at,A28102_at,AB000114_at,AB000115_at,AB000220_at,AB000409_at,AB000449_at,AB000450_at,AB000460_at,...,U48730_at,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at
1,1,15091.0,151.0,72.0,281.0,36.0,,57.0,186.0,1647.0,...,185.0,511.0,,389.0,,793.0,329.0,36.0,191.0,
2,2,11038.0,263.0,21.0,250.0,43.0,,169.0,219.0,2043.0,...,169.0,837.0,,442.0,,782.0,295.0,,76.0,
3,3,,88.0,,358.0,42.0,142.0,359.0,237.0,1997.0,...,315.0,1199.0,33.0,168.0,52.0,1138.0,777.0,41.0,228.0,
4,4,15763.0,484.0,61.0,118.0,39.0,,274.0,245.0,2128.0,...,240.0,835.0,218.0,174.0,,627.0,170.0,,126.0,
5,5,,118.0,,197.0,39.0,237.0,311.0,186.0,1608.0,...,156.0,649.0,57.0,504.0,,250.0,314.0,,56.0,
6,6,,270.0,85.0,71.0,32.0,,232.0,30.0,1354.0,...,115.0,1221.0,,172.0,,645.0,341.0,26.0,193.0,
7,7,,458.0,,168.0,,87.0,131.0,199.0,1784.0,...,30.0,819.0,,151.0,,1140.0,482.0,,369.0,
8,8,,872.0,25.0,296.0,59.0,,70.0,556.0,2911.0,...,289.0,629.0,,302.0,23.0,1799.0,446.0,59.0,781.0,
9,9,15272.0,62.0,,198.0,27.0,148.0,313.0,259.0,2117.0,...,356.0,980.0,,177.0,,758.0,385.0,115.0,244.0,
