In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Raw inputs: the expression tables are read as tab-separated text.
train_file = 'ALL_AML_grow.train.orig.txt'
test_file = 'ALL_AML_grow.test.orig.txt'
# NOTE(review): sample_file is not referenced by any cell shown in this notebook.
sample_file = 'table_ALL_AML_samples.txt'
# ID -> Class lookup tables that are merged onto the transposed data.
train_idclass_file = 'ALL_AML_idclass.train.txt'
test_idclass_file = 'ALL_AML_idclass.test.txt'
# Intermediate files, one pair per cleaning step.
# (Fixed: the train name was missing its leading 'A', unlike every sibling path.)
train_tmp_file = 'ALL_AML_grow.train.noaffy.tmp'
test_tmp_file = 'ALL_AML_grow.test.noaffy.tmp'
norm_train_file = 'ALL_AML_grow.train.norm.tmp'
norm_test_file = 'ALL_AML_grow.test.norm.tmp'
gcol_train_file = 'ALL_AML_gcol.train.tmp'
gcol_test_file = 'ALL_AML_gcol.test.tmp'
# Final outputs: transposed data joined with the class labels.
gcol_class_test_file = 'ALL_AML_gcol_class.test.csv'
gcol_class_train_file = 'ALL_AML_gcol_class.train.csv'
# Rows whose Gene Description contains this substring are dropped.
removable_word = 'endogenous control'

## Microarray Data Cleaning Steps

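Remove control rows (matched on Gene Description) and save the result as CSV. No comma-to-semicolon replacement is needed, because pandas quotes any field that contains a comma when writing CSV: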

In [3]:
def remove_rows(file_path,tmp_path,removable):
    """Drop rows whose 'Gene Description' contains ``removable`` and save the rest as CSV.

    Parameters
    ----------
    file_path : str
        Tab-separated input file; must have a 'Gene Description' column.
    tmp_path : str
        Destination of the filtered table (comma-separated, no index column).
    removable : str
        Literal substring (not a regular expression) marking rows to drop.

    Prints the number of dropped rows. Rows with a missing description are
    kept: they cannot contain the substring.
    """
    data = pd.read_csv(file_path, sep='\t',index_col=False)
    # regex=False -> plain substring search, same as str.find;
    # na=False    -> a missing description counts as "no match" instead of crashing.
    is_removable = data['Gene Description'].str.contains(removable, regex=False, na=False)
    print('Number of occurrences of {} in {} : {}'.format(removable,file_path,int(is_removable.sum())))
    data[~is_removable].to_csv(tmp_path,index=False)
# Filter the control rows out of both raw tables; results go to the *.noaffy.tmp paths.
remove_rows(file_path=train_file, tmp_path=train_tmp_file, removable=removable_word)
remove_rows(file_path=test_file, tmp_path=test_tmp_file, removable=removable_word)

Number of occurrences of endogenous control in ALL_AML_grow.train.orig.txt : 58
Number of occurrences of endogenous control in ALL_AML_grow.test.orig.txt : 58


Remove unnecessary columns and rename the "Gene Accession Number" column to "ID":

In [4]:
def remove_unnecessary_columns(file_path) :
    """Load a CSV and keep column 1 plus every second column from column 2 onward.

    Column 0 and the odd-positioned columns after it are discarded, and the
    first kept column is renamed to 'ID'. Returns the reduced DataFrame.
    """
    table = pd.read_csv(file_path)
    # Positions to keep: 1, then 2, 4, 6, ...
    kept_positions = [1] + list(range(2, table.shape[1], 2))
    reduced = table.iloc[:, kept_positions]
    reduced.columns = ['ID'] + list(reduced.columns[1:])
    return reduced
# Reduce both filtered tables to the ID column plus the kept value columns.
train_data = remove_unnecessary_columns(file_path=train_tmp_file)
test_data = remove_unnecessary_columns(file_path=test_tmp_file)

Limit attribute values to the range 20 to 16000 (values below 20 become 20, values above 16000 become 16000):

In [5]:
def normalize_data(data,save_path,lower=20,upper=16000) :
    """Clamp every numeric cell of ``data`` to ``[lower, upper]`` and write the result as CSV.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to clamp; it is not modified.
    save_path : str
        Destination CSV path (written without the index column).
    lower, upper : number, optional
        Inclusive bounds; the defaults (20 and 16000) match the original
        hard-coded limits.

    String cells are passed through unchanged. Missing values are also left
    as missing; previously they were silently turned into ``upper`` because
    every comparison against NaN is False.
    """
    def normalize_row(x) :
        # Despite the name, this receives one cell at a time.
        if isinstance(x, str) or pd.isna(x) :
            return x
        return min(max(x, lower), upper)
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
    # fall back to applymap on older pandas where DataFrame.map does not exist.
    elementwise = data.map if hasattr(pd.DataFrame, 'map') else data.applymap
    elementwise(normalize_row).to_csv(save_path,index=False)
# Clamp both tables and write them to the *.norm.tmp paths.
normalize_data(data=train_data, save_path=norm_train_file)
normalize_data(data=test_data, save_path=norm_test_file)

Transpose the matrix so that each row is a sample and each column is a gene:

In [6]:
def tansposing_matrix(data_path,save_path) :
    """Read a CSV, transpose it, report the transposed shape and write it to ``save_path``.

    The output keeps the index column, so the input's column names become the
    first field of each line, preceded by a first line of positional labels.
    """
    source = pd.read_csv(data_path)
    flipped = source.T
    rows_cols = flipped.shape
    print('Shape of {} : {}'.format(data_path,rows_cols))
    flipped.to_csv(save_path)
# Transpose both clamped tables into the gene-column (*.gcol) files.
tansposing_matrix(data_path=norm_train_file, save_path=gcol_train_file)
tansposing_matrix(data_path=norm_test_file, save_path=gcol_test_file)

Shape of ALL_AML_grow.train.norm.tmp : (39, 7071)
Shape of ALL_AML_grow.test.norm.tmp : (35, 7071)


Merge the ID/class tables with the transposed (gene-column) tables:

In [7]:
# Join each transposed table with its ID/class table and save the result.
for gcol_path, idclass_path, merged_path in (
    (gcol_test_file, test_idclass_file, gcol_class_test_file),
    (gcol_train_file, train_idclass_file, gcol_class_train_file),
):
    # header=1 takes the second line of the file as the header row,
    # skipping the first line of positional labels.
    gcol_table = pd.read_csv(gcol_path, header=1)
    class_table = pd.read_csv(idclass_path)
    # Default inner join on "ID": rows without a match on both sides are dropped.
    gcol_table.merge(class_table, on="ID").to_csv(merged_path, index=False)

In [11]:
# Preview the first five rows of the merged test table.
pd.read_csv(gcol_class_test_file).head()

Unnamed: 0,ID,hum_alu_at,A28102_at,AB000114_at,AB000115_at,AB000220_at,AB000409_at,AB000449_at,AB000450_at,AB000460_at,...,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at,Class
0,39,16000,241,20,251,98,20,149,275,2573,...,1023,67,214,20,1074,475,48,168,20,ALL
1,40,16000,430,20,131,68,181,70,406,1963,...,529,20,352,20,67,263,20,20,20,ALL
2,42,16000,96,31,138,40,20,283,113,1082,...,399,20,558,24,893,297,20,1971,20,ALL
3,47,16000,65,20,1147,35,20,285,172,1548,...,277,20,81,20,722,170,20,510,20,ALL
4,48,11195,260,28,128,46,20,833,390,2222,...,643,51,450,20,612,370,29,333,20,ALL


In [10]:
# Preview the first five rows of the merged train table.
pd.read_csv(gcol_class_train_file).head()

Unnamed: 0,ID,hum_alu_at,A28102_at,AB000114_at,AB000115_at,AB000220_at,AB000409_at,AB000449_at,AB000450_at,AB000460_at,...,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at,Class
0,1,15091,151,72,281,36,20,57,186,1647,...,511,20,389,20,793,329,36,191,20,ALL
1,2,11038,263,21,250,43,20,169,219,2043,...,837,20,442,20,782,295,20,76,20,ALL
2,3,16000,88,20,358,42,142,359,237,1997,...,1199,33,168,52,1138,777,41,228,20,ALL
3,4,15763,484,61,118,39,20,274,245,2128,...,835,218,174,20,627,170,20,126,20,ALL
4,5,16000,118,20,197,39,237,311,186,1608,...,649,57,504,20,250,314,20,56,20,ALL


Using ALL_AML_allgenes.train.arff as train file and ALL_AML_allgenes.test.arff as test, build a model using OneR. What accuracy do you get?

![title](images/result_with_id.PNG)

Now, excluding the field ID, build models using OneR, NaiveBayes Simple, and J4.8, using training set only.

OneR :

![title](images/result_oneR_no_id.PNG)

NaiveBayes :

![title](images/result_NB_no_id.PNG)

J4.8 :

![title](images/result_j48_no_id.PNG)

What three things are important in the process of data mining?