In [9]:
import re
import os
import requests
import zipfile
import math
import pandas as pd
import numpy as np
from tqdm import tqdm

In [41]:
# Base URL the dataset is downloaded from (the file name is appended to it).
root_url = 'http://www.kdnuggets.com/data_mining_course/data/'
# Local folder used for the downloaded file and for the reduced export.
root_data = 'data'
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
train_data_file = 'ALL_AML_gr.thr.train.csv'
# Name of the dataset file, both remotely and in the local data folder.
data_file = 'genes-leukemia.csv'
# Name of the CSV written after the single-valued columns have been removed.
reduced_data_file = 'genes-reduced.csv'

In [46]:
def unzip_file(file_addr,output_dir,remove=False) :
    """Extract a zip archive and list the contents of the destination folder.

    Parameters
    ----------
    file_addr : str
        Path of the zip archive to extract.
    output_dir : str
        Folder to extract into. An empty string means the current working
        directory.
    remove : bool, optional
        When True, delete the archive once it has been extracted.

    Returns
    -------
    list of str
        ``os.listdir`` of the destination folder after extraction, so it also
        contains entries that were already there, not only the extracted
        members.
    """
    # extractall('') extracts into the current directory, but os.listdir('')
    # raises FileNotFoundError; normalise the empty string so both agree.
    target_dir = output_dir or '.'
    with zipfile.ZipFile(file_addr,"r") as zip_ref:
        zip_ref.extractall(target_dir)
    if remove :
        os.remove(file_addr)
    return os.listdir(target_dir)

def download_file(url,output_file,unzip=False,output_dir='') :
    """Download ``url`` to ``output_dir/output_file`` unless it is already there.

    Parameters
    ----------
    url : str
        Address to fetch.
    output_file : str
        File name to store the download under.
    unzip : bool, optional
        When True, the stored file is extracted into ``output_dir`` with
        ``unzip_file`` and the archive is deleted afterwards.
    output_dir : str, optional
        Destination folder; created if missing. An empty string means the
        current working directory.

    Returns
    -------
    str or list of str
        The path of the stored file, or, when ``unzip`` is True, the listing
        returned by ``unzip_file``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.
    """
    target_path = os.path.join(output_dir,output_file)
    if not os.path.exists(target_path) :
        # Make sure the destination folder exists so a fresh checkout works.
        if output_dir :
            os.makedirs(output_dir, exist_ok=True)

        block_size = 1024
        # Stream into a side file and rename it only once the transfer has
        # finished, so an interrupted run never leaves a truncated file that
        # the existence check above would later mistake for a finished one.
        partial_path = target_path + '.part'

        # Streaming, so we can iterate over the response.
        r = requests.get(url, stream=True, timeout=60)
        try:
            # Reject 4xx/5xx answers instead of saving the error page as data.
            r.raise_for_status()

            # Total size in bytes (0 when the server does not announce it).
            total_size = int(r.headers.get('content-length', 0))
            # Number of blocks rounded up; an unknown size is passed as None.
            total_blocks = math.ceil(total_size / block_size) if total_size else None
            wrote = 0
            with open(partial_path, 'wb') as f:
                for data in tqdm(r.iter_content(block_size), total=total_blocks, unit='KB', unit_scale=True):
                    wrote = wrote + len(data)
                    f.write(data)
            os.replace(partial_path, target_path)
        finally:
            r.close()
            # Only present here when the transfer did not complete.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        # NOTE(review): kept as a warning rather than an exception because the
        # byte count may legitimately differ from content-length when the
        # server compresses the body — confirm before making this fatal.
        if total_size != 0 and wrote != total_size:
            print("ERROR, something went wrong")
    if unzip :
        # os.listdir('') fails, so hand the current directory over as '.'.
        return unzip_file(target_path,output_dir or '.',True)
    return target_path

# Fetch the dataset into the local data folder; the download is skipped when
# the file is already present there.
file = download_file(
    url=root_url + data_file,
    output_file=data_file,
    unzip=False,
    output_dir=root_data,
)

In [47]:
# Load the downloaded CSV into a DataFrame and show it as the cell output.
data = pd.read_csv(file)
data

Unnamed: 0,SNUM,CLASS,BM_PB,TB_if_ALL,FAB_if_AML,Year,Gender,pct_Blasts,Treatment_Response,PS,...,AF012024_s,D26156_s,M84371_rna1_s,M31211_s,U09087_s,U26266_s,L49229_f,M31523,M28170,U29175
0,s1,ALL,BM,B-cell,?,1996,M,?,?,1.00,...,257,1595,2911,601,358,289,337,1320,397,1582
1,s2,ALL,BM,T-cell,?,?,M,?,?,0.41,...,46,822,575,435,82,288,131,898,20,624
2,s3,ALL,BM,T-cell,?,?,M,?,?,0.87,...,139,1452,905,547,263,447,529,597,183,753
3,s4,ALL,BM,B-cell,?,?,?,?,?,0.91,...,168,654,2038,472,218,424,422,1644,363,743
4,s5,ALL,BM,B-cell,?,?,?,?,?,0.89,...,94,1011,1871,661,186,364,354,1322,251,626
5,s6,ALL,BM,T-cell,?,?,M,?,?,0.76,...,197,1584,634,337,209,320,327,787,74,1157
6,s7,ALL,BM,B-cell,?,1983,F,?,?,0.78,...,188,578,2364,309,144,381,354,946,280,552
7,s8,ALL,BM,B-cell,?,?,F,?,?,0.77,...,61,1024,1409,263,167,53,232,1917,235,572
8,s9,ALL,BM,T-cell,?,?,M,?,?,0.89,...,269,1297,644,978,385,869,568,1440,24,1776
9,s10,ALL,BM,T-cell,?,1987,M,?,?,0.56,...,99,971,358,752,146,403,20,442,20,756


In [48]:
# Keep only the samples whose treatment outcome is recorded; the file marks a
# missing value with a literal '?'.
none_missing_tr_data = data.loc[data['Treatment_Response'].ne('?')]
none_missing_tr_data

Unnamed: 0,SNUM,CLASS,BM_PB,TB_if_ALL,FAB_if_AML,Year,Gender,pct_Blasts,Treatment_Response,PS,...,AF012024_s,D26156_s,M84371_rna1_s,M31211_s,U09087_s,U26266_s,L49229_f,M31523,M28170,U29175
27,s28,AML,BM,?,M2,?,?,79,Failure,0.44,...,43,893,443,300,91,151,20,353,20,429
28,s29,AML,BM,?,M2,?,?,34,Failure,0.74,...,50,624,225,237,58,66,20,279,35,314
29,s30,AML,BM,?,M5,?,?,93,Failure,0.8,...,20,388,251,100,20,20,60,250,20,208
30,s31,AML,BM,?,M4,?,?,77,Failure,0.61,...,45,432,595,192,61,20,56,381,20,385
31,s32,AML,BM,?,M1,?,?,86,Failure,0.47,...,68,706,863,339,50,192,88,671,20,688
32,s33,AML,BM,?,M2,?,?,70,Failure,0.89,...,20,736,678,59,20,20,20,200,20,248
33,s34,AML,BM,?,M2,?,?,77,Success,0.64,...,28,302,469,153,34,282,20,299,20,454
34,s35,AML,BM,?,M1,?,?,67,Success,0.21,...,50,201,548,20,48,43,38,389,67,391
35,s36,AML,BM,?,M5,?,?,76,Success,0.94,...,29,667,763,88,74,64,20,126,20,412
36,s37,AML,BM,?,M2,?,?,44,Success,0.95,...,98,697,466,139,52,20,93,190,20,393


In [49]:
# Report how many samples survived the Treatment_Response filter.
print(
    'There is {} samples with none missing treatment_response value in the data'.format(
        len(none_missing_tr_data)
    )
)

There is 15 samples with none missing treatment_response value in the data


In [50]:
# Summary statistics (count, mean, std, quartiles, min/max) for the filtered samples.
none_missing_tr_data.describe()

Unnamed: 0,PS,D49950,D63880,J03473,J05243,L13278,L47738,M21551_rna1,M55150,M62762,...,AF012024_s,D26156_s,M84371_rna1_s,M31211_s,U09087_s,U26266_s,L49229_f,M31523,M28170,U29175
count,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,...,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0
mean,0.726,230.333333,200.266667,542.066667,151.466667,24.933333,255.8,429.8,1717.866667,3069.733333,...,44.6,592.0,542.533333,149.466667,45.066667,95.533333,41.266667,283.133333,31.733333,359.8
std,0.22959,97.389987,100.603512,204.579871,158.522133,11.20884,214.475839,114.143519,423.082036,976.326897,...,24.094457,270.763207,198.136269,95.727938,21.137194,110.494904,24.760183,143.173654,25.417842,122.862409
min,0.21,70.0,20.0,109.0,20.0,20.0,20.0,259.0,801.0,1125.0,...,20.0,20.0,225.0,20.0,20.0,20.0,20.0,110.0,20.0,190.0
25%,0.61,149.5,133.5,428.0,52.5,20.0,91.5,340.5,1524.0,2411.0,...,28.5,410.0,454.5,72.5,26.5,20.0,20.0,193.5,20.0,268.0
50%,0.74,259.0,185.0,500.0,102.0,20.0,201.0,415.0,1707.0,3266.0,...,41.0,667.0,530.0,139.0,48.0,43.0,38.0,275.0,20.0,382.0
75%,0.915,304.0,278.0,673.0,221.5,21.0,447.5,486.0,1912.0,3573.5,...,50.0,728.5,677.0,214.5,55.0,135.0,54.0,367.0,27.5,402.5
max,1.0,355.0,410.0,871.0,628.0,60.0,683.0,672.0,2693.0,4647.0,...,98.0,1024.0,863.0,339.0,91.0,376.0,93.0,671.0,110.0,688.0


### Why is it not correct to build predictive models for TREATMENT_RESPONSE using records where it is missing?
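A record with a missing TREATMENT_RESPONSE has no known outcome, so it gives a supervised model nothing to learn from and nothing to be evaluated against. Treating the `?` placeholder as a third class, or filling it with a guessed value, would train the model on outcomes that were never observed. Such records are therefore removed before modelling.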


In [52]:
# Columns holding exactly one distinct value across the filtered samples are
# collected (in column order) and removed from the frame.
one_value_columns = [
    column
    for column, distinct_count in none_missing_tr_data.nunique().items()
    if distinct_count == 1
]
reduced_data = none_missing_tr_data.drop(one_value_columns,axis=1)

# Save the reduced frame next to the downloaded data.
reduced_path = os.path.join(root_data,reduced_data_file)
reduced_data.to_csv(reduced_path)

In [57]:
# List the columns that remain once the single-valued ones are dropped.
print(
    'Samples Field that i should keep : \n{}'.format(
        none_missing_tr_data.drop(one_value_columns, axis=1).columns.tolist()
    )
)

Samples Field that i should keep : 
['SNUM', 'BM_PB', 'FAB_if_AML', 'pct_Blasts', 'Treatment_Response', 'PS', 'D49950', 'D63880', 'J03473', 'J05243', 'L13278', 'L47738', 'M21551_rna1', 'M55150', 'M62762', 'M81933', 'M91432', 'S50223', 'U12471_cds1', 'U32944', 'U35451', 'U50136_rna1', 'U53468', 'U72342', 'U82759', 'X15949', 'X52142', 'X56411_rna1', 'X63469', 'X74262', 'X76061', 'X76648', 'X95735', 'Y08612', 'Y12670', 'D38073', 'AF012024_s', 'D26156_s', 'M84371_rna1_s', 'M31211_s', 'U09087_s', 'U26266_s', 'L49229_f', 'M31523', 'M28170', 'U29175']
