# Download APK Dataset

In [29]:
import os
import configparser

# Load the project settings. ExtendedInterpolation is the interpolation handler
# passed to the parser, and allow_no_value accepts options that have no value.
CONFIG_FILE = 'config.ini'
config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation(), allow_no_value=True)

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail loudly here rather than with a bare
# KeyError on the first section lookup below.
if not config.read(CONFIG_FILE):
    raise FileNotFoundError("Could not read configuration file: " + os.path.abspath(CONFIG_FILE))

# Remote location of the gzip-compressed CSV list fetched by the download cell.
DATASET_URL = 'https://androzoo.uni.lu/static/lists/latest_with-added-date.csv.gz'

# Directory that receives the downloaded list and the files derived from it.
APK_DIR = config['PATHS']['apk_dir']

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(APK_DIR, exist_ok=True)

# Extension-less base path; the other cells append '.csv.gz', '.csv' and '.hdf5'.
# NOTE(review): the option name is spelled 'datset_name'. It is left untouched
# because it has to match config.ini; confirm whether the spelling is intended.
DATASET_PATH = os.path.join(APK_DIR, config['ANDROZOO']['datset_name'])

### Download

In [4]:
from urllib.request import urlretrieve
import progressbar

# Handle to the progress bar of the transfer in flight; None when idle.
pbar = None

def show_progress_bar(block_num, block_size, total_size):
    """Report hook for ``urlretrieve`` that draws a console progress bar.

    Args:
        block_num: Number of blocks transferred so far (0 on the initial call).
        block_size: Size of one block in bytes.
        total_size: Total size of the file in bytes, or a negative value when
            the server did not announce one.

    The bar is kept in the module-level ``pbar`` and is created lazily. It is
    finished and cleared once the byte count reaches ``total_size``.
    """
    global pbar
    size_known = total_size >= 0

    # block_num == 0 marks the start of a transfer, so a bar left behind by an
    # interrupted download is replaced instead of being reused with stale state.
    if pbar is None or block_num == 0:
        # Without an announced size the bar cannot show a percentage; use the
        # library's unknown-length mode rather than a bogus negative maximum.
        max_value = total_size if size_known else progressbar.UnknownLength
        pbar = progressbar.ProgressBar(max_value=max_value)
        pbar.start()

    # Upper bound on the bytes received; the last block may be shorter.
    downloaded = block_num * block_size
    if not size_known or downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

# Fetch the compressed list only when it is not on disk yet. The archive is
# deleted after unzipping, so an existing '.csv' or '.hdf5' file also means
# the download has already happened and need not be repeated.
archive_path = DATASET_PATH + '.csv.gz'
partial_path = archive_path + '.part'

already_fetched = any(
    os.path.exists(DATASET_PATH + extension)
    for extension in ('.csv.gz', '.csv', '.hdf5')
)

if not already_fetched:
    # Download under a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated file that looks like a complete archive.
    path, headers = urlretrieve(DATASET_URL, partial_path, show_progress_bar)
    os.replace(path, archive_path)
    path = archive_path

  0% (0 of 3278234940) |                 | Elapsed Time: 0:00:00 ETA:  --:--:--
  0% (16384 of 3278234940) |             | Elapsed Time: 0:00:00 ETA:   6:50:28
  0% (32768 of 3278234940) |             | Elapsed Time: 0:00:00 ETA:  12:01:55
  0% (49152 of 3278234940) |             | Elapsed Time: 0:00:00 ETA:  10:32:24
  0% (57344 of 3278234940) |             | Elapsed Time: 0:00:00 ETA:  11:27:14
  0% (81920 of 3278234940) |             | Elapsed Time: 0:00:00 ETA:   9:29:47
  0% (106496 of 3278234940) |            | Elapsed Time: 0:00:00 ETA:   8:26:51
  0% (131072 of 3278234940) |            | Elapsed Time: 0:00:01 ETA:   7:59:24
  0% (163840 of 3278234940) |            | Elapsed Time: 0:00:01 ETA:   7:32:22
  0% (196608 of 3278234940) |            | Elapsed Time: 0:00:01 ETA:   7:13:15
  0% (229376 of 3278234940) |            | Elapsed Time: 0:00:01 ETA:   6:43:32
  0% (278528 of 3278234940) |            | Elapsed Time: 0:00:01 ETA:   5:59:30
  0% (311296 of 3278234940) |           

KeyboardInterrupt: 

### Unzip

In [None]:
import gzip
import shutil
import os

# Expand the downloaded gzip archive into a plain CSV file next to it.
# Nothing happens when the archive is not on disk.
gz_path = DATASET_PATH+'.csv.gz'

if os.path.exists(gz_path):
    print("Unzipping...")
    # Stream the decompressed bytes straight into the output file.
    with gzip.open(gz_path, 'rb') as compressed, open(DATASET_PATH+'.csv', 'wb') as extracted:
        shutil.copyfileobj(compressed, extracted)

    # Only reached when the copy above did not raise: drop the archive.
    os.remove(gz_path)

    print("Done.")

### Convert to HDF5

In [3]:
import vaex
import os

# Source and destination of the conversion, both derived from the shared base path.
csv_file = DATASET_PATH+'.csv'
hdf5_file = DATASET_PATH+'.hdf5'

# The conversion runs only when the HDF5 file is not on disk yet.
if not os.path.exists(hdf5_file):
    # `hdf5` first refers to the frame opened from the CSV file; it is rebound
    # below to the frame opened from the exported HDF5 file.
    hdf5 = vaex.open(csv_file)
    print("CSV Shape:", hdf5.shape)

    hdf5.export(hdf5_file)

    # Re-open the exported file and print its shape for comparison with the CSV.
    hdf5 = vaex.open(hdf5_file)
    print("HDF5 Shape:", hdf5.shape)

CSV Shape: (23298026, 12)


  self.to_array[target_set_item] = values


HDF5 Shape: (23298026, 12)


### Separate Benign and Malign Samples

In [33]:
import vaex

# Open the converted list and build a small labelled sample from it.
apk_list = vaex.open(DATASET_PATH+'.hdf5')

# NOTE(review): 'vt_detection' is presumably the number of VirusTotal engines
# that flagged the APK; confirm against the AndroZoo list documentation.
detections = apk_list['vt_detection']

# First 10 rows with zero detections are taken as the benign samples.
benign = apk_list[detections == 0.0].head(10)

# First 10 rows with more than 5 detections are taken as the malign samples.
malign = apk_list[detections > 5].head(10)

# Stack both groups (benign first) and write them to sample.csv in APK_DIR.
sample = vaex.concat([benign, malign])
sample.export_csv(os.path.join(APK_DIR, 'sample.csv'))