In [None]:
import os
import pdb
import traceback

import numpy as np

from sklearn.svm import SVC
from sklearn.datasets import dump_svmlight_file
from sklearn.datasets import load_svmlight_file

# Root of the project tree. Defaults to the original fixed location but can be
# redirected with MLLS_BASE_DIR so the notebook runs on other machines.
BASE_DIR = os.environ.get("MLLS_BASE_DIR", "/var/data/users/sieradzki/mlls")

# Each sub-directory defaults to a folder under BASE_DIR and can be overridden
# individually through its own environment variable.
DATA_DIR = os.environ.get("MLLS_DATA_DIR", os.path.join(BASE_DIR, "data"))
CACHE_DIR = os.environ.get("MLLS_CACHE_DIR", os.path.join(BASE_DIR, "cache"))
RESULTS_DIR = os.environ.get("MLLS_RESULTS_DIR", os.path.join(BASE_DIR, "results"))
LOG_DIR = os.environ.get("MLLS_LOG_DIR", os.path.join(BASE_DIR, "log"))

# Number of comma-separated fingerprint values expected on every input line;
# also the expected column count of the converted feature matrices.
MACCS_SIZE = 161

In [None]:
# Convert every "<prefix>_actives_..." / "<prefix>_inactives_..." pair found in
# DATA_DIR/orig into a single libsvm file DATA_DIR/new_maccs/<prefix>_MACCS.libsvm.
# Rows from the actives file are labelled 1, rows from the inactives file 0.
orig_dir = os.path.join(DATA_DIR, "orig")
out_dir = os.path.join(DATA_DIR, "new_maccs")

# The output directory may not exist on a fresh data tree; create it up front
# so the first dump does not fail.
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# A sorted list (rather than a lazy filter object) gives a deterministic
# processing order and supports len() on both Python 2 and Python 3.
active_files = sorted(name for name in os.listdir(orig_dir) if "_actives" in name)
n = len(active_files)

for i, active_file in enumerate(active_files):

    name_parts = active_file.split("_")

    # The matching inactives file keeps the first token and the last three
    # underscore-separated tokens of the actives file name.
    inactive_file_name = name_parts[0] + "_inactives_" + "_".join(name_parts[-3:])
    inactive_file = os.path.join(orig_dir, inactive_file_name)
    assert os.path.exists(inactive_file), inactive_file

    X = []
    y = []
    n_lines = 0

    # Actives first (label 1), then inactives (label 0).
    for path, label in ((os.path.join(orig_dir, active_file), 1), (inactive_file, 0)):
        with open(path, 'r') as f:
            lines = f.read().splitlines()
        n_lines += len(lines)

        for line in lines:
            # The fingerprint is the comma-separated field after the first ':'.
            bits = line.split(":")[1].split(',')
            assert len(bits) == MACCS_SIZE, len(bits)
            X.append(bits)
            y.append(label)

    # checks
    assert len(X) == n_lines

    # make arrays
    X = np.array(X, dtype=int)
    y = np.array(y, dtype=int)

    # more checks
    assert X.shape[0] == y.shape[0] == n_lines
    assert X.shape[1] == MACCS_SIZE

    # dump libsvm file
    dump_file_name = name_parts[0] + "_MACCS.libsvm"
    with open(os.path.join(out_dir, dump_file_name), 'wb') as f:
        dump_svmlight_file(X, y, f, zero_based=True)

    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print("Done: %d/%d" % (i + 1, n))

In [None]:
# Check saved files: reload every file in DATA_DIR/new_maccs and verify that the
# sample and label counts agree and the feature matrix has MACCS_SIZE columns.
new_maccs_dir = os.path.join(DATA_DIR, "new_maccs")
new_files = sorted(os.listdir(new_maccs_dir))

for new_file in new_files:
    # n_features is passed explicitly: the libsvm format stores only non-zero
    # entries, so without it the loader infers the width from the highest index
    # present and a dataset whose trailing columns are all zero would load with
    # fewer than MACCS_SIZE columns.
    X, y = load_svmlight_file(
        os.path.join(new_maccs_dir, new_file),
        n_features=MACCS_SIZE,
        zero_based=True,
    )

    assert X.shape[0] == y.shape[0], (new_file, X.shape[0], y.shape[0])
    assert X.shape[1] == MACCS_SIZE, (new_file, X.shape[1])

print("Checked %d files" % len(new_files))