## Load and Convert Train
### Chengxuan Ying

In [1]:
from tqdm import tqdm

# Parse the training CSV into three parallel lists: `features` (one list of
# floats per row), `targets` (integer class ids) and `rowids` (id strings).
# Row layout implied by the indexing below: every column except the last two
# is a numeric feature, the second-to-last is the class name and the last is
# the row id.

# Class name as written in the CSV -> integer label.
name2id = {
    'star': 0,
    'galaxy': 1,
    'qso': 2,
}
filename = 'trainval/update_new_columns_trains_sets.csv'

features = []
targets = []
rowids = []

with open(filename) as f:
    # tqdm is used only for progress reporting; no line index is needed.
    for line in tqdm(f):
        # Skip every line containing 'FE' (presumably header lines that name
        # the feature columns -- TODO confirm against the file).
        if 'FE' in line:
            continue

        # Remove the line terminator before splitting; otherwise the last
        # field (the row id) would carry a trailing newline.
        line = line.rstrip('\r\n')
        if not line:
            # Tolerate blank lines (e.g. at the end of the file).
            continue

        contents = line.split(',')

        feature = [float(x) for x in contents[:-2]]
        target = name2id[contents[-2]]  # KeyError on an unknown class name
        rowid = contents[-1]

        features.append(feature)
        targets.append(target)
        rowids.append(rowid)

573475it [12:07, 787.78it/s]


In [2]:
import numpy as np
def _to_npy(path, rows, dtype):
    """Convert `rows` to an array of `dtype`, print its shape, save it to `path` and return it."""
    arr = np.array(rows, dtype=dtype)
    print(arr.shape)
    np.save(path, arr)
    return arr


# Training features are stored as float32, class ids as int8.
features_np = _to_npy('train_features.npy', features, np.float32)
targets_np = _to_npy('train_targets.npy', targets, np.int8)

(573417, 2600)
(573417,)
(573417,)


## Load and Convert Val

In [4]:
from tqdm import tqdm
# Parse the validation CSV into `features` (one list of floats per row) and
# `rowids` (id strings). Row layout implied by the indexing below: every
# column except the last is a numeric feature, the last is the row id.
# `targets` is reset here but not filled by this cell.
filename = 'trainval/val_sets_v1.csv'

features = []
targets = []
rowids = []

with open(filename) as f:
    # tqdm is used only for progress reporting; no line index is needed.
    for line in tqdm(f):
        # Skip every line containing 'FE' (presumably the header naming the
        # feature columns -- TODO confirm against the file).
        if 'FE' in line:
            continue

        # Remove the line terminator before splitting; otherwise the last
        # field (the row id) would carry a trailing newline.
        line = line.rstrip('\r\n')
        if not line:
            # A blank line would otherwise add an empty feature row and make
            # the later array conversion fail.
            continue

        contents = line.split(',')

        feature = [float(x) for x in contents[:-1]]
        rowid = contents[-1]

        features.append(feature)
        rowids.append(rowid)

190625it [03:54, 811.64it/s]


In [5]:
import numpy as np
# Turn the parsed validation rows into one float32 matrix, report its shape
# and write it to val_features.npy.
features_np = np.array(features, dtype=np.float32)
val_shape = features_np.shape
print(val_shape)
np.save('val_features.npy', features_np)

(190624, 2600)


In [7]:
# Read the validation labels, map each class name to its integer id and save
# the result as an int8 array.
# NOTE(review): nothing here checks that the label rows are in the same order
# as the feature rows of the validation CSV -- confirm before relying on it.
targets = []
filename = 'trainval/val_labels_v1.csv'
name2id = {'star': 0, 'galaxy': 1, 'qso': 2}

with open(filename) as f:
    for lineno, row in tqdm(enumerate(f)):
        # Lines containing 'id' are skipped (presumably the header -- TODO confirm).
        if 'id' not in row:
            # The class name is the last comma-separated field.
            label = row.strip().split(',')[-1]
            targets.append(name2id[label])

targets_np = np.array(targets, dtype=np.int8)
print(targets_np.shape)
np.save('val_targets.npy', targets_np)

190625it [00:00, 768272.80it/s]

(190624,)





## Load and Convert Test

In [8]:
from tqdm import tqdm
# Parse the ten test CSV shards (test/test_sets_0.csv .. test/test_sets_9.csv)
# into `features` (one list of floats per row) and `rowids` (id strings), in
# shard order. Row layout implied by the indexing below: every column except
# the last is a numeric feature, the last is the row id.
filenames = ['test/test_sets_'+str(i)+'.csv' for i in range(10)]

features = []
rowids = []

for filename in filenames:
    with open(filename) as f:
        # tqdm is used only for progress reporting; no line index is needed.
        for line in tqdm(f):
            # Skip every line containing 'FE' (presumably each shard's header
            # naming the feature columns -- TODO confirm against the files).
            if 'FE' in line:
                continue

            # Remove the line terminator before splitting; otherwise every
            # stored row id would end with a newline character.
            line = line.rstrip('\r\n')
            if not line:
                # A blank line would otherwise add an empty feature row and
                # make the later array conversion fail.
                continue

            contents = line.split(',')

            feature = [float(x) for x in contents[:-1]]
            rowid = contents[-1]

            features.append(feature)
            rowids.append(rowid)

20001it [00:22, 872.81it/s]
20001it [00:26, 764.43it/s]
20001it [00:25, 772.02it/s]
20001it [00:23, 839.11it/s]
20001it [00:25, 778.96it/s]
20001it [00:24, 822.39it/s]
20001it [00:24, 800.49it/s]
20001it [00:31, 636.93it/s]
20001it [00:23, 839.30it/s]
10625it [00:12, 851.83it/s]


In [9]:
import numpy as np
# Test features: one float32 matrix, written to test_features.npy.
features_np = np.array(features, dtype=np.float32)
feature_shape = features_np.shape
print(feature_shape)
np.save('test_features.npy', features_np)

# Test row ids: converted with NumPy's default dtype inference and written
# to test_rowids.npy.
rowids_np = np.array(rowids)
rowid_shape = rowids_np.shape
print(rowid_shape)
np.save('test_rowids.npy', rowids_np)

(190624, 2600)
(190624,)


## Trainval Split

In [10]:
import numpy as np
# Reload the arrays written by the earlier cells.
# Features are requested as float32; copy=False lets astype return the loaded
# array itself when it already has that dtype, instead of duplicating a
# multi-gigabyte matrix in memory. The values are identical either way.
tra_features = np.load('train_features.npy').astype(np.float32, copy=False)
# Targets are widened to int64.
tra_targets = np.load('train_targets.npy').astype(np.int64)
val_features = np.load('val_features.npy').astype(np.float32, copy=False)
val_targets = np.load('val_targets.npy').astype(np.int64)
# Displayed as the cell output.
tra_features.shape, val_features.shape

((573417, 2600), (190624, 2600))

In [11]:
# Append the validation rows below the training rows (row-wise concatenation).
all_features = np.concatenate((tra_features, val_features), axis=0)
all_features.shape

(764041, 2600)

In [14]:
# Join the training and validation labels in the same order as the features,
# then add a trailing axis so the result is a single-column 2-D array.
all_targets = np.concatenate((tra_targets, val_targets))[:, None]
all_targets.shape

(764041, 1)

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the combined rows; random_state is fixed so the split is repeatable.
_split = train_test_split(
    all_features,
    all_targets,
    test_size=0.1,
    random_state=42,
)
# Unpacked in the order the original cell used: features (train, held-out),
# then targets (train, held-out).
traval_tra_features, traval_val_features, traval_tra_targets, traval_val_targets = _split

In [17]:
# Show the shapes of the four split arrays as the cell output.
tuple(
    part.shape
    for part in (
        traval_tra_features,
        traval_val_features,
        traval_tra_targets,
        traval_val_targets,
    )
)

((687636, 2600), (76405, 2600), (687636, 1), (76405, 1))

In [18]:
# Write each split array to its own .npy file, in the same order as before.
_split_outputs = {
    'traval_tra_features.npy': traval_tra_features,
    'traval_val_features.npy': traval_val_features,
    'traval_tra_targets.npy': traval_tra_targets,
    'traval_val_targets.npy': traval_val_targets,
}
for out_path, split_arr in _split_outputs.items():
    np.save(out_path, split_arr)