/
read_numerai.py
94 lines (80 loc) · 3.59 KB
/
read_numerai.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import sys
import numpy as np
import pandas as pd
from pprint import pprint
def read_numerai(path, num_examples, testing_percent, vector_labels):
    """Load Numerai CSV files and split them into train/test/tournament arrays.

    The training file ``numerai_training_data.csv`` is read from ``path``.
    The first ``train_count`` rows become the training set and the last
    ``test_count`` rows become the testing set, where
    ``train_count = int(num_examples * (1 - testing_percent))`` and
    ``test_count = num_examples - train_count``.  The optional file
    ``numerai_tournament_data.csv`` is loaded when it can be read.

    Args:
        path: Directory containing the CSV files (a trailing ``/`` is
            optional; an empty string means the current directory).
        num_examples: Total number of training-file rows to use.
        testing_percent: Fraction of ``num_examples`` held out for testing,
            in the range ``[0, 1)``.
        vector_labels: If truthy, labels are returned as two-column float
            rows (``[1, 0]`` for target 0, ``[0, 1]`` otherwise); if falsy,
            the raw ``target`` values are returned.

    Returns:
        A tuple ``(train_features, test_features, tourney_features,
        train_labels, test_labels, tourney_ids)``.  ``test_features`` and
        ``test_labels`` are empty lists when no rows are held out.
        ``tourney_features`` and ``tourney_ids`` are ``None`` when the
        tournament file could not be read.

    Raises:
        ValueError: If ``testing_percent`` is outside ``[0, 1)``, or if
            ``num_examples`` is negative or exceeds the number of rows in
            the training file.
    """
    # Check testing percentage (%s, not %d, so the offending float is shown
    # untruncated in the message)
    if testing_percent >= 1 or testing_percent < 0:
        raise ValueError("The testing percentage must be at least 0 and less than 1: %s" % testing_percent)

    # Adjust path ending if necessary; an empty path is left alone so the
    # files are looked up relative to the current directory
    if path and not path.endswith('/'):
        path = path + '/'

    # Set training and tournament file names
    training_filename = "numerai_training_data.csv"
    tournament_filename = "numerai_tournament_data.csv"

    # Read raw data; the tournament file is optional
    raw_training_data = pd.read_csv(path + training_filename)
    try:
        raw_tournament_data = pd.read_csv(path + tournament_filename)
    except IOError:
        raw_tournament_data = None

    # Make sure the number of samples is valid for the available data
    total_rows = raw_training_data.shape[0]
    if num_examples < 0:
        raise ValueError("The number of samples cannot be negative: %d" % num_examples)
    if num_examples > total_rows:
        raise ValueError("The number of samples cannot exceed the sum of the training and testing data: %d" % num_examples)

    # Set the number of training and testing examples
    train_count = int(num_examples * (1 - testing_percent))
    test_count = int(num_examples - train_count)

    # Extract training and testing features: every column except the last.
    # NOTE(review): this assumes 'target' is the last column of the
    # training file -- confirm against the data format.
    feature_frame = raw_training_data.iloc[:, :-1]
    train_features = feature_frame.iloc[:train_count].values
    if test_count != 0:
        test_features = feature_frame.iloc[-test_count:].values
    else:
        test_features = []

    # Extract tournament data: every column except the first is a feature.
    # NOTE(review): this assumes 't_id' is the first column of the
    # tournament file -- confirm against the data format.
    tourney_features = None
    tourney_ids = None
    if raw_tournament_data is not None:
        tourney_features = raw_tournament_data.iloc[:, 1:].values
        tourney_ids = raw_tournament_data['t_id'].values

    # Setup labels as single- or multi-dimensional structures.  Train and
    # test labels are sliced exactly like their feature rows so the counts
    # always match.
    targets = raw_training_data['target'].values
    encode = _one_hot_binary if vector_labels else (lambda values: values)
    train_labels = encode(targets[:train_count])
    if test_count != 0:
        test_labels = encode(targets[-test_count:])
    else:
        test_labels = []

    return train_features, test_features, tourney_features, train_labels, test_labels, tourney_ids


def _one_hot_binary(targets):
    """Encode targets as float rows: [1, 0] where the target is 0, else [0, 1]."""
    is_zero = np.asarray(targets) == 0
    return np.column_stack((is_zero, ~is_zero)).astype(float)
if __name__ == "__main__":
    # Command-line entry point: load the data with the given settings and
    # pretty-print every returned structure for manual inspection.
    if len(sys.argv) != 5:
        # sys.exit with a string writes the message to stderr and exits with
        # status 1; unlike the interactive `exit` helper it is always available.
        sys.exit("Please provide valid input parameters ([path to numerai files] [number of examples] [percentage of training for testing] [vector labels?])")

    path = sys.argv[1]
    num_examples = int(sys.argv[2])
    test_perc = float(sys.argv[3])
    # Passed as 0/1 on the command line; used only for its truthiness.
    vector_labels = int(sys.argv[4])

    train_features, test_features, tourney_features, train_labels, test_labels, tourney_ids = read_numerai(path, num_examples, test_perc, vector_labels)

    pprint(train_features)
    pprint(test_features)
    pprint(tourney_features)
    pprint(train_labels)
    pprint(test_labels)
    pprint(tourney_ids)