In [1]:
import os.path

from collections import OrderedDict
import pandas as pd
import numpy as np

dtypes = OrderedDict(
    [
        ("object_id", "int32"),
        ("mjd", "float32"),
        ("passband", "int32"),
        ("flux", "float32"),
        ("flux_err", "float32"),
        ("detected", "int32"),
    ]
)

columns_names = [
    "object_id",
    "ra",
    "decl",
    "gal_l",
    "gal_b",
    "ddf",
    "hostgal_specz",
    "hostgal_photoz",
    "hostgal_photoz_err",
    "distmod",
    "mwebv",
    "target",
]
meta_dtypes = ["int32"] + ["float32"] * 4 + ["int32"] + ["float32"] * 5 + ["int32"]
meta_dtypes = OrderedDict(
    [(columns_names[i], meta_dtypes[i]) for i in range(len(meta_dtypes))]
)

path = "/localdisk/gregory/benchmark-datasets"

In [2]:
def print_all(df):
    def print_df(s, name):
        print(name, " = \n", s)
        print(name, "shape = ", s.shape)

    dt = df.dtypes
    print_df(dt, "dtypes")
    mi = df.min()
    print_df(mi, "minimum")
    ma = df.max()
    print_df(ma, "maximum")

    result = pd.DataFrame({"types": dt, "min": mi, "max": ma})
    result = result.reindex(dt.index)
    print_df(result, "result")

    for index, row in result.iterrows():
        print("\"{0}\": (\"{1}\", {2}, {3}),".format(index,
            row[0],
            np.int64(row[1]) if row[0].name == "int64" or row[0].name == "int32" else row[1],
            np.int64(row[2]) if row[0].name == "int64" or row[0].name == "int32" else row[2]))

In [3]:
train_df = pd.read_csv(os.path.join(path, "plasticc", "training_set.csv"), dtype=dtypes)

train_df

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.421875,2,-544.810303,3.622952,1
1,615,59750.429688,1,-816.434326,5.553370,1
2,615,59750.437500,3,-471.385529,3.801213,1
3,615,59750.445312,4,-388.984985,11.395031,1
4,615,59752.406250,2,-681.858887,4.041204,1
...,...,...,...,...,...,...
1421700,130779836,60555.984375,4,-39.881969,46.477093,0
1421701,130779836,60560.046875,1,14.894439,18.947685,0
1421702,130779836,60571.023438,5,30.593130,50.695290,0
1421703,130779836,60585.996094,4,-23.471439,44.819859,0


In [4]:
print("training_set!")
print_all(train_df)

training_set!
dtypes  = 
 object_id      int32
mjd          float32
passband       int32
flux         float32
flux_err     float32
detected       int32
dtype: object
dtypes shape =  (6,)
minimum  = 
 object_id    6.150000e+02
mjd          5.958004e+04
passband     0.000000e+00
flux        -1.149388e+06
flux_err     4.637530e-01
detected     0.000000e+00
dtype: float64
minimum shape =  (6,)
maximum  = 
 object_id    1.307798e+08
mjd          6.067436e+04
passband     5.000000e+00
flux         2.432809e+06
flux_err     2.234069e+06
detected     1.000000e+00
dtype: float64
maximum shape =  (6,)
result  = 
              types           min           max
object_id    int32  6.150000e+02  1.307798e+08
mjd        float32  5.958004e+04  6.067436e+04
passband     int32  0.000000e+00  5.000000e+00
flux       float32 -1.149388e+06  2.432809e+06
flux_err   float32  4.637530e-01  2.234069e+06
detected     int32  0.000000e+00  1.000000e+00
result shape =  (6, 3)
"object_id": ("int32", 615, 130779836

In [5]:
test_df = pd.read_csv(
    os.path.join(path, "plasticc", "test_set.csv"),
    names=list(dtypes.keys()),
    header=1,
    dtype=dtypes)

test_df

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,13,59798.328125,1,-2.095392,1.148654,0
1,13,59798.335938,3,-0.923794,1.763655,0
2,13,59798.347656,4,-4.009815,2.602911,0
3,13,59798.359375,5,-3.403503,5.367328,0
4,13,59801.355469,2,-1.778855,2.448943,0
...,...,...,...,...,...,...
453653098,104853812,60529.097656,1,23.212770,4.839886,0
453653099,104853812,60540.089844,5,72.878868,21.710670,0
453653100,104853812,60544.035156,3,48.116238,5.542284,1
453653101,104853812,60548.023438,1,2.741242,4.695860,0


In [6]:
print("test_set!")
print_all(test_df)

test_set!
dtypes  = 
 object_id      int32
mjd          float32
passband       int32
flux         float32
flux_err     float32
detected       int32
dtype: object
dtypes shape =  (6,)
minimum  = 
 object_id    1.300000e+01
mjd          5.958004e+04
passband     0.000000e+00
flux        -8.935484e+06
flux_err     4.637530e-01
detected     0.000000e+00
dtype: float64
minimum shape =  (6,)
maximum  = 
 object_id    1.307881e+08
mjd          6.067436e+04
passband     5.000000e+00
flux         1.367579e+07
flux_err     1.379167e+07
detected     1.000000e+00
dtype: float64
maximum shape =  (6,)
result  = 
              types           min           max
object_id    int32  1.300000e+01  1.307881e+08
mjd        float32  5.958004e+04  6.067436e+04
passband     int32  0.000000e+00  5.000000e+00
flux       float32 -8.935484e+06  1.367579e+07
flux_err   float32  4.637530e-01  1.379167e+07
detected     int32  0.000000e+00  1.000000e+00
result shape =  (6, 3)
"object_id": ("int32", 13, 130788054),
"m

In [7]:
train_meta_df = pd.read_csv(
    os.path.join(path, "plasticc", "training_set_metadata.csv"), dtype=meta_dtypes)

train_meta_df

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.796539,-51.753708,1,0.0000,0.0000,0.0000,,0.017,92
1,713,53.085938,-27.784405,223.525513,-54.460747,1,1.8181,1.6267,0.2552,45.406300,0.007,88
2,730,33.574219,-6.579593,170.455582,-61.548218,1,0.2320,0.2262,0.0157,40.256100,0.021,42
3,745,0.189873,-45.586655,328.254456,-68.969299,1,0.3037,0.2813,1.1523,40.795101,0.007,90
4,1124,352.711273,-63.823658,316.922302,-51.059402,1,0.1934,0.2415,0.0176,40.416599,0.024,90
...,...,...,...,...,...,...,...,...,...,...,...,...
7843,130739978,26.718750,-14.940303,172.342697,-72.255676,0,0.0000,0.0000,0.0000,,0.013,65
7844,130755807,120.101349,-62.696659,275.742950,-16.509747,0,0.1725,2.5606,1.1146,46.610802,0.136,90
7845,130762946,203.108109,-55.682144,308.728912,6.727511,0,0.0000,0.0000,0.0000,,0.430,16
7846,130772921,79.101562,-35.501846,239.172241,-33.827843,0,0.0000,0.0000,0.0000,,0.034,65


In [8]:
print("training_set_metadata!")
print_all(train_meta_df)

training_set_metadata!
dtypes  = 
 object_id               int32
ra                    float32
decl                  float32
gal_l                 float32
gal_b                 float32
ddf                     int32
hostgal_specz         float32
hostgal_photoz        float32
hostgal_photoz_err    float32
distmod               float32
mwebv                 float32
target                  int32
dtype: object
dtypes shape =  (12,)
minimum  = 
 object_id             615.000000
ra                      0.175781
decl                  -64.760857
gal_l                   0.107681
gal_b                 -89.615570
ddf                     0.000000
hostgal_specz           0.000000
hostgal_photoz          0.000000
hostgal_photoz_err      0.000000
distmod                31.996099
mwebv                   0.003000
target                  6.000000
dtype: float64
minimum shape =  (12,)
maximum  = 
 object_id             1.307798e+08
ra                    3.598242e+02
decl                  4.181528e+00
gal_

In [9]:
target = meta_dtypes.pop("target")
test_meta_df = pd.read_csv(
    os.path.join(path, "plasticc", "test_set_metadata.csv"), dtype=meta_dtypes)

test_meta_df

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv
0,13,34.453125,-5.229529,169.987076,-59.956184,1,0.3048,0.3193,0.0542,41.112301,0.019
1,14,33.398438,-4.331149,167.226334,-59.936550,1,,0.6323,0.0179,42.877399,0.018
2,17,348.529419,-61.755440,321.293976,-51.763351,1,,0.8297,0.0605,43.599998,0.016
3,23,34.804688,-5.829153,171.307861,-60.174400,1,,0.6533,0.1479,42.964001,0.023
4,34,351.321442,-64.198746,317.458984,-50.429932,1,0.4557,0.4617,0.0122,42.054001,0.023
...,...,...,...,...,...,...,...,...,...,...,...
3492885,130787966,67.500000,-23.806295,221.904510,-40.940434,0,,0.4493,0.9954,41.983601,0.036
3492886,130787971,98.789062,-32.974850,241.585052,-17.528223,0,,0.6729,0.0614,43.041901,0.083
3492887,130787974,133.945312,-21.542267,247.349365,15.069447,0,,0.1211,0.0093,38.760399,0.136
3492888,130788053,199.160156,-0.895283,316.152863,61.327850,0,,0.4287,0.2616,41.862499,0.028


In [10]:
print("test_set_metadata!")
print_all(test_meta_df)

test_set_metadata!
dtypes  = 
 object_id               int32
ra                    float32
decl                  float32
gal_l                 float32
gal_b                 float32
ddf                     int32
hostgal_specz         float32
hostgal_photoz        float32
hostgal_photoz_err    float32
distmod               float32
mwebv                 float32
dtype: object
dtypes shape =  (11,)
minimum  = 
 object_id             13.000000
ra                     0.000000
decl                 -64.760857
gal_l                  0.010369
gal_b                -89.674416
ddf                    0.000000
hostgal_specz          0.007700
hostgal_photoz         0.000000
hostgal_photoz_err     0.000000
distmod               27.646200
mwebv                  0.002000
dtype: float64
minimum shape =  (11,)
maximum  = 
 object_id             1.307881e+08
ra                    3.598242e+02
decl                  4.181528e+00
gal_l                 3.599955e+02
gal_b                 6.606870e+01
ddf         