In [41]:
import numpy as np
import pandas as pd
from collections import OrderedDict

In [68]:
#read in the observations data. multiple observations for each of the ~8K objects
df = pd.read_csv('training_set.csv')

In [69]:
#easy dict for reassigning passband index to their letters
pbmap = OrderedDict([(0,'u'), (1,'g'), (2,'r'), (3,'i'), (4, 'z'), (5, 'Y')])
pbnames= list(pbmap.values())

In [76]:
#create the flux features we want and fix the columns. output should be rows of length~8K (one per object)
#and one column for each passband/feature combination: 30
feats = df.groupby(['object_id', 'passband'])['flux'].agg(['mean', 'median', 'std', 'max', 'min'])
feats = feats.unstack(level='passband')
newcols=[]
for x in feats.columns.levels[1]:
    newcols.append([y + "_" + pbmap[x] for y in feats.columns.levels[0]])
feats.columns =[item for sublist in newcols for item in sublist]
feats.head()

Unnamed: 0_level_0,mean_u,median_u,std_u,max_u,min_u,mean_g,median_g,std_g,max_g,min_g,...,mean_z,median_z,std_z,max_z,min_z,mean_Y,median_Y,std_Y,max_Y,min_Y
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,-3.254554,-385.699911,-134.146566,-121.103501,-55.954592,-47.449847,-10.015225,-488.057969,-265.686004,-162.170944,...,611.984558,445.737061,381.953735,378.188141,-116.913223,-1100.440063,-681.858887,-530.644592,-422.184509,-422.815094
713,-2.720398,-1.019804,-0.794238,-0.986966,-0.900262,-1.794175,-3.096804,-0.561735,-0.117977,-0.073896,...,10.529041,11.330316,9.827934,14.770886,-14.735178,-11.715749,-10.067919,-12.394593,-12.286801,-14.211164
730,-0.04808,0.141057,2.40087,3.236164,4.308728,4.539396,0.024093,0.171336,0.49179,0.660403,...,20.994711,33.572102,41.159981,47.310059,-3.45996,-3.39308,-2.848838,-5.435799,-5.83631,-19.159811
745,1.797523,5.717394,9.711532,14.412924,13.134436,10.746138,1.056714,0.888115,0.42436,1.361369,...,220.795212,203.250702,183.633118,141.51329,-3.874349,-3.61841,-2.159753,-4.944036,-15.494463,-10.249387
1124,0.660948,4.634637,10.243968,11.086555,9.906102,6.896742,0.581027,1.154596,0.889142,1.014105,...,106.671692,139.818405,143.600189,109.157585,-6.804703,-2.622109,-2.084535,-2.80027,-16.543753,-10.86054


In [77]:
#create the flux_error features we want and merge with the flux features above.

errs = df.groupby(['object_id', 'passband'])['flux_err'].agg(['std'])
errs = errs.unstack(level='passband')
newcols=[]
for x in errs.columns.levels[1]:
    newcols.append(["errstd" + "_" + pbmap[x] for y in errs.columns.levels[0]])
errs.columns =[item for sublist in newcols for item in sublist]

In [78]:
#output should be rows of length~8K (one per object)
#and one column for each passband/feature combination, now 36
feats = feats.merge(errs, left_index=True, right_index=True)

In [79]:
#read in metadata and merge that as well. output is now 8K rows and 48 columns: 
#30 flux feats, 6 flux_error feats, and 12 metadata columns
metadata = pd.read_csv('training_set_metadata.csv')
metadata.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


In [80]:
feats = feats.reset_index()
feats = feats.merge(metadata, left_on='object_id', right_on='object_id')

In [85]:
#Write to csv
feats.to_csv('trainingfeats.csv', index=False)
