In [52]:
%matplotlib inline

import os
from collections import Counter, OrderedDict
import numpy as np
import matplotlib.pyplot as plt
from cesium.time_series import TimeSeries
import cesium.featurize as featurize
from tqdm import tnrange, tqdm_notebook
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import pandas as pd
pd.options.mode.use_inf_as_na = True

In [44]:
metafilename = 'training_set_metadata.csv'
metadata = pd.read_csv(metafilename)
metadata.head(1)

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92


In [45]:
#Load previously generated cesium feature file
featurefile = 'C:/Users/Greg/Documents/Personal/PLAsTiCC/plasticc_featuretablemore.npz'
featuretable, _ = featurize.load_featureset(featurefile)
featuretable = featuretable.reset_index(level=0)
featuretable.head()

feature,index,all_times_nhist_numpeaks,all_times_nhist_numpeaks,all_times_nhist_numpeaks,all_times_nhist_numpeaks,all_times_nhist_numpeaks,all_times_nhist_numpeaks,all_times_nhist_peak_val,all_times_nhist_peak_val,all_times_nhist_peak_val,...,std,weighted_average,weighted_average,weighted_average,weighted_average,weighted_average,weighted_average,mwebv,z,zerr
channel,Unnamed: 1_level_1,0,1,2,3,4,5,0,1,2,...,5,0,1,2,3,4,5,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,615,18.0,3.0,3.0,3.0,3.0,4.0,0.006097,0.00405,0.00405,...,292.182295,-17.061118,-212.397193,-102.220639,-101.206639,-54.744845,-59.688379,0.017,0.0,0.0
1,713,15.0,3.0,3.0,3.0,3.0,3.0,0.006491,0.004738,0.004738,...,7.030448,-3.500958,-1.322397,-1.030469,-1.382941,-1.407879,-1.876399,0.007,1.6267,0.2552
2,730,16.0,7.0,7.0,7.0,7.0,7.0,0.006176,0.00424,0.00424,...,13.201397,-0.016423,-0.03417,2.059833,2.988513,4.486335,5.05769,0.021,0.2262,0.0157
3,745,18.0,4.0,4.0,4.0,4.0,4.0,0.005318,0.004374,0.004374,...,25.822133,1.176322,3.652226,6.716857,12.514694,12.247387,8.760515,0.007,0.2813,1.1523
4,1124,18.0,3.0,3.0,3.0,3.0,4.0,0.006097,0.00405,0.00405,...,21.245772,0.82438,3.617169,7.842645,8.830427,8.463856,5.602845,0.024,0.2415,0.0176


In [46]:
#Flatten index of the column names
old_names = featuretable.columns.values
pbmap = OrderedDict([(0,'u'), (1,'g'), (2,'r'), (3,'i'), (4, 'z'), (5, 'Y')])
pbnames = list(pbmap.values())
new_names = ['{}_{}'.format(x, pbmap.get(y,'meta')) for x,y in old_names]

In [47]:
featuretable.columns = featuretable.columns.droplevel(0)
featuretable.columns = new_names
featuretable.head()

Unnamed: 0,index_meta,all_times_nhist_numpeaks_u,all_times_nhist_numpeaks_g,all_times_nhist_numpeaks_r,all_times_nhist_numpeaks_i,all_times_nhist_numpeaks_z,all_times_nhist_numpeaks_Y,all_times_nhist_peak_val_u,all_times_nhist_peak_val_g,all_times_nhist_peak_val_r,...,std_Y,weighted_average_u,weighted_average_g,weighted_average_r,weighted_average_i,weighted_average_z,weighted_average_Y,mwebv_meta,z_meta,zerr_meta
0,615,18.0,3.0,3.0,3.0,3.0,4.0,0.006097,0.00405,0.00405,...,292.182295,-17.061118,-212.397193,-102.220639,-101.206639,-54.744845,-59.688379,0.017,0.0,0.0
1,713,15.0,3.0,3.0,3.0,3.0,3.0,0.006491,0.004738,0.004738,...,7.030448,-3.500958,-1.322397,-1.030469,-1.382941,-1.407879,-1.876399,0.007,1.6267,0.2552
2,730,16.0,7.0,7.0,7.0,7.0,7.0,0.006176,0.00424,0.00424,...,13.201397,-0.016423,-0.03417,2.059833,2.988513,4.486335,5.05769,0.021,0.2262,0.0157
3,745,18.0,4.0,4.0,4.0,4.0,4.0,0.005318,0.004374,0.004374,...,25.822133,1.176322,3.652226,6.716857,12.514694,12.247387,8.760515,0.007,0.2813,1.1523
4,1124,18.0,3.0,3.0,3.0,3.0,4.0,0.006097,0.00405,0.00405,...,21.245772,0.82438,3.617169,7.842645,8.830427,8.463856,5.602845,0.024,0.2415,0.0176


In [48]:
#Rename the object Id column to enable a merge:
featuretable = featuretable.rename(columns={"index_meta":"object_id"})
featuretable['object_id'] = featuretable.object_id.astype('int32')

In [49]:
df = pd.merge(metadata,featuretable, on="object_id")
df.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,...,std_Y,weighted_average_u,weighted_average_g,weighted_average_r,weighted_average_i,weighted_average_z,weighted_average_Y,mwebv_meta,z_meta,zerr_meta
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,...,292.182295,-17.061118,-212.397193,-102.220639,-101.206639,-54.744845,-59.688379,0.017,0.0,0.0
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,...,7.030448,-3.500958,-1.322397,-1.030469,-1.382941,-1.407879,-1.876399,0.007,1.6267,0.2552
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,...,13.201397,-0.016423,-0.03417,2.059833,2.988513,4.486335,5.05769,0.021,0.2262,0.0157
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,...,25.822133,1.176322,3.652226,6.716857,12.514694,12.247387,8.760515,0.007,0.2813,1.1523
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,...,21.245772,0.82438,3.617169,7.842645,8.830427,8.463856,5.602845,0.024,0.2415,0.0176


In [50]:
#One KNN model will treat only extragalactic sources, and one will 
#treat only galactic sources, so I'll split them up here:
egalmask = df['hostgal_specz'] != 0
extragal = df[egalmask]
extragal.groupby('target').agg('count')

Unnamed: 0_level_0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,...,std_Y,weighted_average_u,weighted_average_g,weighted_average_r,weighted_average_i,weighted_average_z,weighted_average_Y,mwebv_meta,z_meta,zerr_meta
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15,495,495,495,495,495,495,495,495,495,495,...,495,495,495,495,495,495,495,495,495,495
42,1193,1193,1193,1193,1193,1193,1193,1193,1193,1193,...,1193,1193,1193,1193,1193,1193,1193,1193,1193,1193
52,183,183,183,183,183,183,183,183,183,183,...,183,183,183,183,183,183,183,183,183,183
62,484,484,484,484,484,484,484,484,484,484,...,484,484,484,484,484,484,484,484,484,484
64,102,102,102,102,102,102,102,102,102,102,...,102,102,102,102,102,102,102,102,102,102
67,208,208,208,208,208,208,208,208,208,208,...,208,208,208,208,208,208,208,208,208,208
88,370,370,370,370,370,370,370,370,370,370,...,370,370,370,370,370,370,370,370,370,370
90,2313,2313,2313,2313,2313,2313,2313,2313,2313,2313,...,2313,2313,2313,2313,2313,2313,2313,2313,2313,2313
95,175,175,175,175,175,175,175,175,175,175,...,175,175,175,175,175,175,175,175,175,175


In [51]:
gal = df[~egalmask]
gal.groupby('target').agg('count')

Unnamed: 0_level_0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,...,std_Y,weighted_average_u,weighted_average_g,weighted_average_r,weighted_average_i,weighted_average_z,weighted_average_Y,mwebv_meta,z_meta,zerr_meta
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,151,151,151,151,151,151,151,151,151,0,...,151,151,151,151,151,151,151,151,151,151
16,924,924,924,924,924,924,924,924,924,0,...,924,924,924,924,924,924,924,924,924,924
53,30,30,30,30,30,30,30,30,30,0,...,30,30,30,30,30,30,30,30,30,30
65,981,981,981,981,981,981,981,981,981,0,...,981,981,981,981,981,981,981,981,981,981
92,239,239,239,239,239,239,239,239,239,0,...,239,239,239,239,239,239,239,239,239,239


In [68]:
#Before creating train/test splits, we need to remove or impute NaN values
#in galactic rows, we need to remove distmod, which is not calculated
gal = gal.drop('distmod', axis=1)
nans = pd.DataFrame(gal.isnull().sum(axis=0), columns=["count"])
nans[nans['count']>0]

Unnamed: 0,count
avg_double_to_single_step_u,202
avg_double_to_single_step_g,117
avg_double_to_single_step_r,115
avg_double_to_single_step_i,148
avg_double_to_single_step_z,1
avg_double_to_single_step_Y,148
fold2P_slope_10percentile_u,127
fold2P_slope_10percentile_g,105
fold2P_slope_10percentile_r,3
fold2P_slope_10percentile_i,9


In [69]:
#For now, I'll impute these with a mean
gal = gal.fillna(gal.mean())

In [70]:
#similar for extragalactic issues:
nans = pd.DataFrame(extragal.isnull().sum(axis=0), columns=["count"])
nans[nans['count']>0]

Unnamed: 0,count
avg_double_to_single_step_u,714
avg_double_to_single_step_g,324
avg_double_to_single_step_r,323
avg_double_to_single_step_i,399
avg_double_to_single_step_z,6
avg_double_to_single_step_Y,399
fold2P_slope_10percentile_u,269
fold2P_slope_10percentile_g,148
fold2P_slope_10percentile_r,11
fold2P_slope_10percentile_i,13


In [71]:
extragal = extragal.fillna(extragal.mean())

In [72]:
#Split X,Y
galX = gal.drop('target', axis=1)
galY = gal['target']
egalX = extragal.drop('target', axis=1)
egalY = extragal['target']

In [73]:
#Create Train/Test splits
galX_train, galX_test, galY_train, galY_test = train_test_split(
    galX, galY, test_size=0.5)

egalX_train, egalX_test, egalY_train, egalY_test = train_test_split(
    egalX, egalY, test_size=0.5)

In [74]:
#First, galactic objects
galknn = KNeighborsClassifier(n_neighbors=5)
galknn.fit(galX_train, galY_train)

galknn.score(galX_test, galY_test)

0.43250214961306965

In [76]:
#not great, let's try extragal
egalknn = KNeighborsClassifier(n_neighbors=5)
egalknn.fit(egalX_train, egalY_train)

egalknn.score(egalX_test, egalY_test)

0.3113685734974656