In [11]:
import numpy as np
from collections import Counter, OrderedDict
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE

# Create Galactic KNN object

In [12]:
#Load the pre-computed training feature table from disk
df = pd.read_csv(filepath_or_buffer='trainingfeats.csv')

In [13]:
#This model only handles galactic objects.
#Per the code book those are the rows with hostgal_specz == 0, so build the
#"extragalactic" mask first and keep everything that is NOT flagged by it.
egalmask = df['hostgal_specz'].ne(0)
gal = df.loc[~egalmask]

In [14]:
#Our classes are imbalanced, so oversample every class except the majority with SMOTE.
#A fixed random_state keeps the synthetic samples (and the model fit on them) reproducible
#across kernel restarts.
sm = SMOTE(sampling_strategy='not majority', random_state=42)
#Drop cols I don't want to generate with SMOTE.
#Things like position in the sky or object_id are meaningless to simulate with SMOTE
Y = gal['target']
#remove 'target' from this list if using test data, remove distmod if extragalactic (maybe?)
X = gal.drop(columns=['object_id', 'target', 'ra', 'decl', 'gal_l', 'gal_b', 'ddf', 'distmod'])
#Create extra samples
X_new, Y_new = sm.fit_resample(X, Y)

In [15]:
#Rebuild a labelled frame from the resampled features: same column names as X,
#with the resampled labels attached as 'target'
df = pd.DataFrame(X_new, columns=X.columns).assign(target=Y_new)

In [16]:
#Feature subset picked by lasso for the galactic model (grouped by passband)
X2 = df.loc[:, [
    'std_u', 'min_u',
    'mean_g', 'std_g', 'max_g', 'min_g',
    'mean_r', 'median_r', 'max_r', 'min_r',
    'mean_i', 'median_i', 'std_i', 'max_i', 'min_i',
    'mean_z', 'median_z', 'std_z', 'max_z', 'min_z',
    'mean_Y', 'median_Y', 'std_Y', 'max_Y', 'min_Y',
]]

In [17]:
#Fit the galactic KNN on every resampled row (no hold-out split in this cell).
#The fit call is the last expression so the fitted estimator is displayed.
galknn = KNeighborsClassifier(weights='distance', n_neighbors=2)
galknn.fit(X=X2, y=df['target'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=2, p=2,
           weights='distance')

In [18]:
#Load the training feature table again; df currently holds the resampled
#galactic frame, so the raw features have to come from disk
df = pd.read_csv(filepath_or_buffer='trainingfeats.csv')

In [19]:
#Extragalactic objects are the rows whose hostgal_specz is not 0
egalmask = df['hostgal_specz'].ne(0)
egal = df.loc[egalmask]

In [20]:
#Our classes are imbalanced, so oversample every class except the majority with SMOTE.
#A fixed random_state keeps the synthetic samples (and the model fit on them) reproducible
#across kernel restarts.
sm = SMOTE(sampling_strategy='not majority', random_state=42)
#Drop cols I don't want to generate with SMOTE.
#Things like position in the sky or object_id are meaningless to simulate with SMOTE
Y = egal['target']
#remove 'target' from this list if using test data
X = egal.drop(columns=['object_id', 'target', 'ra', 'decl', 'gal_l', 'gal_b', 'ddf', 'distmod'])
#Create extra samples
X_new, Y_new = sm.fit_resample(X, Y)

In [21]:
#Rebuild a labelled frame from the resampled features: same column names as X,
#with the resampled labels attached as 'target'
df = pd.DataFrame(X_new, columns=X.columns).assign(target=Y_new)

In [22]:
#Feature subset picked by lasso for the extragalactic model (grouped by passband)
X2 = df.loc[:, [
    'mean_u', 'max_u', 'min_u',
    'mean_g', 'median_g', 'std_g', 'min_g',
    'mean_r', 'median_r', 'std_r', 'max_r', 'min_r',
    'mean_i', 'median_i', 'std_i', 'max_i', 'min_i',
    'mean_z', 'median_z', 'std_z', 'max_z', 'min_z',
    'mean_Y', 'median_Y', 'std_Y', 'max_Y', 'min_Y',
]]

In [23]:
#Fit the extragalactic KNN on every resampled row (no hold-out split in this cell).
#The fit call is the last expression so the fitted estimator is displayed.
egalknn = KNeighborsClassifier(weights='distance', n_neighbors=2)
egalknn.fit(X=X2, y=df['target'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=2, p=2,
           weights='distance')

# Read test file in chunks, predict, and output.

In [24]:
#Header names for the prediction output: one 'class_<id>' column per class id,
#with the catch-all class 99 last
outputcols = ['class_' + str(classid) for classid in
              (6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95, 99)]

import time

In [17]:
#Start the output file with only the header row; prediction rows are appended below.
#Every appended row starts with the object_id, so label that column in the header too.
output = pd.DataFrame(columns=outputcols)
output.to_csv('output.csv', index_label='object_id')

#This variable is the 'unknown' threshold. If no probabilities are over this threshold,
#the object will get assigned to the unknown class, 99.
galthresh99 = .6
egalthresh99 = .51
#if an object is getting assigned to class 99, this is the confidence level to give class 99.
#all other classes will be scaled proportionately down from their original values so the row sums to 1 (100%)
confidence99 = .75

#Feature columns each model was fit on (the lasso-selected subsets)
galfeats = ['std_u', 'min_u', 'mean_g', 'std_g', 'max_g', 'min_g', 'mean_r',
            'median_r', 'max_r', 'min_r', 'mean_i', 'median_i', 'std_i', 'max_i',
            'min_i', 'mean_z', 'median_z', 'std_z', 'max_z', 'min_z', 'mean_Y',
            'median_Y', 'std_Y', 'max_Y', 'min_Y']
egalfeats = ['mean_u', 'max_u', 'min_u', 'mean_g', 'median_g', 'std_g', 'min_g',
             'mean_r', 'median_r', 'std_r', 'max_r', 'min_r', 'mean_i', 'median_i',
             'std_i', 'max_i', 'min_i', 'mean_z', 'median_z', 'std_z', 'max_z',
             'min_z', 'mean_Y', 'median_Y', 'std_Y', 'max_Y', 'min_Y']

#Numeric class ids in header order ('class_42' -> 42) so every written row lines up
#with the header columns.
classorder = [int(name.split('_')[1]) for name in outputcols]


def _predict_with_unknown(model, feats, thresh99):
    """Predict class probabilities for `feats` and add the class-99 column.

    Parameters
    ----------
    model : fitted classifier exposing `predict_proba` and `classes_`
    feats : DataFrame holding the feature columns the model was fit on
    thresh99 : float; rows whose highest probability is below this are flagged

    Returns
    -------
    DataFrame indexed like `feats`, one column per entry of `model.classes_`
    plus a column 99, or None when `feats` has no rows. For flagged rows
    column 99 is `confidence99` and the class columns are multiplied by
    (1 - confidence99); for all other rows column 99 is 0 and the class
    columns are unchanged.
    """
    if feats.empty:
        #nothing to score for this chunk; calling predict_proba on zero rows would fail
        return None
    proba = pd.DataFrame(model.predict_proba(feats),
                         columns=model.classes_, index=feats.index)
    unknown = np.where(proba.max(axis=1) < thresh99, confidence99, 0.0)
    #Scale only the class columns; the class-99 value itself must stay untouched
    #so that each row still sums to 1.
    scaled = proba.mul(1.0 - unknown, axis=0)
    scaled[99] = unknown
    return scaled


start = time.time()
#NOTE(review): this scores trainingfeats.csv; presumably the test-set feature file
#should be read here for real predictions -- confirm the file name.
with open('output.csv', 'a') as f:
    for chunk in pd.read_csv('trainingfeats.csv', chunksize=8000):
        #Split
        egalmask = chunk['hostgal_specz'] != 0
        #Make Predictions (each model only ever sees its own kind of object)
        galout = _predict_with_unknown(galknn, chunk.loc[~egalmask, galfeats], galthresh99)
        egalout = _predict_with_unknown(egalknn, chunk.loc[egalmask, egalfeats], egalthresh99)

        #Common columns: classes a model does not know about get probability 0
        parts = [part for part in (galout, egalout) if part is not None]
        results = (pd.concat(parts, sort=True)
                   .reindex(columns=classorder)
                   .fillna(0.0)
                   .sort_index())
        #Look the ids up by row label so each prediction keeps its own object_id
        results.index = chunk.loc[results.index, 'object_id']
        results.to_csv(f, header=False)
end = time.time()
print(end-start)

Unnamed: 0_level_0,6,15,16,42,52,53,62,64,65,67,88,90,92,95,99
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
730,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1632,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1920,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


0.986361026763916


In [2]:
#Peek at the raw test light curves: only the first 15000 rows are loaded
test = pd.read_csv('test_set.csv', nrows=15000)
test.head(5)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,13,59798.3205,2,-1.299735,1.357315,0
1,13,59798.3281,1,-2.095392,1.148654,0
2,13,59798.3357,3,-0.923794,1.763655,0
3,13,59798.3466,4,-4.009815,2.602911,0
4,13,59798.3576,5,-3.403503,5.367328,0


In [3]:
#Per-object flux summary (mean / std / max) restricted to passband 0
(
    test.loc[test['passband'] == 0]
        .groupby(['object_id', 'passband'])['flux']
        .agg(['mean', 'std', 'max'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,max
object_id,passband,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13,0,0.402818,2.060904,5.81627
14,0,0.315396,2.372227,6.371965
17,0,0.095728,2.770631,6.056491
23,0,0.010791,2.062264,3.919783
34,0,1.032765,4.103085,19.174976
35,0,-0.170596,2.028422,6.212695
43,0,-0.179491,1.93096,5.048397
50,0,-0.272357,2.014025,5.090174
60,0,-0.231007,2.322048,5.824199
69,0,0.526555,2.087396,4.949674


In [10]:
#Compare the two std conventions on a single light curve (object 13, passband 0)
test13 = test.loc[(test['object_id'] == 13) & (test['passband'] == 0)]
#pandas default: sample standard deviation (ddof=1)
print(test13['flux'].std(ddof=1))
#numpy default: population standard deviation (ddof=0)
print(test13['flux'].values.std())


2.0609039179995206
2.046542043250537
