In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
import toolbox as tb
import kaggleData as kD

In [None]:
#toydata shall have n vectors with 5 dimensions
n = 100000
#probability for signal-label
s_prob = 0.05
dim = 4
data = tb.createToyData(n,dim,s_prob)
weights = data[:,0]
labels = data[:,1]
x_1 = data[:,2]
x_2 = data[:,3]

%pylab inline
plt.scatter(x_1, x_2, edgecolor="", c=labels, alpha=0.5)

For comparison, we calculate the best possible AMS  
(i.e. the case in which every signal event is detected correctly).

In [None]:
# Best achievable AMS on the full toy sample, for reference.
# The trailing ';' keeps Jupyter from echoing the call's return value.
tb.calcMaxAMS(weights,labels);

Prepare (i.e. normalize) the data

In [None]:
# Scale each feature column by its largest absolute value.
# A new array is built instead of using `/=` on the slice: if `data` is a
# NumPy array, `data[:,2:4]` is a view, and in-place division would silently
# rescale `data` (and the `x_1` / `x_2` views taken from it) as well.
X = data[:,2:4] / np.max(np.abs(data[:,2:4]),axis=0)

In [None]:
# A tenth of the n toy vectors is passed to tb.splitList as the split point.
n_train = int(n/10)

# Each result is unpacked as a (train part, test part) pair.
train_X,test_X = tb.splitList(X,n_train)
train_labels,test_labels = tb.splitList(labels,n_train)
# Only the second (test) part of the weights is kept.
test_weights = tb.splitList(weights,n_train)[1]

In [None]:
# Best achievable AMS restricted to the held-out toy events.
tb.calcMaxAMS(test_weights,test_labels);

Classify with kNN

In [None]:
# 5-nearest-neighbour classifier fitted on the toy training split.
# n_jobs=4 asks scikit-learn to use up to 4 parallel jobs for neighbour searches.
neigh = neighbors.KNeighborsClassifier(n_neighbors=5,n_jobs=4)
neigh.fit(train_X,train_labels)

In [None]:
# Hard class labels for the held-out toy events.
pred = neigh.predict(test_X)

In [None]:
%pylab inline
plt.scatter(test_X[:,0], test_X[:,1], edgecolor="", c=pred, alpha=0.5)

In [None]:
# Weight sums computed from the predictions and the true labels, unpacked as
# (s, b) and passed to the AMS calculation.
# presumably s / b are the selected signal / background weights - TODO confirm
s,b = tb.calcWeightSums(test_weights,pred,test_labels)
print("AMS:",tb.calcAMS(s,b))

In [2]:
def kNN(train_data,train_labels,test_data,k=5,softPred = True,returnClassifier = False):
    """Fit a distance-weighted k-nearest-neighbour classifier and predict.

    Parameters
    ----------
    train_data, train_labels
        Training samples and their class labels, passed unchanged to
        ``KNeighborsClassifier.fit``.
    test_data
        Samples to classify.
    k
        Number of neighbours used for each query.
    softPred
        If truthy, return ``predict_proba`` output (per-class probabilities);
        otherwise return ``predict`` output (hard class labels).
    returnClassifier
        If truthy, additionally return the fitted classifier.

    Returns
    -------
    ``pred``, or the tuple ``(pred, classifier)`` when ``returnClassifier``
    is truthy.
    """
    neigh = neighbors.KNeighborsClassifier(n_neighbors=k,weights='distance',n_jobs=4)
    neigh.fit(train_data,train_labels)
    # Truthiness instead of `is True`: flags such as 1 or numpy.bool_(True)
    # would otherwise silently fall through to hard predictions.
    if softPred:
        pred = neigh.predict_proba(test_data)
    else:
        pred = neigh.predict(test_data)
    # The flag used to be accepted but ignored; the default (False) keeps the
    # original single-value return.
    if returnClassifier:
        return pred, neigh
    return pred

Let's use the Kaggle data:

In [3]:
# Load with kSet="b". The returned tuple is unpacked as the column header,
# the b-set (data, weights, labels) and the training (data, weights, labels).
(header,
 b_data,
 b_weights,
 b_labels,
 train_data,
 train_weights,
 train_labels) = kD.getWholeDataSet(kSet="b")

In [4]:
# Load with kSet="v" and keep only elements 1-3 of the returned tuple,
# unpacked here as the v-set data, weights and labels.
(v_data,
 v_weights,
 v_labels) = kD.getWholeDataSet(kSet="v")[1:4]

In [5]:
# The full test set is the v rows followed by the b rows. Data, weights and
# labels are stacked in the same (v, b) order so they stay row-aligned.
test_data = np.vstack([v_data,b_data])
test_weights = np.hstack([v_weights,b_weights])
test_labels = np.hstack([v_labels,b_labels])

In [6]:
# Extract the "EventId" feature for every set.
# Note the orders: arguments are (..., b, v, test, train) while the result is
# unpacked as (b, v, train, test).
(b_eventList,
 v_eventList,
 train_eventList,
 test_eventList) = kD.getBVFeatureSets("EventId",header,b_data,v_data,test_data,train_data)

In [7]:
# Convert the training labels to a NumPy array and transpose it.
# NOTE(review): train_labels is rebound from itself, so re-running this cell
# transposes again if the array is 2-D - confirm the expected shape.
train_labels = np.array(train_labels).transpose()

features with good properties

In [8]:
def getBVDataSets(featList,header,b_data,v_data,test_data,train_data):
    """Assemble one feature matrix per data set from the named features.

    Every name in ``featList`` is resolved with ``kD.getBVFeatureSets`` and
    the four columns it yields are written into column ``j`` of the matching
    output matrix, where ``j`` is the name's position in ``featList``.

    Returns ``(b_X, v_X, train_X, test_X)``: float arrays with one row per
    entry of the corresponding input set and ``len(featList)`` columns.
    """
    n_feat = len(featList)
    b_X, v_X, test_X, train_X = (
        np.zeros((len(rows), n_feat))
        for rows in (b_data, v_data, test_data, train_data)
    )

    for j, feat in enumerate(featList):
        # The helper receives (..., test_data, train_data) but its result is
        # unpacked in the order (b, v, train, test).
        b_col, v_col, train_col, test_col = kD.getBVFeatureSets(
            feat, header, b_data, v_data, test_data, train_data
        )
        b_X[:, j] = b_col
        v_X[:, j] = v_col
        train_X[:, j] = train_col
        test_X[:, j] = test_col

    return b_X, v_X, train_X, test_X

In [15]:
# Use every column except the first as features.
# presumably column 0 holds the EventId ("all but events") - TODO confirm against header
train_X = train_data[:,1:]
test_X = test_data[:,1:]
b_X = b_data[:,1:]
v_X = v_data[:,1:]

In [16]:
# Soft kNN predictions (softPred defaults to True) with k=20 for the b set,
# the v set and the combined test set.
# Each kNN call builds and fits a new classifier on the same training data.
pred_b = kNN(train_X,train_labels,b_X,k=20)
pred_v = kNN(train_X,train_labels,v_X,k=20)
pred = kNN(train_X,train_labels,test_X,k=20)

In [17]:
# Turn the column-1 probabilities into decisions using a 0.8 threshold.
# presumably column 1 is the signal class - TODO confirm the label encoding
true_pred_b = tb.customThreshold(pred_b[:,1],0.8)
true_pred_v = tb.customThreshold(pred_v[:,1],0.8)

In [19]:
# AMS of the thresholded predictions: the b set is reported as "public",
# the v set as "private". s and b are reused for the second evaluation.
s,b = tb.calcWeightSums(b_weights,true_pred_b,b_labels)
print("public AMS:",tb.calcAMS(s,b))
s,b = tb.calcWeightSums(v_weights,true_pred_v,v_labels)
print("private AMS:",tb.calcAMS(s,b))

public AMS: 2.7124556254914878
private AMS: 2.750770249685567


In [20]:
# Write the solution file for the combined test set (column-1 probabilities,
# threshold 0.8).
# Raw string: the Windows path contains backslash pairs (\B, \D, \S, \s) that
# are invalid escape sequences in a normal literal; the resulting path is the
# same, without relying on Python leaving unknown escapes untouched.
# NOTE(review): absolute, machine-specific output path.
tb.createSolutionFile(test_eventList,pred[:,1],0.8,r"F:\BA_git\Data\Solutions\solution_kNN_all.csv")

In [10]:
# Hand-picked subset of seven features.
featList = ["DER_mass_MMC",
            "DER_mass_transverse_met_lep",
            "DER_mass_vis",
            "DER_met_phi_centrality",
            "DER_pt_ratio_lep_tau",
            "PRI_tau_pt",
            "DER_pt_h"]
# Rebuild the per-set feature matrices from that subset; this rebinds
# b_X, v_X, train_X and test_X.
b_X,v_X,train_X,test_X = getBVDataSets(featList,header,b_data,v_data,test_data,train_data)

In [None]:
#public score
tb.calcMaxAMS(b_weights,b_labels);

In [None]:
#private score
tb.calcMaxAMS(v_weights,v_labels);

In [None]:
# 20-nearest-neighbour classifier fitted once on the current train_X, then
# used for hard label predictions on the b, v and combined test sets.
neigh = neighbors.KNeighborsClassifier(n_neighbors=20,n_jobs=4)
neigh.fit(train_X,train_labels)
pred_b = neigh.predict(b_X)
pred_v = neigh.predict(v_X)
pred_test = neigh.predict(test_X)

In [None]:
# AMS of the hard predictions on the b set (reported as "public").
s,b = tb.calcWeightSums(b_weights,pred_b,b_labels)
print("public AMS:",tb.calcAMS(s,b))

In [None]:
# AMS of the hard predictions on the v set (reported as "private").
s,b = tb.calcWeightSums(v_weights,pred_v,v_labels)
print("private AMS:",tb.calcAMS(s,b))

In [9]:
def getBestN(n_range,train_X,test_X,test_weights,test_labels,train_labels,threshold=0.8):
    """Scan neighbour counts and report the one with the highest AMS.

    For every ``n`` in ``n_range`` a distance-weighted kNN classifier is
    fitted on the training data, the column-1 probabilities of the test
    predictions are thresholded, and the resulting AMS is printed.

    Parameters
    ----------
    n_range
        Iterable of neighbour counts to try.
    train_X, train_labels
        Training samples and labels.
    test_X, test_weights, test_labels
        Evaluation samples with their weights and true labels.
    threshold
        Cut passed to ``tb.customThreshold``. Defaults to 0.8, the value
        that was previously hard-coded.

    Returns
    -------
    ``(maxAMS, bestN)``. Both stay 0 when ``n_range`` is empty or no AMS
    exceeds 0.
    """
    maxAMS = 0
    bestN = 0
    # One estimator is reused; only n_neighbors changes between iterations.
    neigh = neighbors.KNeighborsClassifier(weights='distance',n_jobs=4)
    for n in n_range:
        neigh.set_params(n_neighbors=n)
        neigh.fit(train_X,train_labels)
        predProb = neigh.predict_proba(test_X)
        pred = tb.customThreshold(predProb[:,1],threshold)
        s,b = tb.calcWeightSums(test_weights,pred,test_labels)
        ams = tb.calcAMS(s,b)
        print("n=", n, "| AMS:",ams)
        # Strict '>' keeps the first n on ties and skips NaN scores.
        if ams > maxAMS:
            maxAMS = ams
            bestN = n
    return maxAMS,bestN

In [None]:
# Load only three mass features: kSet="v" is bound to the new_test_* names and
# kSet="t" to the new_train_* names.
# new_header is assigned by both calls; the second assignment wins.
featList = ['DER_mass_MMC','DER_mass_transverse_met_lep','DER_mass_vis']
(new_header,new_test_data,new_test_weights,new_test_labels) = kD.getCustomDataSet(featList,kSet = "v")
(new_header,new_train_data,new_train_weights,new_train_labels) = kD.getCustomDataSet(featList,kSet = "t")

In [None]:
# Compare 100 and 200 neighbours on the three-feature data and report the
# one with the higher AMS.
maxAMS, bestN = getBestN([100,200],new_train_data,new_test_data,new_test_weights,new_test_labels,new_train_labels)
print("Best AMS:", maxAMS, "with n =", bestN)

Private AMS = 2.310879936427514   => Rank 1389!

In [11]:
# Soft kNN predictions (softPred defaults to True) with k=200 for the b set,
# the v set and the combined test set.
# Each kNN call builds and fits a new classifier on the same training data.
pred_b = kNN(train_X,train_labels,b_X,k=200)
pred_v = kNN(train_X,train_labels,v_X,k=200)
pred = kNN(train_X,train_labels,test_X,k=200)

In [12]:
# Turn the column-1 probabilities into decisions using a 0.8 threshold.
# presumably column 1 is the signal class - TODO confirm the label encoding
true_pred_b = tb.customThreshold(pred_b[:,1],0.8)
true_pred_v = tb.customThreshold(pred_v[:,1],0.8)

In [13]:
# AMS of the thresholded predictions: (s1, b1) for the b set, reported as
# "public"; (s2, b2) for the v set, reported as "private".
s1,b1 = tb.calcWeightSums(b_weights,true_pred_b,b_labels)
print("public AMS:",tb.calcAMS(s1,b1))
s2,b2 = tb.calcWeightSums(v_weights,true_pred_v,v_labels)
print("private AMS:",tb.calcAMS(s2,b2))

public AMS: 3.1096896180800586
private AMS: 3.1689810059694437


Optimized AMS ~3.168 with threshold 0.8 => rank 999

In [None]:
# Seven-feature subset used for the normalization experiment below.
featList = ["DER_mass_MMC",
            "DER_mass_transverse_met_lep",
            "DER_mass_vis",
            "DER_met_phi_centrality",
            "DER_pt_ratio_lep_tau",
            "PRI_tau_pt",
            "DER_pt_h"]
# Rebuild the per-set feature matrices from that subset; this rebinds
# b_X, v_X, train_X and test_X.
b_X,v_X,train_X,test_X = getBVDataSets(featList,header,b_data,v_data,test_data,train_data)

In [None]:
# Per-feature scale: mean absolute value of each training column.
# The same training-derived factors are applied to every set. kNN compares
# raw distances, so dividing each set by its own statistics would put the
# training points and the query points on different scales.
feat_scale = np.mean(np.abs(train_X),axis=0)

# Division creates new arrays, so the unnormalized matrices stay untouched.
norm_train_X = train_X / feat_scale
norm_b_X = b_X / feat_scale
norm_v_X = v_X / feat_scale
norm_test_X = test_X / feat_scale

In [None]:
# Soft kNN predictions (softPred defaults to True) with k=200 on the
# normalized features, for the b set, the v set and the combined test set.
# Each kNN call builds and fits a new classifier on the same training data.
pred_b = kNN(norm_train_X,train_labels,norm_b_X,k=200)
pred_v = kNN(norm_train_X,train_labels,norm_v_X,k=200)
pred = kNN(norm_train_X,train_labels,norm_test_X,k=200)

In [None]:
# Turn the column-1 probabilities into decisions using a 0.8 threshold.
# presumably column 1 is the signal class - TODO confirm the label encoding
true_pred_b = tb.customThreshold(pred_b[:,1],0.8)
true_pred_v = tb.customThreshold(pred_v[:,1],0.8)

In [None]:
# AMS of the thresholded predictions on normalized features: (s1, b1) for the
# b set, reported as "public"; (s2, b2) for the v set, reported as "private".
s1,b1 = tb.calcWeightSums(b_weights,true_pred_b,b_labels)
print("public AMS:",tb.calcAMS(s1,b1))
s2,b2 = tb.calcWeightSums(v_weights,true_pred_v,v_labels)
print("private AMS:",tb.calcAMS(s2,b2))

In [None]:
# Scan neighbour counts 1..99 on the normalized features, scoring on the v set.
# NOTE(review): the v set is also the one reported as "private AMS" elsewhere
# in this notebook, so the chosen count is tuned to that evaluation set.
maxAMS, bestN = getBestN(np.arange(1,100),norm_train_X,norm_v_X,v_weights,v_labels,train_labels)
print("Best AMS:", maxAMS, "with n =", bestN)

In [None]:
# Inspect the mean absolute value of feature column 1 of train_X.
np.mean(np.abs(train_X[:,1]),axis=0)

In [None]:
# List of jet-related feature names plus DER_lep_eta_centrality.
# NOTE(review): the name suggests these columns are problematic in some way;
# the list is not used in the visible cells - confirm intent.
errorList = ['DER_deltaeta_jet_jet',
             'DER_mass_jet_jet',
             'DER_prodeta_jet_jet',
             'DER_lep_eta_centrality',
             'PRI_jet_leading_pt',
             'PRI_jet_leading_eta',
             'PRI_jet_leading_phi',
             'PRI_jet_subleading_pt',
             'PRI_jet_subleading_eta',
             'PRI_jet_subleading_phi']