# Using Logistic Regression on Toy Data to Get a High AMS

In [455]:
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn import linear_model as linMod

We generate well-separable toy data in the form of 4-dimensional vectors.

Data shall have the form of $[w,y,x_1,x_2]$ where

$w$ is a weight in the interval $[0,1)$  
$y$ is the label "0" for "background" or "1" for "signal"       
$x_n$ are randomly generated features with respect to the label

In [772]:
def generateFeature(label, mu_s, mu_b, sigma_s=5, sigma_b=5):
    """Draw one feature value from the normal distribution matching ``label``.

    Parameters
    ----------
    label : int or float
        Class label. A value equal to 1 selects the signal distribution,
        any other value selects the background distribution.
    mu_s, mu_b : float
        Means of the signal and background distributions.
    sigma_s, sigma_b : float, optional
        Standard deviations of the signal and background distributions.

    Returns
    -------
    float
        A single sample drawn with ``np.random.normal``.
    """
    # Compare by value: ``label is 1`` tests object identity, which is False
    # for 1.0 or numpy scalars and raises a SyntaxWarning on Python >= 3.8.
    if label == 1:
        mu, sigma = mu_s, sigma_s
    else:
        mu, sigma = mu_b, sigma_b
    return np.random.normal(mu, sigma)

Approximate Median Significance (AMS) defined as:

$$AMS = \sqrt{2 \left( (s + b + b_{reg}) \log\left[1 + \frac{s}{b + b_{reg}}\right] - s \right)}$$
        
where $b_{reg} = 10$ is a regularization term (set by the contest),  
$b = \sum_{i:\, y_i = 0} w_i$ is the sum of the weights of background events (incorrectly classified as signal),  
$s = \sum_{i:\, y_i = 1} w_i$ is the sum of the weights of signal events (correctly classified as signal),  
both sums run over the events classified as signal, and $\log$ is the natural logarithm.

In [773]:
def calcAMS(s,b):
    """Compute the Approximate Median Significance (AMS).

    AMS = sqrt(2 * ((s + b + br) * ln(1 + s / (b + br)) - s)) with the
    regularization term br fixed to 10.

    Parameters
    ----------
    s : float
        Sum of the weights of selected signal events.
    b : float
        Sum of the weights of selected background events.

    Returns
    -------
    float
        The AMS value.

    Raises
    ------
    ValueError
        If the radicand is negative, so no real square root exists.
    """
    br = 10.0
    radicand = 2 * ((s + b + br) * math.log(1.0 + s / (b + br)) - s)
    if radicand < 0:
        # Raise instead of calling exit(): exit() would terminate the whole
        # kernel/process and cannot be handled sensibly by the caller.
        raise ValueError(
            'radicand is negative (s={!r}, b={!r}, radicand={!r})'.format(s, b, radicand)
        )
    return math.sqrt(radicand)

In [774]:
def calcWeightSums(weights,preds,labels):
    """Sum the weights of all events that were predicted as signal.

    Only entries with ``pred > 0`` contribute. Among those, entries whose
    label is greater than 0 add to the signal sum, all others add to the
    background sum.

    Returns
    -------
    tuple
        ``(s, b)``: the signal weight sum and the background weight sum.
    """
    signal_sum = 0
    background_sum = 0
    for idx, pred in enumerate(preds):
        label, weight = labels[idx], weights[idx]
        if not pred > 0.:
            continue
        if label > 0.:
            signal_sum += weight
        else:
            background_sum += weight
    return signal_sum, background_sum

actually generate data

In [778]:
n = 100000  # number of toy-data vectors, each of the form [w, y, x_1, x_2]
s_prob = 0.05  # probability for signal-label

# Seed the global RNG so that "Restart & Run All" reproduces the same toy data.
np.random.seed(42)

weights = np.random.rand(n)  # random values will be used as weights for evaluation later
labels = np.zeros(n)
x_1 = np.zeros(n)
x_2 = np.zeros(n)

for i in range(n):
    # NOTE(review): the label is derived from the weight itself, so every
    # signal event ends up with a weight <= s_prob -- confirm this coupling
    # between weights and labels is intended.
    label = 1 if weights[i] <= s_prob else 0  # stays a plain Python int
    labels[i] = label
    x_1[i] = generateFeature(label, mu_s=5, mu_b=20)
    x_2[i] = generateFeature(label, mu_s=5, mu_b=25)

visualize

In [779]:
# Scatter plot of the two features, coloured by label.
# "none" is the documented way to draw markers without an edge; an empty
# string is not a valid colour specification.
plt.scatter(x_1, x_2, edgecolors="none", c=labels, alpha=0.5)
plt.xlabel("x_1")
plt.ylabel("x_2")
plt.show()

In [780]:
def splitList(xList,n):
    """Split ``xList`` at index ``n``.

    Returns a pair ``(head, tail)`` where ``head`` holds the first ``n``
    entries (``xList[:n]``) and ``tail`` holds the remainder (``xList[n:]``).
    """
    return xList[:n], xList[n:]

Split the toy data into a training set and a test set for the classifier.

In [781]:
# The first tenth of the samples is used for training, the rest for testing.
n_train = n // 10

(train_x_1, test_x_1), (train_x_2, test_x_2), (train_labels, test_labels) = (
    splitList(column, n_train) for column in (x_1, x_2, labels)
)

# Only the test part of the weights is kept.
test_weights = weights[n_train:]

For comparison, we calculate the best possible AMS (the case where every signal event is selected and no background event is selected).

In [782]:
def calcMaxAMS(weights,labels):
    """Print and return the AMS of a perfect classifier.

    The true labels are passed as the predictions, so exactly the signal
    events are selected and no background event is.
    """
    best_ams = calcAMS(*calcWeightSums(weights, labels, labels))
    print("Maximum AMS possible with this Data:", best_ams)
    return best_ams

In [783]:
# Upper bound on the AMS for the test set.
calcMaxAMS(weights=test_weights, labels=test_labels)

Maximum AMS possible with this Data: 19.816158976625637


19.816158976625637

we initialize the Logistic Regression Classifier, shape the input-data and fit the model

In [784]:
# Shape the inputs: one row per sample, one column per feature.
train_x = np.array([train_x_1, train_x_2]).T
test_x = np.array([test_x_1, test_x_2]).T
train_labels = np.array(train_labels).T
test_labels = np.array(test_labels).T

# Create the classifier with C=1e5 and fit it on the training set.
logReg = linMod.LogisticRegression(C=1e5).fit(train_x, train_labels)

# Convert the fitted coefficient matrix to a sparse representation.
logReg.sparsify()

# Evaluate on the test set: class probabilities, hard predictions, mean accuracy.
predProb = logReg.predict_proba(test_x)
pred = logReg.predict(test_x)
score = logReg.score(test_x, test_labels)

print("Score:", score)

(10000, 2)
Score: 0.997644444444


In [785]:
# Weighted signal/background sums over the test events predicted as signal,
# then the AMS achieved by the classifier.
s, b = calcWeightSums(weights=test_weights, preds=pred, labels=test_labels)
calcAMS(s, b)

12.529617976701001