In [None]:
# IMPORT DATA #
import numpy as np

# Both reads hit the same file with the same parsing options; only the selected
# columns and the dtype differ between them.
csv_path = "atlas-higgs-challenge-2014-v2.csv"
read_options = {"delimiter": ",", "skiprows": 1}  # skiprows=1 drops the header line

# features: the EventID (column 0) followed by the 30 feature columns
features = np.loadtxt(csv_path, usecols=range(0, 31), **read_options)

# labels: the EventID, a weight, and a label (columns 0, 31 and 32), read as strings
# the weights are used for the unnormalized true positive and false positive rates in the AMS calculation
labels = np.loadtxt(csv_path, usecols=(0, 31, 32), dtype=str, **read_options)


In [None]:
# check out what the data looks like: the first row of each array, then both shapes
print(features[0], labels[0], labels.shape, features.shape, sep="\n")

In [None]:
# PREPROCESSING #
from sklearn.preprocessing import StandardScaler

# Normalize feature values
# scale all but first column (EventID)
scaler = StandardScaler().fit(features[:, 1:])
temp_features = scaler.transform(features[:, 1:])
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so statistics of the held-out rows influence the scaling. Consider fitting it
# on the training rows only.

# link EventIDs with scaled features
# Build a new array rather than writing `scaled_features = features`: that
# assignment only creates a second name for the same array, so filling in the
# scaled values would silently overwrite the raw `features` data as well.
# column_stack also replaces the per-row Python loop with a single array operation.
scaled_features = np.column_stack((features[:, 0], temp_features))

# check out how it looks now
print(scaled_features[1])
print(scaled_features.shape)

In [None]:
from sklearn.model_selection import train_test_split

# split to test and train sets (test_size=0.2 holds out 20% of the rows)
# A fixed seed makes the shuffle reproducible: without it every run produces a
# different split, and therefore different solution/submission files and a
# different AMS score.
SPLIT_SEED = 42
features_train, features_test, labels_train, labels_test = train_test_split(
    scaled_features, labels, test_size=0.2, random_state=SPLIT_SEED
)

In [None]:
# check the split data: first training row of each array, then the test-set shapes
print(features_train[0], labels_train[0], features_test.shape, labels_test.shape, sep="\n")

In [None]:
# create solution submission for AMS metric
import csv

# the provided AMS calculation requires a csv file with each sample's: EventID, Class, Weight
headings = ['EventId', 'Class', 'Weight']
with open('solution.csv', 'w', newline='') as solution_file:
    writer = csv.DictWriter(solution_file, fieldnames=headings)
    writer.writeheader()
    # every labels_test row holds (EventID, weight, label), in that column order
    writer.writerows(
        {"EventId": event_id, "Class": label, "Weight": weight}
        for event_id, weight, label in labels_test
    )

In [None]:
# change labels from string to binary
def encode_label_column(label_rows):
    """Rewrite the last column of ``label_rows`` in place: 'b' -> '0', 's' -> '1'.

    Only the two known class letters are replaced, so running this cell a second
    time leaves already-encoded values untouched. (An "everything that is not
    'b' becomes 1" rule would turn every '0' into '1' on a re-run.)

    The label arrays are string arrays, so the encoded classes are stored as
    the text '0' / '1'.

    Parameters
    ----------
    label_rows : 2-D array of strings whose last column is the class label.
    """
    label_column = label_rows[:, -1]  # basic slice -> view, writes land in label_rows
    is_background = label_column == 'b'
    is_signal = label_column == 's'
    label_column[is_background] = '0'
    label_column[is_signal] = '1'


encode_label_column(labels_train)
encode_label_column(labels_test)

In [None]:
# check the data again: first test row, then first training row
print(labels_test[0], labels_train[0], sep="\n")

In [None]:
# LOGISTIC REGRESSION #
from sklearn.linear_model import LogisticRegression

# column 0 of the feature arrays is the EventID, so it is left out of the model
# inputs; the class is the last column of the label arrays
train_inputs = features_train[:, 1:]
train_targets = labels_train[:, -1]
test_inputs = features_test[:, 1:]

model = LogisticRegression(solver='newton-cholesky')
model.fit(train_inputs, train_targets)
predictions = model.predict(test_inputs)

In [None]:
# shape of the array of predictions returned for the test features
predictions.shape

In [None]:
# change labels back to char, 's' or 'b'
# The training labels live in a string array, so the model's predictions come
# back as the text '0' / '1'. Casting to str before comparing keeps the mapping
# correct if the labels are ever stored as integers instead; a bare comparison
# against '0' would then never match and every event would be marked 's'.
is_background = predictions.astype(str) == '0'
labels_pred = np.where(is_background, 'b', 's').tolist()

# show a short preview and the class counts rather than dumping every prediction
print(labels_pred[:10])
print({"b": labels_pred.count('b'), "s": labels_pred.count('s')})

In [None]:
# create prediction submission for AMS metric calculation
with open('submission.csv', 'w', newline='') as submission_file:
    headings = ['EventId', 'RankOrder', 'Class']
    writer = csv.DictWriter(submission_file, fieldnames=headings)
    writer.writeheader()
    # RankOrder is numbered from 1: the challenge's submission format expects the
    # ranks to be the integers 1..N, so a 0-based counter would be off by one.
    # NOTE(review): the rank is just the row position, not a confidence ordering.
    for rank, (event_id, predicted_class) in enumerate(zip(labels_test[:, 0], labels_pred), start=1):
        writer.writerow({"EventId": event_id, "RankOrder": rank, "Class": predicted_class})

In [None]:
# calculate AMS
import HiggsBosonCompetition_AMSMetric_rev1 as ams

# score the predictions file against the ground-truth file written by the earlier cells
solution_path, submission_path = "solution.csv", "submission.csv"
ams.AMS_metric(solution_path, submission_path)