In [None]:
import csv
%matplotlib inline
import matplotlib.pylab as plt
import numpy as np
%load_ext autoreload
%autoreload 2
from tqdm.notebook import tqdm

from helpers import *
from implementations import *
from plots import *

# Loading Higgs Model data

In [None]:
from pathlib import Path
# The CSV files are looked up in a "data" directory under the current working directory.
DATA = Path().resolve() / "data"
print("Looking for the data in", DATA)
# load_csv_data comes from one of the star imports at the top of the notebook and
# returns a 3-tuple; judging by the names it is unpacked into, presumably
# (labels, features, ids) -- TODO confirm against its definition.
_,  tx_submission,  ids_submission  = load_csv_data(DATA / "test.csv")
y, tx, _ = load_csv_data(DATA / "train.csv")

# Split the data into training and testing
# NOTE(review): 0.8 is presumably the fraction kept for training -- confirm against split_data.
tx_train, y_train, tx_test, y_test = split_data(tx, y, 0.8)

# Visualizing the data

In [None]:
# Histograms of the raw training features (plot_feature_histograms comes from a star
# import; presumably it uses y_train to separate the two classes -- TODO confirm).
plot_feature_histograms(tx_train, y_train)

In [None]:
def feature_engineering(tx, mean=None, std=None, enable=True):
    """Append hand-crafted features to ``tx`` and normalise the result.

    The following columns are appended to ``tx``, in this order:

    1. four indicator columns, one per value (0, 1, 2, 3) of feature 22;
    2. the logarithm of features 2, 9, 10, 13, 16, 19 and 21;
    3. the sine, then the cosine, of the angular features 14, 15, 17, 18, 20;
    4. the 2nd, 3rd and 4th powers of a selection of columns;
    5. the product of every pair of columns of that same selection.

    The selection in steps 4 and 5 refers to columns 34-40, which are the
    log features of step 2 only when ``tx`` has exactly 30 columns.

    The expanded matrix is finally normalised as ``(tx - mean) / std``.

    Args:
        tx: 2-D array, one row per sample.
        mean: per-column offsets for the expanded matrix, or None to compute
            them from ``tx``. Must be given together with ``std``.
        std: per-column scales for the expanded matrix, or None to compute
            them from ``tx``. Zeros are treated as 1. Not modified.
        enable: when falsy, ``tx`` is returned untouched (no new feature, no
            normalisation).

    Returns:
        A tuple ``(tx, mean, std)``: the transformed matrix and the statistics
        that were applied, so they can be reused on another data set. When
        ``enable`` is falsy the three arguments are returned as they came in.

    Raises:
        AssertionError: if exactly one of ``mean`` and ``std`` is given.
    """
    if not enable:
        # Keep the same 3-tuple shape as the enabled path: every caller
        # unpacks (tx, mean, std) whatever the value of `enable`.
        return tx, mean, std

    # Feature 22 is an integer, on which many other features are based.
    # We add one indicator column for each of its 4 possible values
    # (the (n, 1) column broadcasts against the 4 candidate values).
    is_value_22 = tx[:, [22]] == [0, 1, 2, 3]
    tx = np.concatenate((tx, is_value_22), axis=1)

    # Some features look like the exponential of some variable, so
    # we add their logarithm as new features.
    tx_log = np.log(tx[:, [2, 9, 10, 13, 16, 19, 21]])
    tx = np.concatenate((tx, tx_log), axis=1)

    # Some features are angles, so we add their sine and cosine.
    angles = [14, 15, 17, 18, 20]
    tx = np.concatenate((tx, np.sin(tx[:, angles]), np.cos(tx[:, angles])), axis=1)

    # TODO: feature 29 is zero when f22 < 2 because it is the sum of the
    # undefined features. Should it be set to -999, or split in two features?
    # TODO: features 4, 6, 12, 23-28 are undefined (-999) when f22 < 2 and
    # are not treated specially here.

    # Powers and pairwise products are all computed from columns that already
    # exist at this point, so the new columns are collected and appended with
    # a single concatenate instead of copying the whole matrix at every step.
    powers = [2, 3, 4]
    features = [1, 3, 5, 7, 8, 11, 29,
                34, 35, 36, 37, 38, 39, 40]
    selected = tx[:, features]
    new_columns = [selected ** p for p in powers]
    new_columns += [
        tx[:, [f1]] * tx[:, [f2]]
        for f1 in features
        for f2 in features
        if f1 < f2
    ]
    tx = np.concatenate([tx, *new_columns], axis=1)

    # Normalisation

    assert (mean is None) == (std is None), f"{mean=} {std=}"
    if mean is None:
        # We don't normalise every feature: discrete and angular features, and
        # the one where 0 seems to be a special value (f_11), are left as is.
        dont_normalise = [11, 14, 15, 17, 18, 20, 22]

        mean = np.mean(tx, axis=0)
        std = np.std(tx, axis=0)
        mean[dont_normalise] = 0
        std[dont_normalise] = 1

    # A zero std means the feature is constant: divide by 1 instead.
    # np.where builds a new array, so a caller-supplied `std` is left intact.
    std = np.where(std == 0, 1, std)
    tx = (tx - mean) / std

    return tx, mean, std

In [None]:
# Same histograms after feature engineering; [0] picks the transformed matrix
# out of the (tx, mean, std) tuple returned by feature_engineering.
plot_feature_histograms(feature_engineering(tx_train)[0], y_train)

### Outliers

In [None]:
def remove_outliers(tx, y, threshold=50, just_plot=False, enable=True):
    """Drop the samples whose features are, in total, too far from the mean.

    Each sample gets a "weirdness" score: the sum over its features of the
    absolute value of ``normalize_features(tx)[0]`` (presumably the
    standardised features -- the helper is defined outside this notebook).
    Samples whose score is not strictly below ``threshold`` are removed.

    Args:
        tx: feature matrix, one row per sample.
        y: labels, indexed like the rows of ``tx``.
        threshold: upper bound (exclusive) on the score of the kept samples.
        just_plot: when true, only show the histogram of the scores and
            return None instead of filtering.
        enable: when falsy, return ``(tx, y)`` untouched.

    Returns:
        ``(tx, y)`` restricted to the kept samples, or None if ``just_plot``.
    """
    if not enable:
        return tx, y

    deviations = np.abs(normalize_features(tx)[0])
    weirdness = np.sum(deviations, axis=1)
    n_above = np.sum(weirdness > threshold)
    print(f'Number of samples above {threshold} deviations in total: {n_above}')

    if just_plot:
        plt.figure(figsize=(20, 10))
        plt.hist(weirdness, bins=100)
        plt.xlabel('sum of deviations from the mean')
        plt.ylabel('number of samples')
        plt.title('Histogram of weirdness')
        plt.show()
        return

    keep = weirdness < threshold
    return tx[keep], y[keep]

# With just_plot=True nothing is filtered: the call prints how many samples are
# above the default threshold and shows the histogram of the scores.
remove_outliers(tx_train, y_train, just_plot=True)


# Training one model

In [None]:
def predict_labels(weights, tx):
    """Classify each row of ``tx`` as -1 or 1 from the sign of its linear score.

    The score of a row is its dot product with ``weights``; a score of
    exactly zero is classified as -1.
    """
    scores = np.einsum('rf,f->r', tx, weights)
    is_negative = scores <= 0
    is_positive = scores > 0
    scores[is_negative] = -1
    scores[is_positive] = 1
    return scores

def accuracy(y, tx, predictor):
    """Return the fraction of samples for which ``predictor(tx)`` equals ``y``."""
    predictions = predictor(tx)
    n_correct = np.sum(predictions == y)
    return n_correct / len(y)

def show_accuracy(y, tx, y_test, tx_test, predictor):
    """Print the accuracy of ``predictor`` on the training set, then on the test set."""
    train_accuracy = accuracy(y, tx, predictor)
    print(f"Accuracy on training set: {train_accuracy:.4f}")
    test_accuracy = accuracy(y_test, tx_test, predictor)
    print(f"Accuracy on test set: {test_accuracy:.4f}")

In [None]:
# Toggles forwarded as `enable=` to the two preprocessing steps.
RM_OUTLIERS = True
ADD_FEATURES = True

# The filtered matrix gets its own name: rebinding `tx` here would replace the
# raw features loaded from train.csv by a filtered training subset, leaving
# `tx` and `y` inconsistent if the split cell is ever run again.
tx_filtered, y_clean = remove_outliers(tx_train, y_train, threshold=50, enable=RM_OUTLIERS)
tx_clean, mean, std = feature_engineering(tx_filtered, enable=ADD_FEATURES)
# The test split is transformed with the statistics computed on the training data.
tx_test_clean, _, _ = feature_engineering(tx_test, mean, std, enable=ADD_FEATURES)


In [None]:
# Hyper-parameters, forwarded as keyword arguments to mean_squared_error_sgd.
# NOTE(review): no random seed is set in this notebook before the initial
# weights are drawn, so results presumably change from run to run.
params = {
    'max_iters': 300,
    'gamma': 0.01,
    'lambda_': 0.001,
    'batch_size': 400,
    'initial_w': np.random.randn(tx_clean.shape[1]),
}
# Later cells use `weigths[-1]` as the trained weights, so with
# return_history=True this presumably holds the history of the weights
# (and `losses` the history of the loss) -- TODO confirm against the helper.
# The misspelt name `weigths` is referenced by later cells.
weigths, losses = mean_squared_error_sgd(y_clean, tx_clean, **params, return_history=True)

In [None]:
plt.figure(figsize=(3, 3))
# Skip the first 10000 recorded losses and smooth the rest with a 1000-sample moving average.
# NOTE(review): this assumes `losses` holds well over 11000 entries -- confirm
# how many values mean_squared_error_sgd records per iteration.
smooth_loss = np.convolve(losses[10000:], np.ones(1000) / 1000, mode='valid')
# The x axis is rescaled so that the smoothed curve spans 0..max_iters.
plt.plot(np.linspace(0, params['max_iters'], len(smooth_loss)), smooth_loss)
# Accuracy of the last weights of the history on the training and test data.
show_accuracy(y_clean, tx_clean, y_test, tx_test_clean, lambda tx: predict_labels(weigths[-1], tx))

# Training multiple models
One model is trained for each value (0 to 3) of feature 22.

In [None]:
# Accuracy of the single model on the training data, broken down by value of feature 22
predict_with_single_model = lambda tx: predict_labels(weigths[-1], tx)
for f_22 in range(4):
    mask = tx_clean[:, 22] == f_22
    acc = accuracy(y_clean[mask], tx_clean[mask], predict_with_single_model)
    print(f"Accuracy for f_22 = {f_22}: {acc:.4f} ({np.sum(mask)} samples)")

In [None]:
# One model per value of feature 22, each trained on its own subset of the
# training data and with its own normalisation statistics.
models = []
normalisation = []
# NOTE(review): this rebinds `losses`, which an earlier cell uses for the loss
# history of the single model; kept because the name may be read elsewhere.
losses = []
for f_22 in range(4):
    mask = tx_train[:, 22] == f_22

    # Per-group names, so that the data and statistics of the single model
    # (`tx_clean`, `y_clean`, `mean`, `std`) are not overwritten by the loop.
    # Outlier removal follows the same RM_OUTLIERS toggle as the single model.
    tx_group, y_group = remove_outliers(tx_train[mask], y_train[mask], threshold=50, enable=RM_OUTLIERS)
    tx_group, mean_group, std_group = feature_engineering(tx_group, enable=ADD_FEATURES)
    normalisation.append((mean_group, std_group))

    # Fresh initial weights for each model, without mutating the shared `params`.
    group_params = {**params, 'initial_w': np.random.randn(tx_group.shape[1])}
    model, loss = mean_squared_error_sgd(y_group, tx_group, **group_params)
    models.append(model)
    losses.append(loss)

In [None]:
def predict_labels_22(models, tx):
    """Predict labels with one weight vector per value of feature 22.

    ``models[k]`` is used for the rows of ``tx`` whose column 22 equals ``k``.
    Rows matching no model keep the initial prediction 0.
    """
    labels = np.zeros(tx.shape[0])
    group_of_row = tx[:, 22]
    for group, weights in enumerate(models):
        rows = group_of_row == group
        labels[rows] = predict_labels(weights, tx[rows])
    return labels

# Normalising the test set, but with the 4 different normalisations
def normalise_22(tx, normalisations):
    """Apply feature engineering with one set of statistics per value of feature 22.

    Args:
        tx: raw 2-D feature matrix with at least one row.
        normalisations: sequence of ``(mean, std)`` pairs; the pair at index
            ``k`` is applied to the rows whose feature 22 equals ``k``.

    Returns:
        The engineered matrix, rows in the same order as ``tx``. Rows whose
        feature 22 matches no entry of ``normalisations`` are left at zero.
    """
    # Engineer a single row only to learn the number of output columns; the
    # probe uses the same ADD_FEATURES setting as the real calls below.
    nb_features = feature_engineering(tx[[0], :], enable=ADD_FEATURES)[0].shape[1]
    result = np.zeros((tx.shape[0], nb_features))
    # Iterate over the argument, not over a notebook-level global.
    for f_22, (mean, std) in enumerate(normalisations):
        mask = tx[:, 22] == f_22
        result[mask], _, _ = feature_engineering(tx[mask], mean, std, enable=ADD_FEATURES)
    return result

# Engineer both splits with the statistics of the model matching each row's feature 22.
tx_test_22 = normalise_22(tx_test, normalisation)
tx_train_22 = normalise_22(tx_train, normalisation)
# The training accuracy is measured on the whole training split, i.e. including
# the samples that remove_outliers may have dropped before fitting.
show_accuracy(y_train, tx_train_22, y_test, tx_test_22, lambda tx: predict_labels_22(models, tx))

# Predicting the unknown labels

In [None]:
# Same per-group preprocessing as for the train/test splits, applied to the submission data.
tx_submission_2 = normalise_22(tx_submission, normalisation)
y_submission = predict_labels_22(models, tx_submission_2)
# Writes the predictions to "second-attempt.csv" (relative path, so presumably
# in the current working directory -- create_csv_submission is defined elsewhere).
create_csv_submission(ids_submission, y_submission, "second-attempt.csv")

# Comparison of models

In [None]:
# Test accuracy of the per-group models vs the single model,
# first for each value of feature 22, then over the whole test split.
split_predictor = lambda tx: predict_labels_22(models, tx)
single_predictor = lambda tx: predict_labels(weigths[-1], tx)

for f_22 in range(4):
    mask = tx_test[:, 22] == f_22
    acc_22 = accuracy(y_test[mask], tx_test_22[mask], split_predictor)
    acc = accuracy(y_test[mask], tx_test_clean[mask], single_predictor)
    print(f"Accuracy for f_22 = {f_22}: {acc_22:.4f} vs {acc:.4f} ({np.sum(mask)} samples)")

# total accuracy
acc_22 = accuracy(y_test, tx_test_22, split_predictor)
acc = accuracy(y_test, tx_test_clean, single_predictor)
print(f"Total accuracy: {acc_22:.4f} vs {acc:.4f}")