In [36]:
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [37]:
import os
import random
import torch

def set_seed(seed: int):
    """
    Seed every random number generator touched by this notebook with one value.

    Covers Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices). cuDNN is additionally switched to deterministic
    mode with benchmarking disabled, and the seed is stored in the
    ``PYTHONHASHSEED`` environment variable.

    :param seed: Integer seed shared by all generators.
    """
    # Interpreter-level sources of randomness.
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it
    # here is picked up by child processes rather than by the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)

    # NumPy's legacy global generator.
    np.random.seed(seed)

    # PyTorch generators, then the cuDNN switches that trade speed for
    # repeatable kernel selection.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


def save_outputs(outputs: np.ndarray, file_name: str):
    """
    Write model outputs to ``<file_name>.csv`` as a two-column table.

    The file starts with the header line ``Id,Prediction``. Every following row
    pairs a 1-based running index with the matching entry of ``outputs``, both
    formatted as integers.

    :param outputs: Model outputs, np.ndarray with shape (10'000,) and values in {-1, 1}
    :param file_name: Destination path without extension; ``.csv`` is appended here.
    """
    # Ids start at 1, not 0.
    row_ids = 1 + np.arange(outputs.shape[0])
    table = np.column_stack((row_ids, outputs))

    # comments='' stops NumPy from prefixing the header line with '# '.
    np.savetxt(
        f'{file_name}.csv',
        table,
        fmt='%d',
        delimiter=',',
        header="Id,Prediction",
        comments='',
    )

In [38]:
# Column indices of the ensemble members whose outputs are kept when loading.
models = list(range(6))

# Final predictions per combination strategy, keyed by strategy name.
results = dict()

# Cut-off applied to a combined score to decide between the two classes.
THRESHOLD = 0.5

# Seed all random number generators before any stochastic step runs.
seed = 42
set_seed(seed)

In [39]:
# Labels for the evaluation set.
eval_labels = np.load('drive/MyDrive/ensemble-6/eval_labels.npy')

# Per-model outputs; only the columns listed in `models` are kept.
eval_outputs = np.load('drive/MyDrive/ensemble-6/eval_outputs.npy')[:, models]
test_outputs = np.load('drive/MyDrive/ensemble-6/test_outputs.npy')[:, models]

# Displayed as the cell result: shapes of the three arrays, in loading order.
tuple(loaded.shape for loaded in (eval_labels, eval_outputs, test_outputs))

((125000,), (125000, 6), (10000, 6))

In [40]:
# Split the labelled ensemble outputs in half: one half to fit the combiners,
# the other half to measure their accuracy.
trainX, testX, trainY, testY = train_test_split(
    eval_outputs,
    eval_labels,
    test_size=0.5,
    random_state=seed,
)
print(*(part.shape for part in (trainX, testX)))

(62500, 6) (62500, 6)


In [41]:
# Baseline combiner: unweighted mean of the member outputs, cut at THRESHOLD.
# Hold-out predictions use 1/0 because they are compared against testY.
predictions = np.where(testX.mean(axis=1) >= THRESHOLD, 1, 0)
print(f"averaging accuracy: {accuracy_score(predictions, testY):.3%}")

# Predictions for the unlabeled test outputs use the 1/-1 encoding instead.
results["avg"] = np.where(test_outputs.mean(axis=1) >= THRESHOLD, 1, -1)

averaging accuracy: 92.206%


In [42]:
names = ["linear", "logistic", "ridge"]
# Deliberately NOT called `models`: that name already holds the column indices
# used to slice the ensemble outputs when the data is loaded. Rebinding it to
# estimator objects would break re-running the loading cell afterwards.
estimators = [LinearRegression(), LogisticRegression(), Ridge()]


def positive_score(model, features):
    """
    Return a per-row score for the positive class that is comparable to THRESHOLD.

    Classifiers exposing ``predict_proba`` yield the probability of their second
    class; ``predict`` on such models returns hard labels, for which a threshold
    comparison would be meaningless. Regressors fall back to ``predict``.

    :param model: A fitted scikit-learn estimator.
    :param features: Array of ensemble member outputs, one row per sample.
    :return: 1-D array with one score per row of ``features``.
    """
    if hasattr(model, "predict_proba"):
        # Column 1 is the second entry of `classes_`.
        # Assumes binary labels where the positive class sorts last (e.g. {0, 1}) - TODO confirm.
        return model.predict_proba(features)[:, 1]
    return model.predict(features)


for name, model in zip(names, estimators):
    model.fit(trainX, trainY)
    print(f"\n[{name}]\nModel Importance Weights:\n{model.coef_}")

    # Hold-out accuracy; predictions use 1/0 because they are compared against testY.
    predictions = np.where(positive_score(model, testX) >= THRESHOLD, 1, 0)
    print(f"Accuracy: {accuracy_score(testY, predictions):.3%}")

    # Test-set predictions use the 1/-1 encoding, matching results["avg"].
    results[name] = np.where(positive_score(model, test_outputs) >= THRESHOLD, 1, -1)

    # How many test predictions differ from the plain-average baseline.
    diff = np.sum(results["avg"] != results[name])
    print(f"Predictions Changed: {diff}")


[linear]
Model Importance Weights:
[0.51401335 0.08768523 0.05689725 0.21484447 0.03155425 0.0706476 ]
Accuracy: 92.403%
Predictions Changed: 178

[logistic]
Model Importance Weights:
[[3.71535659 0.61027907 0.22358713 1.33214774 0.40976643 0.52610315]]
Accuracy: 92.422%
Predictions Changed: 186

[ridge]
Model Importance Weights:
[0.5131298  0.08755165 0.05727666 0.21464202 0.03014191 0.07285156]
Accuracy: 92.406%
Predictions Changed: 178


In [43]:
# Write one CSV per entry in `names`, each named after its combiner.
# results["avg"] is not exported because "avg" is not listed in `names`.
for name in names:
    save_outputs(outputs=results[name], file_name=name)