In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
import os
import random
import torch

def set_seed(seed: int):
    """
    Seed every random number generator this notebook relies on so results can be reproduced.

    Seeds Python's built-in ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), puts cuDNN into deterministic mode with benchmarking
    disabled, and exports ``PYTHONHASHSEED``.

    :param seed: Integer seed shared by all generators.
    """
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # presumably only influences processes spawned afterwards -- confirm if hash
    # ordering matters for this notebook.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Standard-library and NumPy global generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU first, then every CUDA device.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Deterministic cuDNN kernels; benchmarking (autotuning) is switched off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


def save_outputs(outputs: np.ndarray, file_name: str):
    """
    Write model predictions to ``<file_name>.csv`` as a two-column table.

    The file starts with the header ``Id,Prediction``; ids are 1-based row numbers and
    every value is formatted as an integer.

    :param outputs: Model outputs, np.ndarray with shape (10'000,) and values in {-1, 1}
    :param file_name: Target path without the ``.csv`` extension (it is appended here).
    """
    # 1-based ids, one per prediction row.
    row_ids = 1 + np.arange(outputs.shape[0])
    table = np.column_stack((row_ids, outputs))

    # comments='' stops NumPy from prefixing the header line with '# '.
    np.savetxt(
        f'{file_name}.csv',
        table,
        fmt='%d',
        delimiter=',',
        header="Id,Prediction",
        comments='',
    )

In [None]:
# Fix all RNG seeds up front so the train/validation split below is repeatable.
seed = 42
set_seed(seed)

# Column indices of the base models whose outputs take part in the ensemble.
models = [0, 1, 2, 3, 4, 5, 6]

# Cut-off applied to (combined) model outputs to decide the positive class.
THRESHOLD = 0.5

# Test-set predictions per ensembling method, keyed by method name.
results = {}

In [None]:
# Single place to change where the saved arrays live.
DATA_DIR = 'drive/MyDrive/ensemble-6'

# Labels for the evaluation set, plus the outputs stored for the evaluation and test
# sets, restricted to the columns selected in `models`.
eval_labels = np.load(f'{DATA_DIR}/eval_labels.npy')
eval_outputs = np.load(f'{DATA_DIR}/eval_outputs.npy')[:, models]
test_outputs = np.load(f'{DATA_DIR}/test_outputs.npy')[:, models]

# Fail early if the files do not line up instead of training on misaligned data.
assert eval_outputs.shape[0] == eval_labels.shape[0], "eval outputs/labels row mismatch"
assert eval_outputs.shape[1] == test_outputs.shape[1], "eval/test column mismatch"

eval_labels.shape, eval_outputs.shape, test_outputs.shape

((125000,), (125000, 7), (10000, 7))

In [None]:
# Hold out 20% of the evaluation set to score the ensembling strategies;
# the remaining 80% is used to fit the meta-learners.
trainX, testX, trainY, testY = train_test_split(
    eval_outputs,
    eval_labels,
    random_state=seed,
    test_size=0.2,
)
print(trainX.shape, testX.shape)

(100000, 7) (25000, 7)


In [None]:
# Baseline ensemble: unweighted mean of the selected model outputs, thresholded.
# Held-out predictions are encoded as 0/1 for scoring against testY.
predictions = np.where(np.mean(testX, axis=1) >= THRESHOLD, 1, 0)
# accuracy_score expects (y_true, y_pred).
print(f"averaging accuracy: {accuracy_score(testY, predictions):.3%}")

# Test-set predictions use the -1/1 encoding that save_outputs documents.
# Assigned once so `results` never holds the intermediate float means.
results["avg"] = np.where(np.mean(test_outputs, axis=1) >= THRESHOLD, 1, -1)

averaging accuracy: 92.308%


In [None]:
# Meta-learners fitted on top of the base-model outputs.
# Kept in their own mapping rather than rebinding `models`: that name holds the
# column indices used when loading the data, and overwriting it would break a
# re-run of the loading cell.
estimators = {
    "linear": LinearRegression(),
    "logistic": LogisticRegression(),
    "ridge": Ridge(),
}
names = list(estimators)

for name, model in estimators.items():
    model.fit(trainX, trainY)
    print(f"\n[{name}]\nModel Importance Weights:\n{model.coef_}")

    # NOTE(review): LogisticRegression.predict returns class labels rather than
    # scores, so THRESHOLD presumably only re-maps those labels for that model.
    predictions = model.predict(testX)
    predictions = np.where(predictions >= THRESHOLD, 1, 0)

    # accuracy_score expects (y_true, y_pred).
    print(f"Accuracy: {accuracy_score(testY, predictions):.3%}")

    # Test-set predictions in -1/1 encoding, compared with the averaging baseline.
    results[name] = np.where(model.predict(test_outputs) >= THRESHOLD, 1, -1)
    diff = np.sum(results["avg"] != results[name])
    print(f"Predictions Changed: {diff}")


[linear]
Model Importance Weights:
[0.47138804 0.06074461 0.07651936 0.2139681  0.0399704  0.05053887
 0.06261042]
Accuracy: 92.432%
Predictions Changed: 153

[logistic]
Model Importance Weights:
[[3.03838988 0.42381916 0.35648013 1.32799774 0.39109272 0.3015574
  0.99036067]]
Accuracy: 92.444%
Predictions Changed: 153

[ridge]
Model Importance Weights:
[0.46820664 0.06034589 0.07678206 0.2136854  0.04296731 0.04759704
 0.06627076]
Accuracy: 92.420%
Predictions Changed: 152


In [None]:
# Write one CSV per meta-learner, named after the method ("avg" is not in `names`).
for name in names:
    save_outputs(outputs=results[name], file_name=name)