In [9]:
!pip install scikit-learn



In [10]:
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [11]:
import os
import random
import torch

def set_seed(seed: int):
    """
    Seed every random number generator used in this notebook.

    Covers Python's `random` module, NumPy's global generator and PyTorch
    (`torch.manual_seed` plus `torch.cuda.manual_seed_all`). cuDNN is put into
    deterministic mode with benchmarking switched off, and the seed is stored
    in the PYTHONHASHSEED environment variable.

    :param seed: Integer used as the seed for all generators.
    """
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # this likely only influences processes launched afterwards -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python and NumPy global generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Trade cuDNN autotuning for run-to-run determinism.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


def save_outputs(outputs: np.ndarray, file_name: str):
    """
    Write model outputs to `<file_name>.csv` as integer-formatted columns.

    The file starts with the header row `Id,Prediction`; every following row
    holds a 1-based row id followed by the matching entry of `outputs`.

    :param outputs: Model outputs, np.ndarray with shape (10'000,) and values in {-1, 1}
    :param file_name: Destination path without the `.csv` suffix (it is appended here).
    """
    n_rows = outputs.shape[0]
    # Ids are 1-based: the first prediction gets Id 1.
    ids = np.arange(1, n_rows + 1)
    table = np.column_stack([ids, outputs])

    np.savetxt(
        f'{file_name}.csv',
        table,
        fmt='%d',
        delimiter=',',
        header="Id,Prediction",
        comments='',  # no comment prefix, so the header is written verbatim
    )

In [12]:
# Fix all random number generators before anything else runs.
seed = 42
set_seed(seed)

# Column indices selected from the saved output arrays when they are loaded.
models = [0, 1, 3, 5, 6]
# Cut-off applied to combined scores when turning them into class predictions.
THRESHOLD = 0.5
# Final test-set predictions, keyed by the name of the combination method.
results = dict()

In [13]:
# Labels for the evaluation rows.
eval_labels = np.load('drive/MyDrive/eval_labels.npy')

# Saved outputs for the evaluation and test rows, restricted to the columns
# listed in `models`.
eval_outputs, test_outputs = (
    np.load(path)[:, models]
    for path in ('drive/MyDrive/eval_outputs.npy', 'drive/MyDrive/test_outputs.npy')
)

print(*(arr.shape for arr in (eval_labels, eval_outputs, test_outputs)))

(125000,) (125000, 5) (10000, 5)


In [14]:
# Split the labelled rows in half: one half to fit the combiners, the other to
# score them. `random_state=seed` keeps the split reproducible.
trainX, testX, trainY, testY = train_test_split(
    eval_outputs,
    eval_labels,
    test_size=0.5,
    random_state=seed,
)
print(trainX.shape, testX.shape)

(62500, 5) (62500, 5)


In [15]:
# Baseline combiner: unweighted mean over the selected columns, thresholded
# into 0/1 so it can be compared against the held-out labels.
predictions = np.where(np.mean(testX, axis=1) >= THRESHOLD, 1, 0)
# accuracy_score expects (y_true, y_pred); keep that order so the call stays
# correct if the metric is ever swapped for an asymmetric one.
print(f"averaging accuracy: {accuracy_score(testY, predictions):.3%}")

# Test-set predictions use the -1/1 encoding that save_outputs documents.
# Assigned in a single step so results["avg"] never holds the raw means.
results["avg"] = np.where(np.mean(test_outputs, axis=1) >= THRESHOLD, 1, -1)

averaging accuracy: 92.091%


In [16]:
names = ["linear", "logistic", "ridge"]
# Deliberately NOT called `models`: that name already holds the column indices
# used by the data-loading cell, and rebinding it here would break that cell
# on any later re-run.
estimators = [LinearRegression(), LogisticRegression(), Ridge()]

for name, model in zip(names, estimators):
    # Learn per-column weights on the training half of the labelled data.
    model.fit(trainX, trainY)
    print(f"\n[{name}]\nModel Importance Weights:\n{model.coef_}")

    # NOTE(review): LogisticRegression.predict returns class labels rather
    # than scores, so THRESHOLD only matters for the two regressors; this
    # presumably relies on the labels being in {0, 1} -- confirm.
    predictions = model.predict(testX)
    predictions = np.where(predictions >= THRESHOLD, 1, 0)

    # accuracy_score expects (y_true, y_pred).
    print(f"Accuracy: {accuracy_score(testY, predictions):.3%}")

    # Test-set predictions in the -1/1 encoding, assigned in a single step.
    results[name] = np.where(model.predict(test_outputs) >= THRESHOLD, 1, -1)
    # Number of test rows where this combiner disagrees with plain averaging.
    diff = np.sum(results["avg"] != results[name])
    print(f"Predictions Changed: {diff}")


[linear]
Model Importance Weights:
[0.17102246 0.15504503 0.3241998  0.11704742 0.18421954]
Accuracy: 92.126%
Predictions Changed: 75

[logistic]
Model Importance Weights:
[[1.11279969 0.95518546 1.98450304 0.99651275 1.36162118]]
Accuracy: 92.123%
Predictions Changed: 41

[ridge]
Model Importance Weights:
[0.17083856 0.15519723 0.3237853  0.11549127 0.18620732]
Accuracy: 92.125%
Predictions Changed: 74


In [17]:
# Write one `<name>.csv` per entry in `names`. The "avg" baseline is stored in
# `results` but is not part of `names`, so no file is written for it.
for model_name in names:
    save_outputs(outputs=results[model_name], file_name=model_name)