In [1]:
#pip install statements
!pip install torch torchvision torchaudio
!pip install datasets
!pip install sentencepiece
!pip install portalocker
!pip install pandas polars tabulate



In [2]:
#import statements
import torch

In [3]:
#set to cuda
if torch.cuda.is_available():
  device = torch.device("cuda")
  print("device set to cuda")
else:
  device = torch.device("cpu")
  print("device set to CPU")

device set to cuda


In [4]:
#load data
from datasets import load_dataset

###DATASET CHOICE: YAHOO ANSWERS DATASET (can cap at 70k examples)
yahoo_ds = load_dataset("community-datasets/yahoo_answers_topics")


#yahoo ds labels
print(yahoo_ds)

#train set, test set
train_ds = yahoo_ds['train']
test_ds = yahoo_ds['test']

MAX_TRAIN = 140000 #so we have 70k in train when split to validation
SEED = 555

train_ds = train_ds.shuffle(seed=SEED).select(range(MAX_TRAIN))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 1400000
    })
    test: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer'],
        num_rows: 60000
    })
})


In [5]:
from datasets.features.video import Example
#we want to clean the text
#all lower case
#remove remove any characters that aren't lowercase, numbers, whitespace.
import regex as re
import pandas as pd
import polars as pl
from typing import Any

def clean_text(yahoo_text: dict[str, Any]) -> dict[str, Any]:

  ##fancy upgraded version yay, have basic version listing everything out
  yahoo_parts = {'question_title': 'title_clean',
                 'question_content': 'content_clean',
                 'best_answer' : 'answer_clean'}

  for start_label, end_label in yahoo_parts.items():
    lower_case = str(yahoo_text.get(start_label, "")).lower()
    yahoo_text[end_label] = re.sub(r'[^a-z0-9\s]', '', lower_case)

  return yahoo_text

train_ds = train_ds.map(clean_text)
test_ds = test_ds.map(clean_text)


In [6]:
print(train_ds.column_names)


['id', 'topic', 'question_title', 'question_content', 'best_answer', 'title_clean', 'content_clean', 'answer_clean']


In [7]:
#save things
import os
import pickle
from typing import Any

def save_file(data_to_save: Any, dir_path: str, file_name:str) -> None:
  os.makedirs(dir_path, exist_ok = True)
  save_path = os.path.join(dir_path, file_name)

  with open(save_path, "w") as file:
    for row in data_to_save:
      cleaned_data = [str(value) for key, value in row.items() if key.endswith("_clean")]
      if cleaned_data:
        file.write(" ".join(cleaned_data) + "\n")

  print(f'saved item to {save_path}')

PREPROCESSED_FOLDER = 'preprocessed_data'

save_file(train_ds, PREPROCESSED_FOLDER, 'train_clean.txt')
#do we want this for test too?
save_file(test_ds, PREPROCESSED_FOLDER, 'test_clean.txt')

saved item to preprocessed_data/train_clean.txt
saved item to preprocessed_data/test_clean.txt


In [8]:
#token
#does vocab size matter?

VOCAB_SIZE = 8000

#bpe tokenization
import sentencepiece as spm

BPE_MODEL_FOLDER = 'bpe_model'
os.makedirs(BPE_MODEL_FOLDER, exist_ok = True)

model_description = os.path.join(BPE_MODEL_FOLDER, f"bpe_model_{VOCAB_SIZE}")
spm.SentencePieceTrainer.train(
    input = os.path.join(PREPROCESSED_FOLDER, 'train_clean.txt'),
    model_prefix=model_description,
    vocab_size = VOCAB_SIZE,
    model_type = 'bpe'
)

In [9]:
#covert cleaned text to BPE tokens
sp = spm.SentencePieceProcessor(model_file=model_description+'.model')

def tokenization(yahoo_text: dict[str, Any]) -> dict[str, Any]:
  yahoo_parts = {'title_clean': 'title',
                 'content_clean': 'content',
                 'answer_clean': 'answer'}

  for start, label_part in yahoo_parts.items():
    text = str(yahoo_text.get(start, ""))

    tokens_str = ' '.join(sp.encode(text, out_type=str))
    token_idx = sp.encode(text, out_type=int) #easier to do int than int(tok_idx) for tok_idx in idx_tokenized.split()

    yahoo_text[f"{label_part}_tokenized"] = tokens_str
    yahoo_text[f"{label_part}_indices"] = token_idx
    yahoo_text[f"{label_part}_sequence_length"] = len(token_idx)

  return yahoo_text

train_ds = train_ds.map(tokenization)
test_ds = test_ds.map(tokenization)

print(train_ds)

Dataset({
    features: ['id', 'topic', 'question_title', 'question_content', 'best_answer', 'title_clean', 'content_clean', 'answer_clean', 'title_tokenized', 'title_indices', 'title_sequence_length', 'content_tokenized', 'content_indices', 'content_sequence_length', 'answer_tokenized', 'answer_indices', 'answer_sequence_length'],
    num_rows: 140000
})


In [10]:
#train, validation
#stratified test train split? split by topic

split_train_ds = train_ds.train_test_split(test_size = 0.5, stratify_by_column="topic")

print(split_train_ds)

train_dataset = split_train_ds['train']
valid_dataset = split_train_ds['test']
test_dataset = test_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer', 'title_clean', 'content_clean', 'answer_clean', 'title_tokenized', 'title_indices', 'title_sequence_length', 'content_tokenized', 'content_indices', 'content_sequence_length', 'answer_tokenized', 'answer_indices', 'answer_sequence_length'],
        num_rows: 70000
    })
    test: Dataset({
        features: ['id', 'topic', 'question_title', 'question_content', 'best_answer', 'title_clean', 'content_clean', 'answer_clean', 'title_tokenized', 'title_indices', 'title_sequence_length', 'content_tokenized', 'content_indices', 'content_sequence_length', 'answer_tokenized', 'answer_indices', 'answer_sequence_length'],
        num_rows: 70000
    })
})


In [11]:
print(train_ds.column_names)

['id', 'topic', 'question_title', 'question_content', 'best_answer', 'title_clean', 'content_clean', 'answer_clean', 'title_tokenized', 'title_indices', 'title_sequence_length', 'content_tokenized', 'content_indices', 'content_sequence_length', 'answer_tokenized', 'answer_indices', 'answer_sequence_length']


In [12]:
#batches
from typing import Sequence

def generate_batch(batch: Sequence[dict]) -> tuple[torch.tensor, torch.tensor, torch.tensor]:
  batch_indices = []
  batch_labels = []
  offsets = [0]

  for item in batch:

    ##note i have {label_part}_indices, {label_part}_tokenized, and {label_part}_sequence_length
    #combine both content and answer
    combined_indices = (item['content_indices'] + item['answer_indices'])
    batch_indices.extend(combined_indices)
    batch_labels.append(int(item['topic']))
    offsets.append(len(batch_indices)) #offset so know where to start for next

    batch_indices_tensor = torch.tensor(batch_indices, dtype = torch.long)
    batch_labels_tensor = torch.tensor(batch_labels, dtype=torch.long)
    offsets_tensor = torch.tensor(offsets[:-1], dtype = torch.long)

  return batch_indices_tensor, offsets_tensor, batch_labels_tensor


####NEED TO REMEMBER FOR LATER THAT COMBINED CONTENT INDICES AND ANSWER INDICES, SO COMBINE THESE?


In [13]:
#for SLP
BATCH_SIZE = 512
EMBED_DIM = 128
NUM_CLASSES = 10 #0-9 topics in dataset

SEED = 555
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [14]:
#slp class
import torch.nn as nn

class SLP(nn.Module):
  def __init__(self, vocab_size: int, embed_dim: int, num_class: int) -> None:
    super().__init__()
    self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse = False)
    self.fc = nn.Linear(embed_dim, num_class)
    #weights
    self.init_weights()

  def init_weights(self) -> None:
    #random unif sampled initialization value
    initrange = 0.5
    with torch.no_grad():
      self.embedding.weight.uniform_(-initrange, initrange)
      self.fc.weight.data.uniform_(-initrange, initrange)
      self.fc.bias.data.zero_()

  def forward(self, text: torch.tensor, offsets: torch.tensor) -> torch.tensor:
    embedded = self.embedding(text, offsets)
    return self.fc(embedded)

In [15]:
from torch.utils.data import DataLoader
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from typing import Optional

def train_func(train_loader: DataLoader, model: nn.Module, criterion: nn.Module, optimizer: Optimizer, scheduler: Optional[_LRScheduler]) -> tuple[float, float]: ###AD TYPE HINTS
  train_loss = 0
  train_acc = 0
  total_size = 0

  for i, (text, offsets, class_label) in enumerate(train_loader):
    if i % 200 == 0:
      print(f"batch {i}/{len(train_loader)}")
    total_size += len(class_label)
    optimizer.zero_grad() #clear grads
    text, offsets, class_label = text.to(device), offsets.to(device), class_label.to(device) #switching device

    #forward
    output = model(text, offsets)

    #loss
    loss = criterion(output, class_label)
    train_loss += (loss.item() * len(class_label))

    #backward
    loss.backward()

    #update weights
    optimizer.step()
    train_acc += (output.argmax(1) == class_label).sum().item()

  #learning rate
  if scheduler is not None:
    scheduler.step()
  return train_loss/total_size, train_acc / total_size

In [16]:
def valid_func(valid_loader: DataLoader, model: nn.Module, criterion: nn.Module) -> tuple[float, float]:
  loss = 0
  acc = 0
  total_size = 0
  for text, offsets, class_label in valid_loader:
    total_size += len(class_label)
    text, offsets, class_label = text.to(device), offsets.to(device), class_label.to(device)

    with torch.no_grad(): #no gradient update
      output = model(text, offsets)
      batch_loss = criterion(output, class_label)
      loss += batch_loss.item()
      acc += (output.argmax(1) == class_label).sum().item()
  return loss/total_size, acc/total_size

In [17]:
def test_func(test_loader: DataLoader, model: nn.Module) -> tuple[float, list[int], list[int]]:
  acc = 0
  total_size = 0
  prediction = []
  ground_truth = []

  for text, offsets, class_label in test_loader:
    total_size += len(class_label)
    text, offsets, class_label = text.to(device), offsets.to(device), class_label.to(device)

    with torch.no_grad():
      output = model(text, offsets)
      acc += (output.argmax(1) == class_label).sum().item()

      prediction.extend(output.argmax(1).tolist())
      ground_truth.extend(class_label.tolist())

  return acc/total_size, prediction, ground_truth

In [18]:
#now train model
import time

def train_model(model: nn.Module, model_name: str) -> tuple[list[float], list[float], list[float], float, list[int], list[int]]:
  print(model_name)
  N_EPOCHS = 25 ##can change this perhaps
  loss = []
  acc = []
  val_acc = []
  test_acc_per_epoch = []

  min_valid_loss = float('inf')

  criterion = torch.nn.CrossEntropyLoss().to(device)
  optimizer = torch.optim.Adam(model.parameters(), lr = 3e-3) #could edit learningrate or optimizer
  scheduler = None #for now with adam

  train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True, collate_fn = generate_batch, pin_memory=True, num_workers = 0)
  valid_loader = DataLoader(valid_dataset, batch_size = BATCH_SIZE, shuffle = False, collate_fn=generate_batch, pin_memory=True, num_workers = 0)
  test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle = False, collate_fn = generate_batch)

  for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_func(train_loader, model, criterion, optimizer, scheduler)
    valid_loss, valid_acc = valid_func(valid_loader, model, criterion)

    if epoch % 1 == 0:
      #print only sometimes, not every time
      print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
      print(f'\t Val. Acc: {valid_acc*100:.2f}%')

    loss.append(train_loss)
    acc.append(train_acc)
    val_acc.append(valid_acc)

    #move inside per epoch so report correctly
    test_acc, prediction, ground_truth = test_func(test_loader, model)
    test_acc_per_epoch.append(test_acc)

  return loss, acc, val_acc, test_acc_per_epoch, prediction, ground_truth


In [19]:
import numpy as np
import matplotlib.pyplot as plt

model_SLP = SLP(VOCAB_SIZE, EMBED_DIM, NUM_CLASSES).to(device)
loss_lvl1, acc_lvl1, val_acc_lvl1, test_acc_lvl1, prediction_lvl1, ground_truth_lvl1 = train_model(model_SLP, "Single Layer Perceptron")

Single Layer Perceptron
batch 0/137
	Train Loss: 1.963 | Train Acc: 42.09%
	 Val. Acc: 53.47%
batch 0/137
	Train Loss: 1.443 | Train Acc: 58.79%
	 Val. Acc: 58.40%
batch 0/137
	Train Loss: 1.229 | Train Acc: 63.65%
	 Val. Acc: 59.78%
batch 0/137
	Train Loss: 1.116 | Train Acc: 66.37%
	 Val. Acc: 60.10%
batch 0/137
	Train Loss: 1.040 | Train Acc: 68.23%
	 Val. Acc: 59.83%
batch 0/137
	Train Loss: 0.982 | Train Acc: 69.64%
	 Val. Acc: 59.62%
batch 0/137
	Train Loss: 0.937 | Train Acc: 70.90%
	 Val. Acc: 59.45%
batch 0/137
	Train Loss: 0.901 | Train Acc: 71.77%
	 Val. Acc: 59.12%
batch 0/137
	Train Loss: 0.869 | Train Acc: 72.68%
	 Val. Acc: 58.66%
batch 0/137
	Train Loss: 0.843 | Train Acc: 73.56%
	 Val. Acc: 58.31%
batch 0/137
	Train Loss: 0.820 | Train Acc: 74.26%
	 Val. Acc: 58.01%
batch 0/137
	Train Loss: 0.800 | Train Acc: 74.74%
	 Val. Acc: 57.70%
batch 0/137
	Train Loss: 0.782 | Train Acc: 75.27%
	 Val. Acc: 57.22%
batch 0/137
	Train Loss: 0.766 | Train Acc: 75.81%
	 Val. Acc: 56.

In [24]:
import pickle

results = {
    "slp_loss": loss_lvl1,
    "slp_train_acc": acc_lvl1,
    "slp_val_acc": val_acc_lvl1,
    "slp_test_acc": test_acc_lvl1,
    "slp_predictions": prediction_lvl1,
    "slp_ground_truth": ground_truth_lvl1
}

with open("slp_results.pkl", "wb") as f:
    pickle.dump(results, f)

print('saved results')

saved results


In [25]:
from google.colab import drive
drive.mount('/content/drive')

SAVE_PATH = "/content/drive/MyDrive/slp_results.pkl"

with open(SAVE_PATH, "wb") as f:
    pickle.dump(results, f)

print("saved results to drive")

Mounted at /content/drive
saved results to drive


In [20]:
#mlp class

#insert a hidden layer between embedding layer and final classification layer

#test for both hidden dim size 100 and hidden dim size 500

In [21]:
from tabulate import tabulate

headers = ["Model", "Test Accuracy"]
table = [['Single-Layer', test_acc_lvl1]]

print(tabulate(table, headers, tablefmt="pretty"))

+--------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|    Model     |                                                                                                                                                                                                 Test Accuracy                                                                                                                                                                                                  |
+--------------+----------------------------------------------------------------------------------------------------------------------------------------------------

In [22]:
#mlp class --- copied slp, now editing
import torch.nn as nn

class MLP(nn.Module):
  def __init__(self, vocab_size: int, embed_dim: int, num_class: int, hidden_dim: int) -> None:
    super().__init__()
    self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse = False)
    self.fc = nn.Sequential(nn.Linear(embed_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, num_class))
    #weights
    self.init_weights()

  def init_weights(self) -> None:
    #random unif sampled initialization value
    initrange = 0.5
    with torch.no_grad():
      for layer in self.modules():
        if isinstance(layer, nn.Linear):
          layer.weight.data.uniform_(-initrange, initrange)
          layer.bias.data.zero_()
        if isinstance(layer, nn.EmbeddingBag):
          layer.weight.uniform_(-initrange, initrange)
          layer.weight.uniform_(-initrange, initrange)

  def forward(self, text: torch.tensor, offsets: torch.tensor) -> torch.tensor:
    embedded = self.embedding(text, offsets)
    return self.fc(embedded)

In [26]:
#then train MLP
HIDDEN_DIMS = [100, 500]

#fix to not overwrite

#save test performance at each epoch of MLP
#compare performance against SLP baseline
#compare 100 and 500 performance
dim_mlp_outputs = {}
#error analysis

for dim in HIDDEN_DIMS:
  model_MLP = MLP(VOCAB_SIZE, EMBED_DIM, NUM_CLASSES, dim).to(device)
  loss_lvl_mlp, acc_lvl_mlp, val_acc_lvl_mlp, test_acc_lvl_mlp, prediction_lvl_mlp, ground_truth_lvl_mlp = train_model(model_MLP, "Multi Layer Perceptron")
  dim_mlp_outputs[dim] = {"loss": loss_lvl_mlp, "accuracy": acc_lvl_mlp, "val_accuracy": val_acc_lvl_mlp,
                          "test_accuracy": test_acc_lvl_mlp, "prediction": prediction_lvl_mlp,
                          "ground_truth": ground_truth_lvl_mlp}

Multi Layer Perceptron
batch 0/137
	Train Loss: 1.781 | Train Acc: 43.16%
	 Val. Acc: 55.85%
batch 0/137
	Train Loss: 1.237 | Train Acc: 61.62%
	 Val. Acc: 59.23%
batch 0/137
	Train Loss: 1.057 | Train Acc: 66.80%
	 Val. Acc: 59.65%
batch 0/137
	Train Loss: 0.940 | Train Acc: 70.11%
	 Val. Acc: 59.13%
batch 0/137
	Train Loss: 0.853 | Train Acc: 72.72%
	 Val. Acc: 58.84%
batch 0/137
	Train Loss: 0.783 | Train Acc: 74.80%
	 Val. Acc: 58.28%
batch 0/137
	Train Loss: 0.720 | Train Acc: 76.81%
	 Val. Acc: 57.62%
batch 0/137
	Train Loss: 0.664 | Train Acc: 78.82%
	 Val. Acc: 57.23%
batch 0/137
	Train Loss: 0.612 | Train Acc: 80.52%
	 Val. Acc: 56.79%
batch 0/137
	Train Loss: 0.562 | Train Acc: 82.49%
	 Val. Acc: 56.31%
batch 0/137
	Train Loss: 0.514 | Train Acc: 84.17%
	 Val. Acc: 55.93%
batch 0/137
	Train Loss: 0.467 | Train Acc: 85.96%
	 Val. Acc: 55.71%
batch 0/137
	Train Loss: 0.422 | Train Acc: 87.61%
	 Val. Acc: 55.30%
batch 0/137
	Train Loss: 0.381 | Train Acc: 89.18%
	 Val. Acc: 54.8

In [27]:
import pickle
from google.colab import drive

mlp_results = {
    "hidden_dim_outputs": dim_mlp_outputs
}

with open("mlp_results.pkl", "wb") as f:
    pickle.dump(mlp_results, f)

print("mlp results saved locallu")

drive.mount('/content/drive')

SAVE_PATH = "/content/drive/MyDrive/mlp_results.pkl"

with open(SAVE_PATH, "wb") as f:
    pickle.dump(mlp_results, f)

print("mlp results saved to drive")

mlp results saved locallu
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
mlp results saved to drive


In [None]:
#must include training loop
#eval loop
#experiment settings (dataset choice, hyperparams, hidden dimension size, batch size)

Report test performance (i.e., accuracy) at each epoch of your MLP (must be a plot).
Compare the performance of the two-layer MLP against the SLP baseline from the in-class approach (adapted to the selected dataset)
Performance comparison of different hidden dimension sizes (100 -> 500)
Perform error analysis. Provide at least 5 incorrectly classified examples. For each, briefly explain why you think the model failed (e.g., ambiguity, rare word, lack of context)

In [1]:
#will have to load things that were saved
import pickle
from google.colab import drive

drive.mount('/content/drive')

MLP_PATH = "/content/drive/MyDrive/mlp_results.pkl"
SLP_PATH = "/content/drive/MyDrive/slp_results.pkl"

with open(MLP_PATH, "rb") as f:
    mlp_results = pickle.load(f)
with open(SLP_PATH, "rb") as f:
    slp_results = pickle.load(f)

print("keys:")
print("MLP:", mlp_results.keys())
print("SLP:", slp_results.keys())

Mounted at /content/drive
keys:
MLP: dict_keys(['hidden_dim_outputs'])
SLP: dict_keys(['slp_loss', 'slp_train_acc', 'slp_val_acc', 'slp_test_acc', 'slp_predictions', 'slp_ground_truth'])


In [2]:
#recalling datasets using same seed so can analyze misclassified cases
from datasets import load_dataset

#same seed as before
SEED = 555
MAX_TRAIN = 140000

ds = load_dataset("community-datasets/yahoo_answers_topics")

train_ds = ds["train"]
test_ds  = ds["test"]

train_ds = train_ds.shuffle(seed=SEED).select(range(MAX_TRAIN))

split_train_ds = train_ds.train_test_split(
    test_size=0.5,
    stratify_by_column="topic",
    seed=SEED   # add this just to be extra safe
)

train_dataset = split_train_ds['train']
valid_dataset = split_train_ds['test']
test_dataset  = test_ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

yahoo_answers_topics/train-00000-of-0000(…):   0%|          | 0.00/241M [00:00<?, ?B/s]

yahoo_answers_topics/train-00001-of-0000(…):   0%|          | 0.00/270M [00:00<?, ?B/s]

yahoo_answers_topics/test-00000-of-00001(…):   0%|          | 0.00/21.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1400000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/60000 [00:00<?, ? examples/s]

In [3]:
!pip install pynarrative
#i just read about pynarrative so i wanted to try using it!
!pip install altair

Collecting pynarrative
  Downloading pynarrative-0.4-py3-none-any.whl.metadata (7.8 kB)
Downloading pynarrative-0.4-py3-none-any.whl (13 kB)
Installing collected packages: pynarrative
Successfully installed pynarrative-0.4


In [10]:
#test accuracy per epoch
import pandas as pd
import pynarrative as pyn
import altair as alt

#accuracy per epoch
slp_accuracy = slp_results['slp_test_acc']
hidden_dim_outputs = mlp_results['hidden_dim_outputs']
mlp100_accuracy = hidden_dim_outputs[100]['test_accuracy']
mlp500_accuracy = hidden_dim_outputs[500]['test_accuracy']

#create a df
df = pd.concat([pd.DataFrame({"epoch": range(1, len(slp_accuracy)+1), "test_accuracy": slp_accuracy, "model": "SLP"}),
               pd.DataFrame({"epoch": range(1, len(mlp100_accuracy)+1), "test_accuracy": mlp100_accuracy, "model": "MLP (100 hidden dim)"}),
               pd.DataFrame({"epoch": range(1, len(mlp500_accuracy)+1), "test_accuracy": mlp500_accuracy, "model": "MLP (500 hidden dim)"})], ignore_index = True)
py_chart = (
    pyn.Story(df, width = 500, height = 500).mark_line(point=True).encode(
        x = alt.X("epoch:Q", title="Epoch", axis = alt.Axis(tickMinStep=1)),
        y = alt.Y("test_accuracy:Q", title="Test Accuracy", scale = alt.Scale(domain=[0,1])),
        color = alt.Color("model:N", title=None)).add_title("Test Accuracy Per Epoch", "SLP vs MLP (hidden = 100, hidden = 500)")
    )

py_chart.render()


In [12]:
#slp v mlp
print("best slp test acc:", max(slp_accuracy), "at epoch", slp_accuracy.index(max(slp_accuracy))+1)
print("best mlp test acc - 100 dim:", max(mlp100_accuracy), "at epoch", mlp100_accuracy.index(max(mlp100_accuracy))+1)
print("best mlp test acc - 500 dim:", max(mlp500_accuracy), "at epoch", mlp500_accuracy.index(max(mlp500_accuracy))+1)

best slp test acc: 0.60115 at epoch 4
best mlp test acc - 100 dim: 0.5958166666666667 at epoch 3
best mlp test acc - 500 dim: 0.5888833333333333 at epoch 3


In [None]:
#hidden dim comparison

In [24]:
#error analyses
#let's look at ones that all got wrong and ones some models got right

slp_preds = slp_results['slp_predictions']
slp_truth = slp_results['slp_ground_truth']

mlp100_outputs = hidden_dim_outputs[100]
mlp500_outputs = hidden_dim_outputs[500]
mlp100_preds = mlp100_outputs['prediction']
mlp100_truth = mlp100_outputs['ground_truth']
mlp500_preds = mlp500_outputs['prediction']
mlp500_truth = mlp500_outputs['ground_truth']

#labels - check matches what is in latex chart
label_names = {
    0: "Society & Culture",
    1: "Science & Mathematics",
    2: "Health",
    3: "Education & Reference",
    4: "Computers & Internet",
    5: "Sports",
    6: "Business & Finance",
    7: "Entertainment & Music",
    8: "Family & Relationships",
    9: "Politics & Government"
}

#find ones wrong in all models
all_wrong = [i for i in range(len(slp_truth)) if slp_preds[i] != slp_truth[i] and mlp100_preds[i] != mlp100_truth[i] and mlp500_preds[i] != mlp500_truth[i]]

print(len(all_wrong))

#let's take 5 of them
for i in all_wrong[:5]:
  print(f'True label: {label_names[slp_truth[i]]}') #can choose any truth
  print(f'SLP Prediction: {label_names[slp_preds[i]]}')
  print(f'MLP 100 dim Prediction: {label_names[mlp100_preds[i]]}')
  print(f'MLP 500 dim Prediction: {label_names[mlp500_preds[i]]}')
  print(f'Question Title:')
  print(f'{test_dataset[i]['question_title']}')
  print(f'\n')
  print(f'Question Content:')
  print(f'{test_dataset[i]['question_content']}')
  print(f'\n')
  print(f'Question Answer:')
  print(f'{test_dataset[i]['best_answer']}')
  print(f'\n')

21786
True label: Family & Relationships
SLP Prediction: Entertainment & Music
MLP 100 dim Prediction: Entertainment & Music
MLP 500 dim Prediction: Entertainment & Music
Question Title:
What makes friendship click?


Question Content:
How does the spark keep going?


Question Answer:
good communication is what does it.  Can you move beyond small talk and say what's really on your mind.  If you start doing this, my expereince is that potentially good friends will respond or shun you.  Then you know who the really good friends are.


True label: Science & Mathematics
SLP Prediction: Education & Reference
MLP 100 dim Prediction: Education & Reference
MLP 500 dim Prediction: Education & Reference
Question Title:
Why does Zebras have stripes?


Question Content:
What is the purpose or those stripes? Who do they serve the Zebras in the wild life?


Question Answer:
this provides camouflage - predator vision is such that it is usually difficult for them to see complex patterns


True label: 

In [25]:
#results where only SLP correct
mlp_wrong = [i for i in range(len(slp_truth)) if slp_preds[i] == slp_truth[i] and mlp100_preds[i] != mlp100_truth[i] and mlp500_preds[i] != mlp500_truth[i]]

print(len(mlp_wrong))

#let's take 5 of them
for i in mlp_wrong[:5]:
  print(f'True label: {label_names[slp_truth[i]]}') #can choose any truth
  print(f'SLP Prediction: {label_names[slp_preds[i]]}')
  print(f'MLP 100 dim Prediction: {label_names[mlp100_preds[i]]}')
  print(f'MLP 500 dim Prediction: {label_names[mlp500_preds[i]]}')
  print(f'Question Title:')
  print(f'{test_dataset[i]['question_title']}')
  print(f'\n')
  print(f'Question Content:')
  print(f'{test_dataset[i]['question_content']}')
  print(f'\n')
  print(f'Question Answer:')
  print(f'{test_dataset[i]['best_answer']}')
  print(f'\n')

2381
True label: Computers & Internet
SLP Prediction: Computers & Internet
MLP 100 dim Prediction: Education & Reference
MLP 500 dim Prediction: Entertainment & Music
Question Title:
what is the best podcast to subscribe to?


Question Content:



Question Answer:
This Week in Tech is a great podcast.  It's hosted by Leo Laport (of TechTV fame) and features several guests each week, usually other ex-TechTV people and John Devorak.  It's basically a discussion of tech news stories from the week.  It's been the top podcast for some time now, probably because of the well known, intelligent hosts as well as the professional production quality.\n\nBrowsing the podcasts available on iTunes can be a good way to easily sample some podcasts on topics you're interested in.


True label: Health
SLP Prediction: Health
MLP 100 dim Prediction: Science & Mathematics
MLP 500 dim Prediction: Science & Mathematics
Question Title:
What's the best way to fight a cold?


Question Content:



Question Answe

In [26]:
#results where MLP correct
slp_wrong = [i for i in range(len(slp_truth)) if slp_preds[i] != slp_truth[i] and (mlp100_preds[i] == mlp100_truth[i] or mlp500_preds[i] == mlp500_truth[i])]

print(len(slp_wrong))

#let's take 5 of them
for i in slp_wrong[:5]:
  print(f'True label: {label_names[slp_truth[i]]}') #can choose any truth
  print(f'SLP Prediction: {label_names[slp_preds[i]]}')
  print(f'MLP 100 dim Prediction: {label_names[mlp100_preds[i]]}')
  print(f'MLP 500 dim Prediction: {label_names[mlp500_preds[i]]}')
  print(f'Question Title:')
  print(f'{test_dataset[i]['question_title']}')
  print(f'\n')
  print(f'Question Content:')
  print(f'{test_dataset[i]['question_content']}')
  print(f'\n')
  print(f'Question Answer:')
  print(f'{test_dataset[i]['best_answer']}')
  print(f'\n')

5539
True label: Science & Mathematics
SLP Prediction: Society & Culture
MLP 100 dim Prediction: Science & Mathematics
MLP 500 dim Prediction: Society & Culture
Question Title:
Is the universe flat?


Question Content:



Question Answer:
Yes, the Universe is flat. It's a VERY difficult thing for most people to wrap their head around. You're Correct.\nJust because you see a round ball, does not necessitate the ball being round. I think the real question is..."Why can't we see it" Einstein said that Reality is a thin, yet persistant veil. Good Luck.


True label: Politics & Government
SLP Prediction: Health
MLP 100 dim Prediction: Politics & Government
MLP 500 dim Prediction: Health
Question Title:
how often does the president visit Bethesda Naval hospital?


Question Content:



Question Answer:
Whenever a full routine checkup is scheduled. \n\nRegular or emergency treatments are also available, for instance Bill Clinton visited in 1996 after multiple cocaine overdoses and was informed