Diana Covaci, 261 086 280

Nicholas Milin, 261 106 314

Viktor Allais, 261 148 866

Link to write-up draft: https://docs.google.com/document/d/1SFnJJ0C4B64lkwmnU2XbigqCRX71LE-kLrOZSMkxHWE/edit?usp=sharing 

In [21]:
!pip install -q numpy pandas matplotlib seaborn scikit-learn ucimlrepo
!pip install -q torchvision gensim
!pip install -q tqdm boto3 requests regex sentencepiece sacremoses
!pip install -q pytorch-pretrained-bert transformers


import numpy as np
import matplotlib.pyplot as plt
import re
import torch
import torch.nn as nn
from gensim.models import Word2Vec
from collections import Counter
import sys
import random as rn
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from pytorch_pretrained_bert import BertModel, BertTokenizer
from torch.optim import Adam


  from .autonotebook import tqdm as notebook_tqdm


# Task 1: Acquire and pre-process the Web of Science Dataset

In [16]:
#  get text and labels from data folder
DATA_DIR = "WOS11967"   # folder that holds X.txt / YL1.txt / YL2.txt

# X.txt is read as one stripped text per line; YL1.txt and YL2.txt as one integer per line.
# Iterate over the file handles directly (no intermediate readlines() list) and
# use the same explicit encoding for all three files so the result does not
# depend on the platform default.
with open(f"{DATA_DIR}/X.txt", "r", encoding="utf-8") as f:
    X = [line.strip() for line in f]
with open(f"{DATA_DIR}/YL1.txt", "r", encoding="utf-8") as f:
    y1 = [int(line.strip()) for line in f]
with open(f"{DATA_DIR}/YL2.txt", "r", encoding="utf-8") as f:
    y2 = [int(line.strip()) for line in f]

# texts and labels are matched purely by line position, so a length mismatch
# would silently mislabel samples -- fail loudly instead
assert len(X) == len(y1) == len(y2), (
    f"misaligned data files: {len(X)} texts, {len(y1)} YL1 labels, {len(y2)} YL2 labels"
)

# id -> readable name; presumably these are the YL1 ids -- verify against the dataset's metadata
labels = {0:"Computer Science", 1:"Electrical Engineering", 2:"Psychology", 3:"Mechanical Engineering", 4:"Civil Engineering", 5:"Medical Science",6:"Biochemistry"}

In [17]:
# define pre-processing helper
def clean_and_tokenize(text):
    """Lower-case ``text``, blank out unsupported characters and split it into tokens.

    Every character other than ASCII letters, digits and the punctuation
    ``(){}[].,!^:;-_/`` is replaced by a space, then the string is split on
    whitespace. Punctuation that is kept stays attached to its neighbouring word.

    Returns:
        list of token strings (empty if nothing survives the cleaning).
    """
    lowered = text.lower()
    filtered = re.sub(r"[^a-zA-Z0-9(){}\[\].,!^:;\-_/]", " ", lowered)
    # a bare split() already collapses runs of whitespace and ignores
    # leading/trailing blanks, so no separate normalisation pass is required
    return filtered.split()

def tokens_to_ids(tokens, word2idx):
    """Translate each token into its integer id from ``word2idx``.

    Tokens missing from the mapping are assigned the id stored under ``'<UNK>'``.

    Returns:
        list of ids, one per input token, in the same order.
    """
    ids = []
    for token in tokens:
        # the fallback is looked up once per token, so an empty token list
        # never touches the '<UNK>' entry
        unknown_id = word2idx['<UNK>']
        ids.append(word2idx.get(token, unknown_id))
    return ids

In [18]:
# -- LSTM pre-processing pipeline --
# use word2vec because of dense semantic embeddings, faster convergence, capturing similarities, reducing sparsity and good flexibility

# initialize seed values for stable outcomes
SEED = 42
rn.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)


# clean and tokenize text
tokenized_X_LSTM = [clean_and_tokenize(line) for line in X]
print(len(tokenized_X_LSTM))
# get word frequencies within the vocabulary
word_frequency = Counter(word for line in tokenized_X_LSTM for word in line)

# didn't implement a maximum vocabulary size, but given size X can take word_frequency.most_common(X-2) to avoid rarest words if improves model performance
# ids 0 and 1 are reserved for the special tokens; real words start at 2
idx2word = ['<PAD>', '<UNK>'] + list(word_frequency.keys())
word2idx = {word:idx for idx, word in enumerate(idx2word)}
vocab_size = len(word2idx)
print(vocab_size)
PAD_IDX = word2idx['<PAD>']   # named instead of a bare 0 so padding and embedding layer stay in sync

# produce sequences
MAX_LEN = 300   # larger than the average abstract of a scientific paper
sequences = [tokens_to_ids(line, word2idx) for line in tokenized_X_LSTM]
# truncate to MAX_LEN, then right-pad with PAD_IDX; for sequences that are already
# long enough the repeat count is <= 0, which yields an empty padding list
full_sequences = [seq[:MAX_LEN] + [PAD_IDX] * (MAX_LEN - len(seq)) for seq in sequences]
assert all(len(seq) == MAX_LEN for seq in full_sequences)

embedding_dim = 100   # can make larger if this doesn't capture enough complexity
# increase min_count if too much noise for model
# Word2Vec has its own RNG and trains multi-threaded by default, so the global seeds
# above do not make it reproducible: pass the seed explicitly and use a single worker.
# NOTE(review): gensim additionally documents PYTHONHASHSEED as a requirement for
# reproducibility across interpreter launches -- set it in the environment if needed.
w2v_model = Word2Vec(
    sentences=tokenized_X_LSTM,
    vector_size=embedding_dim,
    min_count=1,
    sg=1,              # skip-gram
    seed=SEED,
    workers=1,
)

# build embedding matrix
# rows start as small random noise; every word known to word2vec is overwritten with its vector
embedding_matrix = np.random.normal(size=(vocab_size, embedding_dim)) * 0.01
for word, idx in word2idx.items():
    if word in w2v_model.wv:
        embedding_matrix[idx] = w2v_model.wv[word]
# padding carries no information: keep its vector at exactly zero
embedding_matrix[PAD_IDX] = 0.0

# prepare LSTM input
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float)
X_tensor = torch.tensor(full_sequences, dtype=torch.long)
# freeze=False keeps the word vectors trainable; padding_idx excludes the <PAD> row
# from gradient updates so it stays the fixed zero vector set above
embedding_layer = nn.Embedding.from_pretrained(embedding_matrix, freeze=False, padding_idx=PAD_IDX)
# shape sanity check over the full corpus
# NOTE(review): this embeds every document in one call with autograd enabled --
# presumably training should call embedding_layer per mini-batch instead; confirm
embedded = embedding_layer(X_tensor)
print(embedded.shape)

11967
132677
torch.Size([11967, 300, 100])


In [24]:
# BERT pre-processing (still to do -- there's a tutorial that can be used)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# tokenize all texts in one call; every row is padded or truncated to
# exactly 128 ids and the result is returned as PyTorch tensors
encoding = tokenizer(
    X,
    max_length=128,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
print(encoding['input_ids'].shape)


torch.Size([11967, 128])


In [None]:
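# TODO (disabled draft): run the pretrained BERT encoder over the tokenized texts
# and keep the hidden state at position 0 of each sequence as its embedding.
# NOTE(review): as written this feeds the entire `encoding` through the model in a
# single call -- presumably it needs batching before being re-enabled; confirm.
#bert_model = AutoModel.from_pretrained('bert-base-uncased')

#with torch.no_grad(): 
#    outputs = bert_model(input_ids=encoding['input_ids'], attention_mask=encoding['attention_mask'])
# embeddings = outputs.last_hidden_state
# cls_embeddings = embeddings[:, 0, :]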