In [1]:
import torch
import torch.nn as nn
import torchtext
import csv 
from util import get_available_devices
from sentiment_util import evaluate
from models.sentiment_model import MovementPredictor
from torchtext.legacy import data
import spacy
import torch.optim as optim
import torch.optim.lr_scheduler as sched
from torchtext.vocab import GloVe
import torch.nn.functional as F

[nltk_data] Downloading package wordnet to /home/emilyjin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/emilyjin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
!pip install nltk
!pip install spacy==2.3.5
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
!pip install pyresparser
!sudo pip3 install -U spacy
!python3 -m spacy download en


Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 8.1 MB/s eta 0:00:01    |█▍                              | 542 kB 8.1 MB/s eta 0:00:02     |███████████████████████████▍    | 10.3 MB 8.1 MB/s eta 0:00:01


Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/6d/0d/4379e9aa35a444b6440ffe1af4c612533460e0d5ac5c7dca1f96ff6f2e23/spacy-3.0.6.tar.gz (7.1MB)
[K    100% |████████████████████████████████| 7.1MB 210kB/s eta 0:00:01    41% |█████████████▍                  | 3.0MB 34.3MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25lerror
  Complete output from command /usr/bin/python3 -m pip install --ignore-installed --no-user --prefix /tmp/pip-build-env-cnfurlgg --no-warn-script-location --no-binary :none: --only-binary :none: -i https://pypi.org/simple -- setuptools cython>=0.25 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 thinc>=8.0.3,<8.1.0 blis>=0.4.0,<0.8.0 pathy numpy>=1.15.0:
  Collecting setuptools
    Downloading https://files.pythonhosted.org/packages/4e/78/56aa1b5f4d8ac548755ae767d84f0be54fdd9d404197a3d9e4659d272348/setuptools-57.0.0-py3-none-any.whl (821kB)
  Collecting cython>=0.25
    Downloading https://files.pythonhost

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/opt/conda/lib/python3.7/site-packages/en_core_web_sm -->
/opt/conda/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [19]:
spacy.load('en', disable=['ner', 'parser', 'tagger'])
# nlp = spacy.load('en_core_web_sm')

# def tokenizer(text): # create a tokenizer function
#     return [tok.text for tok in nlp.tokenizer(text)]
def tokenize(s):
    return s.split(' ')

TEXT = data.Field(tokenize=tokenize, lower=True, include_lengths=True)
UPVOTE = data.LabelField(sequential=False, use_vocab=False, dtype=torch.int64)
CHANGE = data.LabelField(sequential=False, use_vocab=False, dtype=torch.float)
LABEL = data.LabelField(sequential=False, use_vocab=False, dtype=torch.int64)

In [12]:
def create_csv():
    with open('removed_characters.csv') as in_file:
        with open('removed_characters_buckets.csv', 'w') as out_file:
            reader = csv.reader(in_file, delimiter=',')
            writer = csv.writer(out_file)
            for row in reader:
                text = row[0].split(', ')
                text = ' '.join(text)
                row_data = [text]
                row_data.extend(row[-3:-1])
                label = 1 - float(row[-1])
                # Strong buy
                if label >= .03:
                    label = 0
                # Buy
                elif .01 < label < .03:
                    label = 1
                # Hold
                elif -.01 <= label <= .01:
                    label = 2
                # Sell
                elif -.01 > label > -.03:
                    label = 3
                else:
                    label = 4
                row_data.append(label)
                writer.writerow(row_data)
    in_file.close()

In [13]:
def data_preprocess(max_vocab_size, device, batch_size):

    # Map data to fields
    fields_text = [('text', TEXT), ('upvote', UPVOTE), ('change', CHANGE), ('label', LABEL)]

    # Apply field definition to create torch dataset
    dataset = data.TabularDataset(
        path="removed_characters_buckets.csv",
        format="CSV",
        fields=fields_text,
        skip_header=False)

    # Split data into train, test, validation sets
    (train_data, test_data, valid_data) = dataset.split(split_ratio=[0.8, 0.1, 0.1])

    print("Number of train data: {}".format(len(train_data)))
    print("Number of test data: {}".format(len(test_data)))
    print("Number of validation data: {}".format(len(valid_data)))

    # unk_init initializes words in the vocab using the Gaussian distribution
    TEXT.build_vocab(train_data,
                     max_size=max_vocab_size,
                     vectors="glove.6B.100d",
                     unk_init=torch.Tensor.normal_)

    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        device=device,
        batch_sizes=(batch_size, batch_size, batch_size),
        sort_key=lambda x: len(x.text),
        sort_within_batch=False)

    return train_iterator, valid_iterator, test_iterator

In [37]:
train = True
batch_size = 449
hidden_size = 256
drop_prob = 0.5
learning_rate = 1e-2 # TODO: hyper
num_epochs = 100
beta1, beta2 = 0.9, 0.999 # for Adam
alpha = 0.2 # for ELU # TODO: hyper
max_grad_norm = 1.0
print_every = 50
save_dir = 'results/model.path_lr_{:.4}_drop_prob_{:.4}_alpha_{:.4}.tar'.format(learning_rate, drop_prob, alpha)

device, gpu_ids = get_available_devices()

In [15]:
create_csv()
# train_iterator, valid_iterator, test_iterator = data_preprocess(25000, device, batch_size)

In [20]:
fields_text = [('text', TEXT), ('upvote', UPVOTE), ('change', CHANGE), ('label', LABEL)]

# Apply field definition to create torch dataset
dataset = data.TabularDataset(
    path="removed_characters_buckets.csv",
    format="CSV",
    fields=fields_text,
    skip_header=False)

In [21]:
# Split data into train, test, validation sets
(train_data, test_data, valid_data) = dataset.split(split_ratio=[0.8, 0.1, 0.1])

print("Number of train data: {}".format(len(train_data)))
print("Number of test data: {}".format(len(test_data)))
print("Number of validation data: {}".format(len(valid_data)))

Number of train data: 145027
Number of test data: 18129
Number of validation data: 18128


In [27]:
for i in range(3):
    print(f'Sample {i}: {vars(train_data[i])}')

Sample 0: {'text': ['spy', 'making', 'the', 'ol', 'ski', 'jump', 'pattern'], 'upvote': '6', 'change': '1.0006460879476995', 'label': '2'}
Sample 1: {'text': ['spy', 'opening', 'it', 's', 'butthole', 'for', 'tsla', 'inclusion'], 'upvote': '5', 'change': '1.0057336878130712', 'label': '2'}
Sample 2: {'text': ['thinking', 'about', 'buying', '', 'amc', 'c', '', 'as', 'a', 'yolo', 'if', 'they', 'survive', 'this', 'pandemic', 'and', 'don', 't', 'shutter', 'completely', 'this', 'is', 'all', 'off', 'a', 'gut', 'feeling', 'that', 'whoever', 'put', 'their', 'money', 'into', 'this', 'a', 'few', 'years', 'ago', 'is', 'going', 'to', 'do', 'everything', 'to', 'keep', 'it', 'from', 'dying'], 'upvote': '4', 'change': '0.8698884254252404', 'label': '4'}


In [35]:
# unk_init initializes words in the vocab using the Gaussian distribution
TEXT.build_vocab(train_data,
                 max_size=25000,
                 vectors="glove.6B.100d",
                 unk_init=torch.Tensor.normal_)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_sizes=(batch_size, batch_size, batch_size),
    sort_key=lambda x: len(x.text),
    sort_within_batch=False)

In [29]:
# Initialize model.
model = MovementPredictor(
    vocab_size=287799,
    embedding_dim=100,
    hidden_dim=hidden_size,
    n_layers=2,
    bidirectional=True,
    dropout=drop_prob,
    pad_idx=TEXT.vocab.stoi[TEXT.pad_token],
    alpha=alpha
)

# pretrained_embeddings = TEXT.vocab.vectors
# model.embedding.weight.data.copy_(pretrained_embeddings)

model = nn.DataParallel(model, gpu_ids)

# Initialize optimizer and scheduler.
optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(beta1, beta2))
#scheduler = sched.LambdaLR(optimizer, lambda s: 1.)

# Training

In [36]:
iter = 0
checkpoint = 0

if train:
    for epoch in range(num_epochs):
        with torch.enable_grad():
            for vector in train_iterator:
                optimizer.zero_grad()
                # Grab labels.
                target = torch.zeros((batch_size, 5))
                target[torch.arange(batch_size), vector.label] = 1
                # Grab other data for multimodal sentiment analysis.
                multimodal_data = torch.cat((vector.upvote.unsqueeze(dim=1),
                                             vector.change.unsqueeze(dim=1)), dim=1) # Upvotes + past week change
                # Apply model
                y = model(vector, multimodal_data)
                target = target.to(device)
                loss_function = nn.BCEWithLogitsLoss()
                loss = loss_function(y, target)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                #scheduler.step(step // batch_size)
                if iter % print_every == 0:
                    print('Epoch:{}, Iter: {}, Loss:{:.4}'.format(epoch, iter, loss.item()))
                iter += 1

            torch.save(model, save_dir)
            if checkpoint % 3 == 0:
                print("evaluating on dev split...")
                loss_val, accuracy = evaluate(model, test_iterator, device)
                print("dev loss: ", loss_val, "dev accuracy: ", accuracy)
                checkpoint += 1

Epoch:0, Iter: 0, Loss:0.7637
Epoch:0, Iter: 50, Loss:0.7625
Epoch:0, Iter: 100, Loss:0.7601


IndexError: shape mismatch: indexing tensors could not be broadcast together with shapes [1024], [643]

# Testing

In [None]:
print("testing data, loading from path" + save_dir + " ...")
    model = torch.load(save_dir)
    loss_val, accuracy = evaluate(model, test_iterator, criterion=nn.BCEWithLogitsLoss())
    print("test loss: ", loss_val, "test accuracy: ", accuracy)