In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn

from sklearn import svm
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import confusion_matrix, classification_report

import torch.optim
import torch.nn as nn

# local
from tokenizer import simple_tokenizer

# run data_processing.ipynb to generate the train, val, test sets here
# load and split train, val, test, (X, y)
TRAIN_DATA = 'data/training.1440000.csv'
VAL_DATA = 'data/validation.80000.csv'
TEST_DATA = 'data/test.80000.csv'

In [4]:
!pip install torch

Collecting torch
  Downloading torch-1.11.0-cp37-cp37m-win_amd64.whl (158.0 MB)
Installing collected packages: torch
Successfully installed torch-1.11.0


distutils: c:\users\yindy\miniconda3\Include\UNKNOWN
sysconfig: c:\users\yindy\miniconda3\Include
user = False
home = None
root = None
prefix = None
distutils: c:\users\yindy\miniconda3\Include\UNKNOWN
sysconfig: c:\users\yindy\miniconda3\Include
user = False
home = None
root = None
prefix = None
You should consider upgrading via the 'c:\users\yindy\miniconda3\python.exe -m pip install --upgrade pip' command.


In [6]:
df_train = pd.read_csv(TRAIN_DATA)
df_val = pd.read_csv(VAL_DATA)
df_test = pd.read_csv(TEST_DATA)

X_train, y_train = df_train['text'], df_train['target']
X_val, y_val = df_val['text'], df_val['target']
X_test, y_test = df_test['text'], df_test['target']

In [7]:
from transformers import BertModel, BertTokenizer

bert_weights_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
bert_model = BertModel.from_pretrained(bert_weights_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
#X_train_ids = [bert_tokenizer.encode(txt) for txt in X_train
#X_train_ids = [bert_tokenizer.encode(txt) for txt in X_train[:10000]]

max_length = 100

sample_size = 30000
X_train_info = bert_tokenizer.batch_encode_plus(X_train[:sample_size], max_length=max_length, padding='max_length', truncation=True)
train_x = X_train_info['input_ids']

X_val_info = bert_tokenizer.batch_encode_plus(X_val[:sample_size], max_length=max_length, padding='max_length', truncation=True)
val_x = X_val_info['input_ids']

X_test_info = bert_tokenizer.batch_encode_plus(X_test[:sample_size], max_length=max_length, padding='max_length', truncation=True)
test_x = X_test_info['input_ids']

train_y = y_train[:sample_size]
val_y = y_val[:sample_size]
test_y = y_test[:sample_size]


In [9]:
train_x = np.array(train_x)
val_x = np.array(val_x)
test_x = np.array(test_x)
train_y = np.array(train_y)
val_y = np.array(val_y)
test_y = np.array(test_y)

import torch
from torch.utils.data import DataLoader, TensorDataset
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
val_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))
# dataloaders
batch_size = 1000
# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)


In [10]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)


Sample input size:  torch.Size([1000, 100])
Sample input: 
 tensor([[  101, 10166,  1010,  ...,     0,     0,     0],
        [  101,  2158,  1045,  ...,     0,     0,     0],
        [  101, 24471,  2140,  ...,     0,     0,     0],
        ...,
        [  101,  1045,  2293,  ...,     0,     0,     0],
        [  101,  9850,  1038,  ...,     0,     0,     0],
        [  101,  2437,  7852,  ...,     0,     0,     0]], dtype=torch.int32)

Sample label size:  torch.Size([1000])
Sample label: 
 tensor([0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0,
        0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1,
        1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
        1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
        0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
        1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
        1, 1, 1, 1, 0, 

In [11]:
class SentimentLSTM(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

# Instantiate the model w/ hyperparams
vocab_size = 30000
output_size = 1
embedding_dim = 200
hidden_dim = 128
n_layers = 2
net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

In [12]:
train_on_gpu = False
# loss and optimization functions
lr=0.005

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


# training params

epochs = 2

counter = 0
print_every = 1
clip=5 # gradient clipping

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        inputs = inputs.type(torch.LongTensor)
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in val_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                inputs = inputs.type(torch.LongTensor)
                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))


Epoch: 1/2... Step: 1... Loss: 0.692638... Val Loss: 0.696634
Epoch: 1/2... Step: 2... Loss: 0.700616... Val Loss: 0.792424
Epoch: 1/2... Step: 3... Loss: 0.775563... Val Loss: 0.709380
Epoch: 1/2... Step: 4... Loss: 0.705125... Val Loss: 0.693123
Epoch: 1/2... Step: 5... Loss: 0.693283... Val Loss: 0.696011
Epoch: 1/2... Step: 6... Loss: 0.695297... Val Loss: 0.697815
Epoch: 1/2... Step: 7... Loss: 0.699915... Val Loss: 0.696370
Epoch: 1/2... Step: 8... Loss: 0.698147... Val Loss: 0.694602
Epoch: 1/2... Step: 9... Loss: 0.692488... Val Loss: 0.693578
Epoch: 1/2... Step: 10... Loss: 0.693171... Val Loss: 0.693182
Epoch: 1/2... Step: 11... Loss: 0.694721... Val Loss: 0.693135
Epoch: 1/2... Step: 12... Loss: 0.694043... Val Loss: 0.693288
Epoch: 1/2... Step: 13... Loss: 0.693511... Val Loss: 0.693551
Epoch: 1/2... Step: 14... Loss: 0.693908... Val Loss: 0.693633
Epoch: 1/2... Step: 15... Loss: 0.693455... Val Loss: 0.693721
Epoch: 1/2... Step: 16... Loss: 0.693952... Val Loss: 0.693754
E

In [13]:
# Get test data loss and accuracy

test_losses = [] # track loss
preds = []
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    inputs = inputs.type(torch.LongTensor)
    output, h = net(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    preds.append(pred)
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))


RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 2048000 bytes.

In [138]:
num_correct

4973

In [139]:
test_loader.dataset

<torch.utils.data.dataset.TensorDataset at 0x157af51cc40>

In [51]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.losses import binary_crossentropy
from keras import backend as K
import keras.layers as L
from tensorflow.keras.layers import Layer, InputSpec
from keras import initializers, regularizers, constraints, optimizers, layers

from keras.models import Model
from keras.optimizers import Adam
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold


In [144]:
def get_bert_embed_matrix():
    bert_embeddings = list(bert_model.children())[0]
    bert_word_embeddings = list(bert_embeddings.children())[0]
    mat = bert_word_embeddings.weight.data.numpy()
    return mat
embedding_matrix = get_bert_embed_matrix()


In [143]:
print(tf.__version__)

2.9.1


In [55]:
MAXLEN = 250
def build_model(embedding_matrix, loss_weight):
    '''
    credits go to: https://www.kaggle.com/thousandvoices/simple-lstm/
    '''
    words = Input(shape=(MAXLEN,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

    hidden = concatenate([GlobalMaxPooling1D()(x),GlobalAveragePooling1D()(x),])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    result = Dense(1, activation='sigmoid')(hidden)
    aux_result = Dense(1, activation='sigmoid')(hidden)
    
    model = Model(inputs=words, outputs=[result, aux_result])
    model.compile(loss=[custom_loss,'binary_crossentropy'], loss_weights=[loss_weight, 1.0], optimizer='adam')

    return model
def custom_loss(y_true, y_pred):
    return binary_crossentropy(K.reshape(y_true[:,0],(-1,1)), y_pred) * y_true[:,1]
model = build_model(embedding_matrix, y_aux_train.shape[-1],loss_weight)


In [None]:
build_model

In [2]:
from tokenizer import simple_tokenizer
simple_tokenizer(X_val, pad_len=64)

NameError: name 'X_val' is not defined

In [4]:
X_val_v = simple_tokenizer(X_val, pad_len=64)


In [10]:
max([len(x) for x in X_val_v])

64

In [8]:
X_val_v[:3]

[array([   1, 3223,   51,   21,   57,    5, 3946,   10,    5, 7012, 1188,
         148,   23, 8636,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]),
 array([    1,    80,    32,  3947,    14,  3224,     7, 28329,  4585,
            9,  3947,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0]),
 array([    1,  4586,   432,     3,  2069,   295,   

In [None]:
X_train_v[:3]

In [35]:
import numpy as np
np.pad(X_train_v[0], (0, 64 - len(X_train_v[0])), 'constant')

array([    1,   213,    73,   459,     9,   813, 47141,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0])

In [17]:
X_train_v[0]

[1, 213, 73, 459, 9, 813, 47141]

In [5]:
from transformers import RobertaModel, RobertaTokenizer
roberta_weights_name = 'roberta-base'
model = RobertaModel.from_pretrained(roberta_weights_name)
tokenizer = RobertaTokenizer.from_pretrained(roberta_weights_name)

def cls_phi(text):
    encoding = vsm.hf_encode(text, tokenizer, add_special_tokens=True)
    reps = vsm.hf_represent(encoding, model)
    cls_rep = reps[0][0]
    return cls_rep.cpu().numpy()


In [21]:
!pip install tensorflow
import keras

Collecting tensorflow
  Downloading tensorflow-2.9.1-cp39-cp39-win_amd64.whl (444.0 MB)
Collecting tensorflow-estimator<2.10.0,>=2.9.0rc0
  Downloading tensorflow_estimator-2.9.0-py2.py3-none-any.whl (438 kB)
Collecting protobuf<3.20,>=3.9.2
  Downloading protobuf-3.19.4-cp39-cp39-win_amd64.whl (895 kB)
Collecting google-pasta>=0.1.1
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting absl-py>=1.0.0
  Downloading absl_py-1.1.0-py3-none-any.whl (123 kB)
Collecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting libclang>=13.0.0
  Downloading libclang-14.0.1-py2.py3-none-win_amd64.whl (14.2 MB)
Collecting gast<=0.4.0,>=0.2.1
  Downloading gast-0.4.0-py3-none-any.whl (9.8 kB)
Collecting grpcio<2.0,>=1.24.3
  Downloading grpcio-1.46.3-cp39-cp39-win_amd64.whl (3.5 MB)
Collecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting tensorboard<2.10,>=2.9
  Downloading tensorboard-2.9.0-py3-none-any.whl 

In [141]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertModel
max_length = 100
def bert_tweets_model():
    bert_encoder = TFBertModel.from_pretrained(bert_weights_name)
    input_word_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    last_hidden_states = bert_encoder(input_word_ids)[0]    
    x = tf.keras.layers.LSTM(100, dropout=0.3, recurrent_dropout=0.3)(last_hidden_states)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    model = tf.keras.Model(inputs=input_word_ids, outputs=output)
    
    return model


In [146]:
bert_encoder = TFBertModel.from_pretrained(bert_weights_name)

ImportError: 
TFBertModel requires the TensorFlow library but it was not found in your environment. Checkout the instructions on the
installation page: https://www.tensorflow.org/install and follow the ones that match your environment.


In [147]:
print(transformers.__version__)

4.17.0


In [43]:
import transformers
import tensorflow

In [142]:
model = bert_tweets_model()

ImportError: 
TFBertModel requires the TensorFlow library but it was not found in your environment. Checkout the instructions on the
installation page: https://www.tensorflow.org/install and follow the ones that match your environment.


In [10]:
print (count_words)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

