In [45]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, data):
        self.data = torch.Tensor(data)
        self.data.to('cuda')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return {'text': self.data.iloc[index][0], 'label': self.data.iloc[index][1]}


In [1]:
MULTIMODAL_SDK_PATH = "/home/dstratton/PycharmProjects/InterpretableMultimodal/CMU-MultimodalSDK"
import sys
sys.path.append(MULTIMODAL_SDK_PATH)

In [2]:
import mmsdk
from mmsdk import mmdatasdk as md
DATASET = md.cmu_mosi
DATA_PATH = "cmumosi"
train_split = DATASET.standard_folds.standard_train_fold
dev_split = DATASET.standard_folds.standard_valid_fold
test_split = DATASET.standard_folds.standard_test_fold

In [3]:
dataset = md.mmdataset({
    'CMU_MOSI_TimestampedWords': DATA_PATH + '/CMU_MOSI_TimestampedWords.csd',
    'CMU_MOSI_Opinion_Labels': DATA_PATH + '/CMU_MOSI_Opinion_Labels.csd'
})

[92m[1m[2021-12-26 20:06:23.916] | Success | [0mComputational sequence read from file cmumosi/CMU_MOSI_TimestampedWords.csd ...
[94m[1m[2021-12-26 20:06:23.922] | Status  | [0mChecking the integrity of the <words> computational sequence ...
[94m[1m[2021-12-26 20:06:23.922] | Status  | [0mChecking the format of the data in <words> computational sequence ...


                                                                   

[92m[1m[2021-12-26 20:06:23.958] | Success | [0m<words> computational sequence data in correct format.
[94m[1m[2021-12-26 20:06:23.958] | Status  | [0mChecking the format of the metadata in <words> computational sequence ...
[92m[1m[2021-12-26 20:06:23.959] | Success | [0mComputational sequence read from file cmumosi/CMU_MOSI_Opinion_Labels.csd ...
[94m[1m[2021-12-26 20:06:23.965] | Status  | [0mChecking the integrity of the <Opinion Segment Labels> computational sequence ...
[94m[1m[2021-12-26 20:06:23.965] | Status  | [0mChecking the format of the data in <Opinion Segment Labels> computational sequence ...


                                                                   

[92m[1m[2021-12-26 20:06:23.996] | Success | [0m<Opinion Segment Labels> computational sequence data in correct format.
[94m[1m[2021-12-26 20:06:23.997] | Status  | [0mChecking the format of the metadata in <Opinion Segment Labels> computational sequence ...
[92m[1m[2021-12-26 20:06:23.997] | Success | [0mDataset initialized successfully ... 




In [4]:
dataset.align('CMU_MOSI_Opinion_Labels')

[94m[1m[2021-12-26 20:06:28.296] | Status  | [0mUnify was called ...
[92m[1m[2021-12-26 20:06:28.296] | Success | [0mUnify completed ...
[94m[1m[2021-12-26 20:06:28.296] | Status  | [0mPre-alignment based on <CMU_MOSI_Opinion_Labels> computational sequence started ...
[94m[1m[2021-12-26 20:06:28.355] | Status  | [0mPre-alignment done for <CMU_MOSI_TimestampedWords> ...
[94m[1m[2021-12-26 20:06:28.357] | Status  | [0mAlignment starting ...


Overall Progress:   0%|          | 0/93 [00:00<?, ? Computational Sequence Entries/s]
  0%|          | 0/13 [00:00<?, ? Segments/s][A
Aligning 03bSnISJMiM:   0%|          | 0/13 [00:00<?, ? Segments/s][A
                                                                   [A
  0%|          | 0/25 [00:00<?, ? Segments/s][A
Aligning 0h-zjBukYpk:   0%|          | 0/25 [00:00<?, ? Segments/s][A
                                                                   [A
  0%|          | 0/14 [00:00<?, ? Segments/s][A
Aligning 1DmNV9C1hbY:   0%|          | 0/14 [00:00<?, ? Segments/s][A
                                                                   [A
  0%|          | 0/30 [00:00<?, ? Segments/s][A
Aligning 1iG0909rllw:   0%|          | 0/30 [00:00<?, ? Segments/s][A
                                                                   [A
  0%|          | 0/63 [00:00<?, ? Segments/s][A
Aligning 2WGyTLYerpo:   0%|          | 0/63 [00:00<?, ? Segments/s][A
                              

[92m[1m[2021-12-26 20:06:29.753] | Success | [0mAlignment to <CMU_MOSI_Opinion_Labels> complete.
[94m[1m[2021-12-26 20:06:29.753] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2021-12-26 20:06:29.754] | Success | [0mInitialized empty <CMU_MOSI_TimestampedWords> computational sequence.
[94m[1m[2021-12-26 20:06:29.754] | Status  | [0mChecking the format of the data in <CMU_MOSI_TimestampedWords> computational sequence ...


                                                                     

[92m[1m[2021-12-26 20:06:29.757] | Success | [0m<CMU_MOSI_TimestampedWords> computational sequence data in correct format.
[94m[1m[2021-12-26 20:06:29.757] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_TimestampedWords> computational sequence ...
[92m[1m[2021-12-26 20:06:29.757] | Success | [0mInitialized empty <CMU_MOSI_Opinion_Labels> computational sequence.
[94m[1m[2021-12-26 20:06:29.757] | Status  | [0mChecking the format of the data in <CMU_MOSI_Opinion_Labels> computational sequence ...


                                                                     

[92m[1m[2021-12-26 20:06:29.761] | Success | [0m<CMU_MOSI_Opinion_Labels> computational sequence data in correct format.
[94m[1m[2021-12-26 20:06:29.761] | Status  | [0mChecking the format of the metadata in <CMU_MOSI_Opinion_Labels> computational sequence ...




In [5]:
segment_ids = list(dataset['CMU_MOSI_TimestampedWords'].keys())
# filter for test set
segment_ids = [vid for vid in segment_ids if any(substring in vid for substring in test_split)]

In [24]:
# getting data from test set
sentences = []
labels = []
for video_id in segment_ids:
    sentence = []
    for word in dataset['CMU_MOSI_TimestampedWords'][video_id]['features']:
        if word[0] != b'sp':
            sentence.append(word[0].decode('utf-8'))
    sent = ' '.join(sentence)
    sentences.append(sent)
    labels.append(dataset['CMU_MOSI_Opinion_Labels'][video_id]['features'][0][0])
    # you can also store interval information from dataset['CMU_MOSI_TimestampedWords'][video_id]['intervals'] if needed

import pandas as pd
text_data = pd.DataFrame({'text': sentences, 'labels': labels})

In [25]:
from datasets import ClassLabel
import numpy as np
text_data['labels'] = np.sign(text_data['labels']).astype('int32')
text_data['labels'][text_data['labels'] == -1] = 0
# ClassLabel()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_data['labels'][text_data['labels'] == -1] = 0


In [26]:
from datasets import Dataset

data = Dataset.from_pandas(text_data)

In [27]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
# tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
model = transformers.AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
# model = transformers.AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).cuda()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [28]:
def tokenize_function(examples):
    tens =  tokenizer(examples["text"], padding="max_length", truncation=True, #return_tensors="pt"
                      )
    #tens.to('cuda')
    return tens

In [29]:
tokenized_attempt = data.map(tokenize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [46]:
text_dataset = TextDataset(text_data)

In [50]:
tokens = tokenizer(list(text_data["text"]), padding="max_length", truncation=True)
# tokenized_datasets = text_dataset.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True))



In [1]:
from datasets import load_dataset

raw_datasets = load_dataset("imdb")

Reusing dataset imdb (/home/dstratton/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [41]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# tokenized_datasets = raw_datasets.map(tokenize_function)

Loading cached processed dataset at /home/dstratton/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-55e5d530d63920d7.arrow
Loading cached processed dataset at /home/dstratton/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-df60120efc8ea3cf.arrow
Loading cached processed dataset at /home/dstratton/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-c9da8fb9065ff393.arrow


In [42]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [30]:
from transformers import Trainer
from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer", per_device_train_batch_size=2)
# trainer = Trainer(
#     model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset
# )
trainer = Trainer(
    model=model, args=training_args, train_dataset=tokenized_attempt
)

In [31]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 686
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 516


Step,Training Loss


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=516, training_loss=0.508307181587515, metrics={'train_runtime': 98.6954, 'train_samples_per_second': 20.852, 'train_steps_per_second': 5.228, 'total_flos': 541482551930880.0, 'train_loss': 0.508307181587515, 'epoch': 3.0})

In [54]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

In [55]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16


{'eval_loss': 0.7528936266899109,
 'eval_accuracy': 0.857,
 'eval_runtime': 10.6475,
 'eval_samples_per_second': 93.919,
 'eval_steps_per_second': 5.917}

In [34]:
# element = next(iter(tokenized_datasets['train']))
# del element['label']
# del element['text']
# next(iter(raw_datasets['train']))
element = tokenizer(next(iter(raw_datasets['train']))['text'], return_tensors="pt")
element.to('cuda')
model(**element)
# next(iter(tokenized_datasets['train']))

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)

In [50]:
import torch
torch.cuda.empty_cache()

In [49]:
import torch
torch.cuda.memory_summary(device=0, abbreviated=True)

