In [183]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers.tokenization_utils_base import BatchEncoding

import pandas as pd
from datasets import load_from_disk, concatenate_datasets, Dataset
from proj_utils import RegularizedCLSModel
from sklearn.decomposition import PCA

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import chart_studio
import chart_studio.plotly as py
import textwrap

import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Hugging Face model identifiers. Only `tl_model_name` is referenced by the cells
# visible in this notebook; the other two are not used here.
tl_model_name = "jcblaise/roberta-tagalog-base"
xlm_model_name = "xlm-roberta-base"
en_model_name = "bert-base-uncased"

## Running all examples through the encoder

In [3]:
# Load the previously saved dataset from disk; later cells index it by 'train' and 'test'.
dataset_1 = load_from_disk('./full_data/dataset_1')

In [4]:
# Build the classifier wrapper around the Tagalog RoBERTa checkpoint and restore the
# saved weights. `map_location='cpu'` lets the checkpoint deserialize even if it was
# saved from a GPU device that is not present here; the encoder is moved to the GPU
# in a later cell.
best_tl_model = RegularizedCLSModel(tl_model_name)
best_tl_model.load_state_dict(
    torch.load('./models/model_1/full_1e-05_alpha_10/model.pt', map_location='cpu')
)

Some weights of the model checkpoint at jcblaise/roberta-tagalog-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at jcblaise/roberta-tagalog-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'cla

<All keys matched successfully>

In [5]:
# Keep only the `roberta` encoder submodule of the wrapped classifier, move it to the
# GPU, and put it in eval mode explicitly so dropout cannot perturb the embeddings.
best_model = best_tl_model.model.roberta.to('cuda').eval()

In [6]:
def collate_fn(examples, pad_token_id=1, max_length=200):
    """Pad and truncate one batch of tokenized examples into model-ready tensors.

    Args:
        examples: Two-element sequence ``[input_ids, attention_mask]``; each element
            is a list of per-example integer sequences.
        pad_token_id: Value used to pad ``input_ids``. Defaults to 1, the value that
            was previously hard-coded (presumably the tokenizer's pad id -- confirm).
        max_length: Maximum number of tokens kept per example. Defaults to 200.

    Returns:
        A ``BatchEncoding`` with ``input_ids`` and ``attention_mask`` tensors, each of
        width ``min(longest sequence in the batch, max_length)``.
    """
    def collate_helper(seq, pad_val):
        # Truncate before padding: same result as padding first and slicing after,
        # without materialising columns that would be thrown away.
        rows = [torch.tensor(row[:max_length]) for row in seq]
        return pad_sequence(rows, padding_value=pad_val, batch_first=True)

    input_ids, attention_mask = examples[0], examples[1]
    return BatchEncoding({
        'input_ids': collate_helper(input_ids, pad_token_id),
        # Padding positions must be masked out, hence 0.
        'attention_mask': collate_helper(attention_mask, 0),
    })

def process(examples):
    """Encode one batch of tokenized examples and attach the position-0 embeddings.

    Intended for ``Dataset.map(..., batched=True)``: ``examples`` maps column names to
    lists of values and must contain ``input_ids`` and ``attention_mask``.

    Returns:
        The same ``examples`` mapping with an added ``encoded`` entry holding, for each
        example, the encoder's ``last_hidden_state`` at sequence position 0 (on CPU).
    """
    # collate_fn expects the two columns as a pair. The whole map batch is encoded in
    # a single forward pass, so no DataLoader is needed around it.
    batch = collate_fn([examples['input_ids'], examples['attention_mask']]).to('cuda')

    # Inference only: no autograd graph, and half-precision autocast on the GPU.
    with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16):
        cls_embeddings = best_model(**batch)['last_hidden_state'][:, 0]

    examples['encoded'] = cls_embeddings.to('cpu')
    return examples

In [7]:
# Encode every split. Each call to `process` receives up to 16 examples, runs them
# through the encoder, and stores the result in a new 'encoded' column.
dataset_1 = dataset_1.map(process, batched=True, batch_size=16)

  0%|          | 0/3588 [00:00<?, ?ba/s]

  0%|          | 0/399 [00:00<?, ?ba/s]

In [21]:
# Merge the train and test splits into a single dataset for the PCA and plotting below.
dataset = concatenate_datasets([dataset_1['train'], dataset_1['test']])

In [22]:
# Pull the 'encoded' column out of the dataset; it is the input to PCA.
enc = dataset['encoded']

## Fitting a PCA model to the encoder outputs

In [23]:
# No n_components is given, so PCA keeps every component; only the first three are
# used for plotting further down.
pca = PCA()
pca.fit(enc)

In [24]:
# Project the embeddings onto the fitted principal components.
pca_data = pca.transform(enc)

In [33]:
# Wrap the projection in a single-column Dataset so it can be joined to `dataset`.
# NOTE(review): this rebinds `pca_data` from the transform output to a Dataset, so
# re-running this cell on its own will not reproduce the same value.
pca_data = Dataset.from_dict({'pca_data': pca_data})

In [34]:
# Column-wise join (axis=1): adds 'pca_data' alongside the existing columns.
# NOTE(review): `dataset` is rebound to its own extension, so this cell is not
# idempotent -- re-running it would try to add 'pca_data' a second time.
dataset = concatenate_datasets([dataset, pca_data], axis=1)

In [57]:
def add_pca_3d(examples):
    """Add a ``pca_3d`` entry holding the first three PCA components of each row.

    ``examples`` is a batch mapping with a ``pca_data`` entry (one sequence of
    component values per row). The mapping is updated in place and returned.
    """
    components = torch.tensor(examples['pca_data'])
    # Keep every row, but only the leading three columns.
    leading_three = components[:, :3]
    examples['pca_3d'] = leading_three
    return examples

In [58]:
# Add the 'pca_3d' column (first three PCA components) to every row.
dataset = dataset.map(add_pca_3d, batched=True)

  0%|          | 0/64 [00:00<?, ?ba/s]

## Plotting the PCA data

In [65]:
# First 100 rows, for quick experiments.
# NOTE(review): `subset` is not referenced by any later cell visible in this notebook.
subset = dataset.select(range(100))

In [105]:
dataset

Dataset({
    features: ['id', 'conversation_id', 'created_at', 'date', 'time', 'timezone', 'user_id', 'username', 'name', 'place', 'tweet', 'language', 'mentions', 'urls', 'photos', 'replies_count', 'retweets_count', 'likes_count', 'hashtags', 'cashtags', 'link', 'retweet', 'quote_url', 'video', 'thumbnail', 'near', 'geo', 'source', 'user_rt_id', 'user_rt', 'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src', 'trans_dest', 'hashtags_lower', 'anti', 'pro', 'labeled', 'processed', 'labels', 'text', 'en_translation', 'lang', 'tl_translation', '__index_level_0__', 'input_ids', 'attention_mask', 'encoded', 'pca_data', 'pca_3d'],
    num_rows: 63779
})

In [168]:
def plot(dataset, transparent_background=False):
    """Build an interactive 3-D scatter plot of the PCA-projected tweets.

    Each point is one tweet, positioned by its ``pca_3d`` coordinates, coloured by
    stance label and sized by the square root of (retweets + 1). Hovering shows the
    wrapped tweet text and its retweet count.

    Args:
        dataset: Dataset (or a selection of one) providing the columns ``pca_3d``,
            ``labels``, ``retweets_count`` and ``tweet``.
        transparent_background: When True, make the paper and plot backgrounds
            fully transparent. Defaults to False, which leaves Plotly's default
            background in place.

    Returns:
        The Plotly figure; nothing is displayed or written by this function.
    """
    df = pd.DataFrame(dataset['pca_3d'], columns=['x', 'y', 'z'])
    df['label'] = dataset['labels']
    df['label'] = df['label'].map({0: "Anti-Marcos", 1: "Pro-Marcos"})
    df['retweets'] = dataset['retweets_count']
    # +1 keeps tweets with zero retweets visible; the square root compresses the
    # range so a few viral tweets do not dwarf everything else.
    df['size'] = (df['retweets'] + 1) ** 0.5
    # Plotly hover labels do not wrap on their own, so insert explicit <br> breaks.
    df['tweet'] = [
        '<br>'.join(textwrap.wrap(txt, width=40)) for txt in dataset['tweet']
    ]

    # Naming the customdata columns explicitly fixes their positions, so the
    # template below does not depend on how Plotly Express orders hover_data.
    fig = px.scatter_3d(
        df,
        x='x', y='y', z='z',
        size='size', color='label', size_max=40,
        custom_data=['tweet', 'retweets'],
    )
    fig.update_scenes(xaxis_visible=False, yaxis_visible=False, zaxis_visible=False)
    fig.update_traces(
        hovertemplate=(
            "%{customdata[0]}<br><br>"
            "Retweets: %{customdata[1]}<br>"
        )
    )
    fig.update_layout(hoverlabel=dict(font_size=12, font_family="Overpass"))

    if transparent_background:
        fig.update_layout(
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(0,0,0,0)',
        )

    return fig

In [169]:
# Plot only the first 10,000 rows of the dataset.
fig = plot(dataset.select(range(10000)))
fig.show()

In [181]:
# ## If plot is small enough, you can use this
# username = os.getenv('PLOTLY_USERNAME')
# api_key = os.getenv('PLOTLY_API_KEY')
# chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
# py.plot(fig, filename='elections_10k')

In [185]:
# Otherwise, write the figure to a standalone HTML file (index.html in the current
# working directory) that can be embedded somewhere else.
pio.write_html(fig, file="index.html")