In [4]:
import torch
import numpy as np
import pandas as pd

from model import PrefixNet

In [None]:
from sklearn.manifold import TSNE, Isomap, MDS
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# setting device on GPU if available, else CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

cfg= {'batch_size':8,'prefix_len':5,'embed_size_per_token':300,'speaker_size':106,'epoch':12,'encoded':'encoded_st_p_ids','freeze_gpt':False}
name_col= 'st_p' # st_p, parl_part, encoded_bioguide_ids

model = PrefixNet(cfg)

model_name=f'24may_prefix_tuning_stp_prlen{cfg["prefix_len"]}_embsize{cfg["embed_size_per_token"]}_speaksize{cfg["speaker_size"]}_maxseqlen256_batch8_8_epoch{cfg["epoch"]}.pt'
checkpoint = torch.load('/cluster/scratch/goezsoy/nlp_lss_checkpoints/'+model_name,map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

X = model.embedding_layer.weight.detach().numpy()


X_embedded_pca = PCA(n_components=2).fit_transform(X)
X_embedded_tsne = TSNE(n_components=2).fit_transform(X)
X_embedded_mds = MDS(n_components=2).fit_transform(X)
X_embedded_isomap = Isomap(n_components=2).fit_transform(X)

speaker_labels = np.arange(0,cfg['speaker_size'])
processed_df = pd.read_csv('/cluster/scratch/goezsoy/nlp_lss_datasets/processed_df.csv')
meta_df= processed_df[processed_df[cfg['encoded']].isin(speaker_labels)].drop_duplicates(subset=cfg['encoded'])

#first plot
group = meta_df.sort_values(by=cfg['encoded'],ascending=True, inplace=False).term_party.values
cdict = {'Democrat': 'blue', 'Republican': 'red', 'Independent':'yellow'}

fig, ax = plt.subplots(figsize=(12,12))

for g in np.unique(group):
    ix = np.where(group == g)
    ax.scatter(X_embedded_tsne[ix,0], X_embedded_tsne[ix,1], c = cdict[g], label = g, s = 100)

for i, idx in enumerate(list(speaker_labels)):
    ax.annotate(meta_df[meta_df[cfg['encoded']]==idx][name_col].values[0], (X_embedded_tsne[i,0], X_embedded_tsne[i,1]))
ax.legend()
ax.set_title('tsne')

plt.show()

#second plot
fig2, ax2 = plt.subplots(figsize=(12,12))

for g in np.unique(group):
    ix = np.where(group == g)
    ax2.scatter(X_embedded_pca[ix,0], X_embedded_pca[ix,1], c = cdict[g], label = g, s = 100)

for i, idx in enumerate(list(speaker_labels)):
    ax2.annotate(meta_df[meta_df[cfg['encoded']]==idx][name_col].values[0], (X_embedded_pca[i,0], X_embedded_pca[i,1]))
ax2.legend()
ax2.set_title('pca')

plt.show()


#third plot
fig3, ax3 = plt.subplots(figsize=(12,12))

for g in np.unique(group):
    ix = np.where(group == g)
    ax3.scatter(X_embedded_mds[ix,0], X_embedded_mds[ix,1], c = cdict[g], label = g, s = 100)

for i, idx in enumerate(list(speaker_labels)):
    ax3.annotate(meta_df[meta_df[cfg['encoded']]==idx][name_col].values[0], (X_embedded_mds[i,0], X_embedded_mds[i,1]))
    
ax3.legend()
ax3.set_title('mds')

plt.show()

#fourth plot
fig4, ax4 = plt.subplots(figsize=(12,12))

for g in np.unique(group):
    ix = np.where(group == g)
    ax4.scatter(X_embedded_isomap[ix,0], X_embedded_isomap[ix,1], c = cdict[g], label = g, s = 100)

for i, idx in enumerate(list(speaker_labels)):
    ax4.annotate(meta_df[meta_df[cfg['encoded']]==idx][name_col].values[0], (X_embedded_isomap[i,0], X_embedded_isomap[i,1]))
    
ax4.legend()
ax4.set_title('isomap')

plt.show()

In [25]:
# SAVE

# add perplexity score for each real sentence
import pandas as pd
import re

max_word_count = 256

valid_df = pd.read_csv('/cluster/scratch/goezsoy/nlp_lss_datasets/toy_processed_df_valid.csv')
temp_valid_df = valid_df.iloc[:100]


real_fake_df = pd.DataFrame(columns=['speech','label','perplexity'])

real_fake_df['speech'] = temp_valid_df['speech'].map(lambda row: ' '.join(row.split()[:max_word_count]))
real_fake_df['label'] = 1

generated_texts_path = '/cluster/home/goezsoy/K2T/results/50_keywordsets_eval/finetunedgptmed_valid100_trick/Result_w_5.0_nBeams_1_nGenSent_256_nWordsPerSent_1_topP_0.9_WC_glove_maxSENTENCES.txt'

file = open(generated_texts_path, 'r')

temp_speech = None
temp_perplexity = None

flag_speech = False
flag_perplexity = False

for line in file:
    if line != '\n' and re.search("\ASuccess_rate:", line) is None and re.search("#.:", line) is None:
        if re.search("\APerplexity:", line) is not None:
            temp_perplexity = line.split()[-1]
            flag_perplexity = True
        else:
            # remove <|endoftext|> tokens generated by k2t
            temp_speech = line.replace('<|endoftext|>','')

            # if initial text is <|endoftext|> removing it leads to
            # extra space at the start of sentence, so remove it
            if temp_speech[0] == ' ':
                temp_speech = temp_speech[1:]
            flag_speech = True
    
    if flag_speech and flag_perplexity:
        temp_df = pd.DataFrame.from_dict({'speech':[temp_speech],'label':[0],'perplexity':[temp_perplexity]})
        real_fake_df = pd.concat([real_fake_df,temp_df], ignore_index=True)

        flag_speech = False
        flag_perplexity = False

file.close()
real_fake_df.to_csv('/cluster/scratch/goezsoy/nlp_lss_datasets/real_fake_df.csv', index=False)

In [1]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from datasets import Dataset
from transformers import DataCollatorWithPadding
from sklearn.model_selection import train_test_split

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-cased")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifi

In [2]:
def tokenize_function(row):

    tokenizer_dict = tokenizer(row['speech'])
    tokenizer_dict['labels'] = row['label']

    return {**tokenizer_dict}


def prepare_dataloader(df):

    dataset = Dataset.from_pandas(df)

    dataset = dataset.map(tokenize_function, batched=True)

    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # for dynamic padding
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer,return_tensors='pt')

    dataloader = torch.utils.data.DataLoader(dataset,collate_fn=data_collator, batch_size=32, shuffle=True, drop_last=False)

    return dataloader

In [5]:
real_fake_df = pd.read_csv('/cluster/scratch/goezsoy/nlp_lss_datasets/real_fake_df.csv')

real_fake_df['speech'] = real_fake_df['speech'].map(lambda row: row.lower())

# train vs test split
X_train_valid, X_test, _, _ = train_test_split(real_fake_df.index, real_fake_df['label'], test_size=0.2, random_state=0, stratify=real_fake_df['label'])

real_fake_train_valid_df = real_fake_df.iloc[X_train_valid].reset_index(drop=True)
real_fake_test_df = real_fake_df.iloc[X_test].reset_index(drop=True)

# train vs valid split
X_train, X_valid, _, _ = train_test_split(real_fake_train_valid_df.index, real_fake_train_valid_df['label'], test_size=0.1, random_state=0, stratify= real_fake_train_valid_df['label'])

real_fake_train_df = real_fake_train_valid_df.iloc[X_train].reset_index(drop=True)
real_fake_valid_df = real_fake_train_valid_df.iloc[X_valid].reset_index(drop=True)


train_dataloader = prepare_dataloader(real_fake_train_df)
valid_dataloader = prepare_dataloader(real_fake_valid_df)
test_dataloader = prepare_dataloader(real_fake_test_df)

  0%|          | 0/1 [00:00<?, ?ba/s]Token indices sequence length is longer than the specified maximum sequence length for this model (588 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 1/1 [00:00<00:00,  1.05ba/s]
100%|██████████| 1/1 [00:00<00:00, 24.23ba/s]
100%|██████████| 1/1 [00:00<00:00,  7.65ba/s]


In [6]:
valid_dataloader

<torch.utils.data.dataloader.DataLoader at 0x2afda3c48be0>

In [7]:
for b in valid_dataloader:
    print(b)
    break

{'input_ids': tensor([[ 101,  119, 1231,  ...,    0,    0,    0],
        [ 101, 1191,  178,  ...,    0,    0,    0],
        [ 101,  119,  182,  ...,    0,    0,    0],
        ...,
        [ 101,  178, 3606,  ...,    0,    0,    0],
        [ 101,  119,  182,  ..., 1106, 2936,  102],
        [ 101,  119,  182,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0])}


In [22]:
real_fake_train_df[['speech','label']].values

array([[". mr. speaker, i yield 2 minutes to the gentleman from texas (mr. poe), the distinguished republican from the second congressional district of texas.   investments in america for workers are another way that we can help create jobs for people across the nation, and today we will again offer in this chamber an important investment in america in the fields of science and technology that are not being fully invested in america and that are mexico's property to do with doing and how we do things in their country and how we take them for ourselves. we are once again investing in our scientists and our technology and our educators who have this great opportunity to make a difference and actually help michelle, who was one of these scientists at culver laboratories, in making her life better and better so that there are jobs here in our country and, hopefully, in her country murderers won't need to come out of the woodwork to terrorize the people who are being victimized here in the 

In [19]:
ground_truth = b['labels']

logits = model(**b).logits
preds = torch.argmax(logits,axis=1)

In [20]:
from sklearn.metrics import accuracy_score

accuracy_score(ground_truth.numpy(), preds.numpy())

0.5

In [24]:
mnp.c

AttributeError: 'numpy.ndarray' object has no attribute 'append'