In [None]:
# Mount Google Drive at the relative path 'drive'; the checkpoint and CSV paths used
# further down all start with 'drive/MyDrive/'.
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [None]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 8.1 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 73.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 64.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 44.6 MB/s 
Collecting aiohttp
  Downlo

In [None]:
import torch
from transformers import BertModel, BertConfig, PretrainedConfig, PreTrainedModel, AutoModel, AutoConfig
from typing import List, Optional, Tuple, Union
from transformers.modeling_outputs import TokenClassifierOutput,SequenceClassifierOutput
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss, BCELoss
import torch.nn as nn

# weight = torch.FloatTensor(compute_class_weight(class_weight="balanced", classes=[0, 1], y=df_train["truthClass"])).to('cpu')
# WEIGHT = torch.cuda.FloatTensor([weight[1]/weight[0]])

class ClickbaitConfig(PretrainedConfig):
    """Settings for the clickbait classifier: which encoder to use and how the head is sized.

    Attributes (each constructor argument is stored under the same name):
        model_type: model family identifier, "bert" by default.
        pretrained_model: name or path handed to the Auto* loaders for the encoder.
        dropout: dropout probability used by the classification head.
        inner_dim1: width of the first hidden layer of the head.
        inner_dim2: width of the second hidden layer of the head.
        max_length: stored on the config; BertClickbaitClassifier does not read it
            as far as this file shows.
        load_pretrained: flag read by BertClickbaitClassifier when building the encoder.
        freeze_bert: when true, BertClickbaitClassifier disables gradients on the encoder.

    ``num_labels`` and any additional keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    def __init__(
        self,
        model_type: str = "bert",
        pretrained_model: str = "bert-base-uncased",
        num_labels: int = 2,
        dropout: float = 0.1,
        inner_dim1: int = 256,
        inner_dim2: int = 32,
        max_length: int = 512,
        load_pretrained: bool = True,
        freeze_bert: bool = True,
        **kwargs
    ):
        super().__init__(num_labels=num_labels, **kwargs)

        # Set after the base initialiser has run, in a fixed order.
        own_settings = {
            "model_type": model_type,
            "pretrained_model": pretrained_model,
            "dropout": dropout,
            "inner_dim1": inner_dim1,
            "inner_dim2": inner_dim2,
            "max_length": max_length,
            "load_pretrained": load_pretrained,
            "freeze_bert": freeze_bert,
        }
        for field_name, field_value in own_settings.items():
            setattr(self, field_name, field_value)


class BertClickbaitClassifier(PreTrainedModel):
    """
    Transformer encoder followed by a small MLP head that scores a text as clickbait.

    Taken and extended from BertforSequenceClassification : https://github.com/huggingface/transformers/blob/v4.19.2/src/transformers/models/bert/modeling_bert.py#L1508

    The head reads the encoder's hidden state at sequence position 0 and applies
    dropout -> linear -> ReLU -> dropout -> linear -> ReLU -> dropout -> linear -> sigmoid.
    Because of the final sigmoid, the values exposed as ``logits`` are already
    squashed into [0, 1]; they are not raw logits.
    """
    config_class = ClickbaitConfig

    def __init__(self, config: ClickbaitConfig):
        """Build the encoder named by ``config.pretrained_model`` and the classification head.

        Args:
            config: a ClickbaitConfig. ``load_pretrained`` selects between loading the
                encoder's pretrained weights and instantiating it from its configuration
                only; ``freeze_bert`` disables gradients for every encoder parameter.
        """
        super(BertClickbaitClassifier, self).__init__(config)
        self.num_labels = config.num_labels
        self.config = config
        self.bert_config = AutoConfig.from_pretrained(config.pretrained_model)

        # Load the encoder exactly once. Previously the weights were fetched with
        # from_pretrained unconditionally and then a second time when load_pretrained
        # was set, so the flag had no effect other than doubling the load time.
        if config.load_pretrained:
            print("Load pretrained weights from {}".format(config.pretrained_model))
            self.bert = AutoModel.from_pretrained(config.pretrained_model, config=self.bert_config)
        else:
            # Architecture only; weights are left at their initial values.
            self.bert = AutoModel.from_config(self.bert_config)

        if config.freeze_bert:
            print("Freeze weights in the BERT model. Just the classifier will be trained")
            for param in self.bert.parameters():
                param.requires_grad = False

        # Attribute names are part of the checkpoint's state-dict keys; keep them stable.
        self.linear_1 = nn.Linear(self.bert.config.hidden_size, config.inner_dim1)
        self.dropout_1 = nn.Dropout(config.dropout)
        self.relu_1 = nn.ReLU()
        self.dropout_2 = nn.Dropout(config.dropout)
        self.linear_2 = nn.Linear(config.inner_dim1, config.inner_dim2)
        self.relu_2 = nn.ReLU()
        self.dropout_3 = nn.Dropout(config.dropout)
        self.classifier = nn.Linear(config.inner_dim2, config.num_labels)
        self.sigmoid = nn.Sigmoid()


    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        Run the encoder and the head, optionally computing a binary cross-entropy loss.

        All arguments except ``labels`` are forwarded unchanged to the encoder.

        labels (`torch.Tensor`, *optional*):
            Targets for the loss. They are cast to the dtype of the predictions and
            flattened; the sigmoid outputs are flattened the same way and compared
            with `BCELoss`, so `labels` must hold one value in `[0, 1]` per output
            unit. NOTE(review): with `config.num_labels > 1` a `(batch_size,)` label
            tensor will not match the flattened predictions - confirm the intended
            shape before training with more than one output.

        Returns:
            If `return_dict` resolves to false: a tuple `(loss?, logits, *encoder_outputs[2:])`.
            Otherwise a `SequenceClassifierOutput` with `loss`, `logits` (sigmoid
            outputs) and, when the encoder produced them, `hidden_states` and
            `attentions`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First element of the encoder output, taken at sequence position 0.
        output = outputs[0][:,0,:]

        x = self.dropout_1(output)
        x = self.linear_1(x)
        x = self.relu_1(x)
        x = self.dropout_2(x)
        x = self.linear_2(x)
        x = self.relu_2(x)
        x = self.dropout_3(x)

        logits = self.classifier(x)
        logits = self.sigmoid(logits)

        loss = None
        if labels is not None:
            # loss_fct = BCELoss(weight=WEIGHT)
            loss_fct = BCELoss()
            # BCELoss needs floating-point targets of the same dtype as its input.
            targets = labels.to(dtype=logits.dtype)
            loss = loss_fct(logits.view(-1), targets.view(-1))
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # Mirror the tuple path, which also forwards the encoder's extra outputs.
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=getattr(outputs, "hidden_states", None),
            attentions=getattr(outputs, "attentions", None),
        )

In [None]:
from transformers import AutoModel,AutoTokenizer
classifier_model_path = "drive/MyDrive/nlp_lss_data/mpnet_clickbait_classification_maxlen25/checkpoint-1464"
device='cuda'
#Loading classifier
# `load_best_model_at_end` is a TrainingArguments option, not a tokenizer option, so it is
# no longer passed to the tokenizer loader.
classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_path)
classifier_model = BertClickbaitClassifier.from_pretrained(classifier_model_path).to(device)
# The model is only used for scoring below; make sure dropout is disabled.
classifier_model.eval()


You passed along `num_labels=2` with an incompatible id to label map: {'0': 'LABEL_0'}. The number of labels wil be overwritten to 1.


Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Load pretrained weights from sentence-transformers/all-mpnet-base-v2


In [None]:
from datasets import load_dataset,DatasetDict,Dataset
# from datasets import 
from transformers import AutoTokenizer
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight
import torch
import pandas as pd 
webis_train = "https://ml-coding-test.s3.eu-west-1.amazonaws.com/webis_train.csv"
webis_test = "https://ml-coding-test.s3.eu-west-1.amazonaws.com/webis_test.csv"
df_train = pd.read_csv(webis_train)
df_test = pd.read_csv(webis_test)

# Filter only important variables
# df_train = df_train[['postText', 'truthClass']]
# df_test = df_test[['postText', 'truthClass']]


# Encode the class labels as integers. pd.factorize numbers values in order of first
# appearance, so factorizing train and test separately can assign the same integer to
# different classes. Build the label vocabulary from the training split only and reuse
# it for the test split (values unseen in training map to -1).
train_codes, truth_class_labels = pd.factorize(df_train['truthClass'])
df_train['truthClass'] = train_codes
df_test['truthClass'] = truth_class_labels.get_indexer(df_test['truthClass'])

In [None]:
def rate_title(input_text, model, tokenizer, device='cuda'):
  """Score one title with the classifier and return the score as a float.

  Args:
    input_text: mapping with the keys 'postText' and 'truthClass'; both are read by
      preprocess_function_title_only_classification. The label is not given to the model.
    model: called with the tokenized fields as keyword arguments; its result must expose
      a single-element `.logits` value that can be converted with float().
    tokenizer: tokenizer forwarded to the preprocessing helper.
    device: device the input tensors are moved to before calling the model.

  Returns:
    {'predicted_class': <float score>}
  """
  tokenized_input = preprocess_function_title_only_classification(input_text,tokenizer=tokenizer)
  # Wrap every field in a list to add a batch dimension of one; drop the label entry.
  dict_tokenized_input = {k : torch.tensor([v]).to(device) for k,v in tokenized_input.items() if k != 'labels'}
  # Pure inference: skip building the autograd graph.
  with torch.no_grad():
    predicted_class = float(model(**dict_tokenized_input).logits)

  return {'predicted_class' : predicted_class}

def preprocess_function_title_only_classification(examples,tokenizer=None):
    """Tokenize the 'postText' field and attach 'truthClass' as the label.

    The tokenizer is called with padding="longest", truncation enabled and a
    maximum length of 25. Whatever it returns gets an extra 'labels' entry
    holding examples['truthClass'] and is handed back to the caller.
    """
    encoded = tokenizer(
        examples['postText'],
        padding="longest",
        truncation=True,
        max_length=25,
    )
    encoded['labels'] = examples['truthClass']
    return encoded



In [None]:
# Check that the file read into `df` in a later cell exists on the mounted drive.
!ls drive/MyDrive/nlp_lss_data/df_clickbait_test_condition_lambda_25.0_subset_300

drive/MyDrive/nlp_lss_data/df_clickbait_test_condition_lambda_25.0_subset_300


In [None]:
import pandas as pd

In [None]:
# Load the generated-title table (the path has no .csv suffix but is parsed as CSV).
df = pd.read_csv('drive/MyDrive/nlp_lss_data/df_clickbait_test_condition_lambda_25.0_subset_300')

In [None]:
# Rows whose truthMean is strictly below 0.5.
df.loc[df['truthMean'].lt(0.5)]

Unnamed: 0,original_title,clickbait_title,article_content,truthMean
0,Johnny Manziel Says Top Pick in Draft Myles Ga...,"Former Texas A and Sons ""super bront guy"" ex...","[""Johnny Manziel approves of the Cleveland Bro...",0.000000
1,Fabio: California Is a 'Mess' Because of Liber...,"Fabio, the stars' famous love interest, share...","['Fabio, the Italian-born male model who has a...",0.066667
3,Mitchell Trubisky booed at United Center durin...,Watch the first 48 slides of all the 2017 NFL...,['CHICAGO -- New Bears quarterback Mitchell Tr...,0.266667
4,It's Not Enough to Give Employees Flexible Wor...,While many companies nowguarantee full-calori...,"['Six months after her baby was born, Amanda S...",0.133333
5,"Out Of 15 Lakh Registered Companies In India, ...",The Income-Tax authorities have crackerved as...,['As many as 8-9 lakh registered companies are...,0.200000
...,...,...,...,...
294,The moment Stockholm police catch the third be...,A selection of talking points from the storm ...,['This is the moment Swedish police snared a s...,0.200000
295,MTV gets rid of gender-specific categories for...,MTV has scrapped gender-trepidated prizes suc...,['Viewers of the MTV Movie & TV Awards next mo...,0.000000
296,"Hair Loss, Warts, and Scars: Scientists ID the...","People's skin is surprisingly complex, but so...","['More', 'As a whole, movie villains have a ce...",0.400000
297,88-year-old man honors wife's legacy by making...,Click here to read the full story.6753167531....,['What started as a hobby his wife enjoyed has...,0.133333


In [None]:
# Write the title pairs and their truthMean to a LaTeX table on the mounted drive.
df[['original_title','clickbait_title', 'truthMean']].to_latex('drive/MyDrive/nlp_lss_data/latex_clickbait_test_condition_lambda_25.0_subset_300.txt')

In [None]:
# NOTE(review): `proposed_sentences` is only assigned inside the loops of the cells further
# down, so this cell fails on a fresh kernel unless one of those cells has been run first.
# Inverse character length of each candidate sentence.
np.array([1.0 / len(x) for x in proposed_sentences]) # * np.array([10**i for i in range(11)])

array([9.34579439e-03, 8.84955752e-02, 1.03092784e+00, 1.05263158e+01,
       7.87401575e+01, 7.35294118e+02, 1.06382979e+04, 9.00900901e+04,
       7.75193798e+05, 7.63358779e+06, 1.08695652e+08])

In [None]:
# For each test row, score the title generated under every condition lambda and sample one
# candidate with probability proportional to (rounded classifier score) / (character length).

# condition_lambdas = [0.0, 1.0, 5.0, 10.0, 20.0, 30.0]
condition_lambdas = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] # 50.0, 100.0, 200.0]
subset = 100   # subset tag used in the CSV file names
n_titles = 25  # number of test rows to inspect

# Read every lambda's CSV once instead of once per (row, lambda) pair.
generated_by_lambda = {}
for condition_lambda in condition_lambdas:
  # path_csv = f'drive/MyDrive/nlp_lss_data/df_clickbait_test_lambda_{condition_lambda}_subset_25'
  path_csv = f'drive/MyDrive/nlp_lss_data/df_clickbait_test_pegasus_webis_lambda_{condition_lambda}_subset_{subset}.csv'
  generated_by_lambda[condition_lambda] = pd.read_csv(path_csv)

for i in range(n_titles):
  print('++' * 20)
  # Take the original title from a frame loaded in this cell rather than from whatever
  # `df` happens to be left over from another cell.
  print(df_test.iloc[i]['truthMean'], generated_by_lambda[condition_lambdas[-1]].iloc[i]['original_title'])

  proposed_sentences = []
  proposed_weights = []
  for condition_lambda in condition_lambdas:
    df = generated_by_lambda[condition_lambda]
    candidate_title = df.iloc[i]['clickbait_title']
    score = rate_title(input_text = {
        'postText' : candidate_title,
        'truthClass' : df_test.iloc[i]['truthClass'],
    },
      model = classifier_model,
      tokenizer = classifier_tokenizer,
    )['predicted_class']
    print(round(score,2), '_', condition_lambda, '__', candidate_title) #, df.iloc[24]['clickbait_score'])

    proposed_weights.append(round(score,2))
    proposed_sentences.append(candidate_title)

  # Shorter candidates get a larger weight; max(..., 1) avoids dividing by zero on an empty title.
  sentence_len_weights = np.array([1.0 / max(len(x), 1) for x in proposed_sentences])
  clickbait_scores_combined = np.array(proposed_weights) * sentence_len_weights
  total_score = clickbait_scores_combined.sum()
  if total_score > 0:
    sampling_probs = clickbait_scores_combined / total_score
  else:
    # Every rounded score is 0.0: normalising would produce NaNs, so sample uniformly.
    sampling_probs = np.full(len(proposed_sentences), 1.0 / len(proposed_sentences))
  chosen_index = np.argmax(np.random.multinomial(n=1, pvals=sampling_probs))
  print(chosen_index)
  print(proposed_weights[chosen_index], proposed_sentences[chosen_index])


++++++++++++++++++++++++++++++++++++++++
0.0 Johnny Manziel Says Top Pick in Draft Myles Garrett Will 'Do Wonders' for Browns
0.01 _ 0.0 __  Johnny Manziel  Praises Myles Garrett For the Browns, Honors College Alumn   to Ben & Jerry ...
0.02 _ 1.0 __  Johnny Manziel Says Browns' Myles Garrett 'Will Do Wonders'  That!....
0.01 _ 2.0 __  Johnny Manziel 'Feeling Good' about Myles Garrett, The Browns' 2017 Draft Pick  .
0.01 _ 3.0 __  Johnny Manziel Gets Candid About Myles Garrett and His NFL Pick: 'He'll Do Wonders For The Team'... 
0.01 _ 4.0 __  Johnny Manziel Approved Of Draft Result In Myles Garrett Picking No. 1... Just The Way He Wanted To Be<unk>...
0.01 _ 5.0 __  Johnny Manziel Says Myles Garrett Will Do Wonders for the Cleveland Browns ! And
0.01 _ 6.0 __  Johnny Manziel, Texas A&M Alum, approves of Browns top pick 
0.01 _ 7.0 __  Johnny Manziel Praises Myles Garrett As Cleveland Browns Ridiculous Pick Ex-Borts QB Players,
0.01 _ 8.0 __  Johnny Manziel Has 'Warmed Up' To Myles Ga

# PPLM

In [None]:
# For each test row, score the five perturbed titles generated at every step size and sample
# one candidate with probability proportional to (rounded classifier score) / (character length).
stepsize_array = [0.4, 0.8, 1.2, 1.6, 2.0, 10.0] #, 20.0]
subset = 25          # subset tag used in the CSV file names
n_titles = 25        # number of test rows to inspect
n_perturbations = 5  # columns perturbed_0 .. perturbed_4

# Read every step size's CSV once instead of once per test row.
perturbed_by_stepsize = {
  stepsize: pd.read_csv(f'drive/MyDrive/nlp_lss_data/df_clickbait_test_stepsize_{stepsize}_subset_{subset}.csv')
  for stepsize in stepsize_array
}

for k in range(n_titles):
  print(round(df_test.iloc[k]['truthMean'],3), df_test.iloc[k]['targetTitle'])

  proposed_sentences = []
  proposed_weights = []
  # Step size that produced each candidate (name kept from the original cell).
  proposed_lambdas = []
  for stepsize in stepsize_array:
    df = perturbed_by_stepsize[stepsize]

    print('==' * 20)
    print(f'stepsize : {stepsize}')

    for i in range(n_perturbations):
      pert_str = f'perturbed_{i}'
      candidate_title = df.iloc[k][pert_str]

      # The 'truthClass' value is carried along as the label only; rate_title does not
      # pass it to the model.
      score = rate_title(input_text = {
          'postText' : candidate_title,
          'truthClass' : df_test.iloc[k]['truthMean'],
      },
        model = classifier_model,
        tokenizer = classifier_tokenizer,
      )['predicted_class']

      proposed_weights.append(round(score,2))
      proposed_sentences.append(candidate_title)
      # Record the step size of this candidate. The original appended `condition_lambda`,
      # a leftover variable from another cell, so every candidate was reported as 10.0.
      proposed_lambdas.append(stepsize)

  # Shorter candidates get a larger weight; max(..., 1) avoids dividing by zero on an empty title.
  sentence_len_weights = np.array([1.0 / max(len(x), 1) for x in proposed_sentences])
  clickbait_scores_combined = np.array(proposed_weights) * sentence_len_weights
  total_score = clickbait_scores_combined.sum()
  if total_score > 0:
    sampling_probs = clickbait_scores_combined / total_score
  else:
    # Every rounded score is 0.0: normalising would produce NaNs, so sample uniformly.
    sampling_probs = np.full(len(proposed_sentences), 1.0 / len(proposed_sentences))
  chosen_index = np.argmax(np.random.multinomial(n=1, pvals=sampling_probs))
  print(chosen_index)
  print(proposed_lambdas[chosen_index], proposed_weights[chosen_index], proposed_sentences[chosen_index])
  print('++++' * 10)

0.0 Johnny Manziel Says Top Pick in Draft Myles Garrett Will 'Do Wonders' for Browns
stepsize : 0.4
stepsize : 0.8
stepsize : 1.2
stepsize : 1.6
stepsize : 2.0
stepsize : 10.0
6
10.0 0.01  Johnny Manziel said he was happy for the Browns' top pick during a brief but interview with TMZ.
++++++++++++++++++++++++++++++++++++++++
0.067 Fabio: California Is a 'Mess' Because of Liberal Policies  Insider
stepsize : 0.4
stepsize : 0.8
stepsize : 1.2
stepsize : 1.6
stepsize : 2.0
stepsize : 10.0
19
10.0 0.04  A well-known American model has called out the Democrats for putting the police in a "really, really bad spot" while promoting the state of Los Angeles.--
++++++++++++++++++++++++++++++++++++++++
1.0 Jimmy Butler wants to return, hopes Bulls keep Rajon Rondo
stepsize : 0.4
stepsize : 0.8
stepsize : 1.2
stepsize : 1.6
stepsize : 2.0
stepsize : 10.0
4
10.0 0.01  Chicago Bulls All-Stars Jimmy Butler and Dwyane Wade are both expected to be dealt this summer but Rajon Rondo is expected to be pic