In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

# Work from the project folder on Drive: the relative paths used later in this
# notebook (./datasets, ./log_test_trainer, ./saved_model) resolve from here.
%cd /content/drive/My\ Drive/Text-Mining-Code
# Kept for reference — presumably the one-time download of the dataset repository (confirm):
# !git clone https://github.com/Smolky/hahackathon-2021
# %cd hahackathon-2021/datasets/
# NOTE(review): versions are not pinned. The recorded run below resolved to
# transformers 4.27.4 and evaluate 0.4.0; newer releases may behave differently.
!pip install torch transformers evaluate

Mounted at /content/drive
/content/drive/My Drive/Text-Mining-Code
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

## EDA

In [None]:
import pandas as pd

# Load the training split (path is relative to the current working directory).
df = pd.read_csv("./datasets/hahackathon_train.csv")
# Bare last expression: the notebook renders a preview of the frame.
df

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating
0,1,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.20
1,2,A man inserted an advertisement in the classif...,1,2.50,1.0,1.10
2,3,How many men does it take to open a can of bee...,1,1.95,0.0,2.40
3,4,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.00
4,5,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.10
...,...,...,...,...,...,...
7995,7996,Lack of awareness of the pervasiveness of raci...,0,,,0.25
7996,7997,Why are aspirins white? Because they work sorry,1,1.33,0.0,3.85
7997,7998,"Today, we Americans celebrate our independence...",1,2.55,0.0,0.00
7998,7999,How to keep the flies off the bride at an Ital...,1,1.00,0.0,3.00


In [None]:
# Total number of rows in the training split.
len(df.index)

8000

In [None]:
# Number of rows that have no missing value in any column.
n_complete_rows = int(df.notna().all(axis=1).sum())
print(n_complete_rows)

# Build the zero-filled frame under a new name instead of mutating `df` in place.
# An in-place fill makes this cell non-idempotent (re-running it would report
# every row as complete) and silently changes what later cells compute on `df`.
df_filled = df.fillna(0)

4932


In [None]:
# Relative frequency of each humor_controversy value
# (value_counts skips missing entries by default).
df['humor_controversy'].value_counts(normalize=True)

0.0    0.500203
1.0    0.499797
Name: humor_controversy, dtype: float64

1. Analyze the data.
2. PyTorch & Hugging Face Transformers.
3. Re-implement the models from a few (2–3) papers.
4. Formulate our own approach, based on the papers we read.
5. Implement the model → train → test and calculate metrics.

## Datasets

In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset

class HumourDataset(Dataset):
  """Map-style dataset backed by a CSV file loaded into a pandas DataFrame.

  Args:
    csv_path: Path of the CSV file to load.
    columns: What ``__getitem__`` serves: a single column name, a list of
      column names, or ``None`` (in which case ``__getitem__`` returns None).
    fill_nan: If true, replace missing values with 0.
    drop_nan: If true, drop every row that contains a missing value. This is
      applied before ``fill_nan``.

  Raises:
    KeyError: If a requested column does not exist in the CSV file.
  """

  def __init__(self, csv_path, 
               columns=None, fill_nan=False, drop_nan=False):
    frame = pd.read_csv(csv_path)
    if drop_nan:
      frame = frame.dropna()
    if fill_nan:
      frame = frame.fillna(0)
    self.df = frame
    self.columns = columns
    # Fail fast: surface a misspelled column at construction time rather than
    # on the first item access (possibly after expensive work in a subclass).
    self._column_locs()

  def _column_locs(self):
    """Return the positional index (or list of indices) of the selected column(s), or None."""
    if self.columns is None:
      return None
    # Only lists are treated as "several columns"; any other value (including
    # a tuple) is looked up as one column key, exactly as before.
    if isinstance(self.columns, list):
      return [self.df.columns.get_loc(name) for name in self.columns]
    return self.df.columns.get_loc(self.columns)

  def __len__(self):
    """Return the number of rows kept after the optional NaN handling."""
    return len(self.df.index)

  def __getitem__(self, idx):
    """Return the selected column value(s) at position(s) ``idx``.

    ``idx`` is positional (``iloc`` semantics), so integers and slices both
    work. Returns None when no column was selected.
    """
    locs = self._column_locs()
    if locs is None:
      return None
    return self.df.iloc[idx, locs]


In [None]:
from transformers import AutoTokenizer

class HumourTransformerDataset(HumourDataset):
  """HumourDataset whose items are tokenizer encodings of the text column.

  The CSV is loaded with missing values replaced by 0. Every text is
  tokenized once at construction (truncated, padded to the maximum length).
  When ``label_column`` is given, each item also carries a binary ``labels``
  entry.

  Args:
    transformer_name: Name or path handed to ``AutoTokenizer.from_pretrained``.
    csv_path: Path of the CSV file to load.
    label_column: Column holding the label, or None for unlabeled data.
    text_column: Column holding the raw text.
  """

  def __init__(self, transformer_name, csv_path, label_column=None, text_column="text"):
    super().__init__(csv_path, label_column, fill_nan=True)
    self.label_column = label_column

    texts = self.df[text_column].tolist()
    self.tokenizer = AutoTokenizer.from_pretrained(transformer_name)
    self.encodings = self.tokenizer(texts, truncation=True, padding="max_length")

  def __len__(self):
    """Return the number of rows, as reported by the base dataset."""
    return super().__len__()

  def __getitem__(self, idx):
    """Return the encoding tensors of item ``idx``, plus ``labels`` if labeled."""
    sample = {}
    for field, values in self.encodings.items():
      sample[field] = torch.tensor(values[idx])

    if self.label_column is None:
      return sample

    raw_label = super().__getitem__(idx)
    # Binary target: 1 only when the stored label truncates to exactly 1.
    sample['labels'] = 1 if int(raw_label) == 1 else 0
    return sample


## Transformer

In [None]:
import numpy as np
import evaluate

# Accuracy metric loaded through the `evaluate` library; compute_metrics below uses it.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each sample is the arg-max over the last axis of its logits. Returns
    whatever ``metric.compute`` returns for those predictions and labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

transformer_name = "bert-base-cased"
label_column = "is_humor"
train_csv = "./datasets/hahackathon_train.csv"
eval_csv = "./datasets/hahackathon_dev.csv"

# Binary classifier head on top of the pretrained encoder.
model = AutoModelForSequenceClassification.from_pretrained(transformer_name, num_labels=2)
training_args = TrainingArguments(output_dir="./log_test_trainer",
                                  per_device_train_batch_size=16,
                                  evaluation_strategy="epoch")

train_dataset = HumourTransformerDataset(
    transformer_name,
    train_csv,
    label_column)

# Evaluation items need a "labels" entry: without it the Trainer has nothing to
# compute a validation loss from and compute_metrics is never called (the
# "Validation Loss" column stays at "No log"). Use the label column whenever
# the dev file provides it; otherwise fall back to unlabeled evaluation.
eval_has_labels = label_column in pd.read_csv(eval_csv, nrows=0).columns
if not eval_has_labels:
    print(f"Warning: '{label_column}' not found in {eval_csv}; "
          "validation loss and accuracy will not be reported.")

eval_dataset = HumourTransformerDataset(
    transformer_name,
    eval_csv,
    label_column if eval_has_labels else None)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
1,0.27,No log
2,0.1036,No log
3,0.0286,No log


TrainOutput(global_step=1500, training_loss=0.1340438067118327, metrics={'train_runtime': 2221.1234, 'train_samples_per_second': 10.805, 'train_steps_per_second': 0.675, 'total_flos': 6314665328640000.0, 'train_loss': 0.1340438067118327, 'epoch': 3.0})

In [None]:
# Persist the fine-tuned model to ./saved_model/ (relative to the working directory).
# NOTE(review): no tokenizer is passed to the Trainer, so presumably only the
# model is written here — confirm before loading a tokenizer from this folder.
trainer.save_model("./saved_model/")

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Uses the fine-tuned `model` from the training cell. To run this cell in a fresh
# kernel, reload the saved weights first:
# model = AutoModelForSequenceClassification.from_pretrained("./saved_model/", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

test_data = pd.read_csv("./datasets/hahackathon_test.csv")
texts = test_data["text"].tolist()

batch_size = 32
device = next(model.parameters()).device  # run inference wherever the model already lives
model.eval()  # disable dropout so predictions are deterministic

predictions = []
with torch.no_grad():  # inference only: no gradient bookkeeping
    for start in range(0, len(texts), batch_size):
        # The model expects tensors passed as keyword arguments, not a
        # BatchEncoding of Python lists passed positionally.
        batch = tokenizer(
            texts[start:start + batch_size],
            truncation=True,
            padding=True,  # pad to the longest text of the batch only
            return_tensors="pt",
        ).to(device)
        logits = model(**batch).logits
        predictions.extend(logits.argmax(dim=-1).tolist())

test_data["predictions"] = predictions
test_data

AttributeError: ignored