# Setup Notebook

## Mount to Drive

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Install dependencies

In [2]:
! pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.6-py3-none-any.whl (249 kB)
[?25l[K     |█▎                              | 10 kB 20.6 MB/s eta 0:00:01[K     |██▋                             | 20 kB 23.5 MB/s eta 0:00:01[K     |████                            | 30 kB 11.3 MB/s eta 0:00:01[K     |█████▎                          | 40 kB 8.7 MB/s eta 0:00:01[K     |██████▋                         | 51 kB 4.7 MB/s eta 0:00:01[K     |████████                        | 61 kB 5.6 MB/s eta 0:00:01[K     |█████████▏                      | 71 kB 5.7 MB/s eta 0:00:01[K     |██████████▌                     | 81 kB 5.5 MB/s eta 0:00:01[K     |███████████▉                    | 92 kB 6.2 MB/s eta 0:00:01[K     |█████████████▏                  | 102 kB 5.3 MB/s eta 0:00:01[K     |██████████████▌                 | 112 kB 5.3 MB/s eta 0:00:01[K     |███████████████▉                | 122 kB 5.3 MB/s eta 0:00:01[K     |█████████████████               | 133 kB 5

## Load Imports

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from simpletransformers.classification import ClassificationModel

# Configuration File

In [2]:
! nvidia-smi

Thu Mar 24 16:34:08 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P8    33W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [41]:
config = {'data': {'subset': 0.1,
                   'split': 0.8,
                   'columns': ['Headline', 'articleBody', 'Stance'], # ['Headline', 'articleBody', 'related']
                  },
    
          'training':{'learning_rate':1e-5,
                      'num_train_epochs': 5,
                      'reprocess_input_data': True,
                      'overwrite_output_dir': True,
                      'process_count': 10,
                      'evaluate_during_training_steps': False,
                      'train_batch_size': 32,
                      'eval_batch_size': 4,
                      'max_seq_length': 512,
                      'fp16': True,
                      'early_stopping': True
                      }
}

# Train Model

## Load Data

In [42]:
def read_data(path: str, name: str):
    '''Reads csv file
    
    Args:
        path (str): parent directory to file
        name (str): type of csv to load (train or test)
        
    Returns:
        pandas.core.frame.DataFrame of joined bodies and stances        
    '''
    df = pd.read_csv(f'{path}/{name}.csv')
    return df

In [43]:
def extract_columns(df: pd.core.frame.DataFrame, columns: list = ['Headline', 'articleBody', 'Stance'],
                    new_columns: list = ['text_a', 'text_b', 'labels']):
    '''Load certain columns of dataframe and transform to desired format
    
    Args:
        df (pd.core.frame.DataFrame): DataFrame of extracted columns
        columns (list): Column names to extract
        new_columns (list): New column names
    
    Returns:
        Extracted dataframe with new columns
    '''
    processed_df = df[columns]
    processed_df.columns = new_columns
    return processed_df

In [44]:
def encod_labels(df):
    '''Encod label strings to ints
    
    Args:
        df (pd.core.frame.DataFrame): DataFrame to encod labels
    
    Returns:
        df (pd.core.frame.DataFrame) with encoded labels
    '''
    labels = df['labels'].unique()
    label_dict = {}
    
    for i, l in enumerate(labels):
        label_dict[l] = i
    
    encoded_df = df.replace({"labels": label_dict})
    return encoded_df, label_dict

In [45]:
def load_data(path: str, name: str, config: dict):
    '''Read and process csv to desired format
    
    Args:
        path (str): parent directory to file
        name (str): type of csv to load (train or test)
        config (dict): configuration of data loading parameters
    
    Returns:
        Preprocessed data
    '''
    df = read_data(path, name)
    processed_df = extract_columns(df, config['columns'])
    encoded_df, l2e = encod_labels(processed_df)

    if name == 'train':
      print(int(len(encoded_df) * config['subset']))
      encoded_df = encoded_df.sample(int(len(encoded_df) * config['subset']))
      mask = np.random.rand(len(encoded_df)) < config['split']

      train = encoded_df[mask]
      val = encoded_df[~mask]
      return train, val, l2e

    return encoded_df, l2e

In [46]:
path = Path('gdrive/MyDrive/Dataset/msci/project/')

In [47]:
! ls {path}

test.csv  train.csv


In [48]:
train, val, l2e = load_data(path, 'train', config['data'])
test, _ = load_data(path, 'test', config['data'])
print(l2e)

4997
{'unrelated': 0, 'agree': 1, 'disagree': 2, 'discuss': 3}


In [49]:
print(f'{len(train)+len(val)}, {len(train)}, {len(val)}, {len(test)}')

4997, 4017, 980, 25413


In [50]:
train.head()

Unnamed: 0,text_a,text_b,labels
2203,Boko Haram denies ceasefire claim by Nigeria's...,Networks will do anything to squeeze in more c...,0
37809,Canada probes Michael Zehaf-Bibeau as possible...,"In case you hadn’t heard, a little movie calle...",0
33156,The rats eat Prada at Vogue NY office,(CNN) -- A company whose video chat service ap...,0
23434,NYC High School Student Nicknamed ‘Teen Wolf’ ...,Christian Bale will slip into a mock turtlenec...,0
12615,Kim Jong-Un 'Has Difficulty Walking And Needs ...,BEIJING/SEOUL (Reuters) - North Korean leader ...,3


In [51]:
test.head()

Unnamed: 0,text_a,text_b,labels
0,Ferguson riots: Pregnant woman loses eye after...,A RESPECTED senior French police officer inves...,0
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,Dave Morin's social networking company Path is...,0
2,A Russian Guy Says His Justin Bieber Ringtone ...,A bereaved Afghan mother took revenge on the T...,0
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",Hewlett-Packard is officially splitting in two...,0
4,Argentina's President Adopts Boy to End Werewo...,An airline passenger headed to Dallas was remo...,0


## Training Step

In [52]:
model = ClassificationModel('roberta', 'roberta-base', num_labels=len(l2e), args=config['training'])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [16]:
model.train_model(train)

  0%|          | 0/49972 [00:00<?, ?it/s]

Process ForkPoolWorker-28:
Process ForkPoolWorker-23:
Process ForkPoolWorker-24:
Process ForkPoolWorker-19:
Process ForkPoolWorker-26:
Process ForkPoolWorker-27:
Traceback (most recent call last):
Process ForkPoolWorker-25:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.ru

KeyboardInterrupt: ignored