# Setup Notebook

## Install Dependencies

In [None]:
# Install simpletransformers, which provides the ClassificationModel imported below.
! pip install simpletransformers

In [None]:
import os
import time

from contextlib import contextmanager

@contextmanager
def timer(name):
    '''Context manager that reports how long the wrapped block took.

    Args:
        name: label shown in square brackets in the printed message

    The message is printed only when the wrapped block finishes without
    raising; the elapsed wall-clock time is rounded to whole seconds.
    '''
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.0f} s')

# Opt-in switch: set to True to clone, build and import NVIDIA Apex in this cell.
USE_APEX = False

if USE_APEX:
    with timer('install Nvidia apex'):
        # Installing Nvidia Apex
        # Clones the repository and pip-installs it with the C++ and CUDA extension flags.
        # NOTE: the os.system return codes are not checked, so a failed clone/build
        # only surfaces at the `from apex import amp` line below.
        os.system('git clone https://github.com/NVIDIA/apex; cd apex; pip install -v --no-cache-dir' + 
                  ' --global-option="--cpp_ext" --global-option="--cuda_ext" ./')
        os.system('rm -rf apex/.git') # too many files, Kaggle fails
        # Imported only on this branch; `amp` is not referenced by the other visible cells.
        from apex import amp

## Load Imports

In [None]:
import numpy as np
import pandas as pd
import sklearn

from pathlib import Path
from simpletransformers.classification import ClassificationModel

# Configuration File

In [None]:
# Central settings for the notebook: 'data' is passed to load_data, 'model' picks the
# ClassificationModel type/checkpoint, and 'training' is handed to it as `args`.
config = {'data': {'subset': 1,  # fraction of the training rows kept by load_data
                   'split': 1,  # rows whose uniform draw is below this go to train; 1 leaves val empty
                   'columns': ['Headline', 'articleBody', 'Stance'],
                  #  'columns': ['Headline', 'articleBody', 'related'],
                  },
          
          'model': {
                    'model_type': 'roberta',
                    'model_name': 'roberta-large', #roberta-large
#                     'model_type': 'bert',
#                     'model_name': 'bert-base-uncased',
                   },
    
          # NOTE(review): 'evaluate_during_training_steps' is presumably a step count in
          # simpletransformers, and the early-stopping switch is presumably spelled
          # 'use_early_stopping' -- confirm both keys against the installed version.
          'training':{'learning_rate':1e-5,
                      'num_train_epochs': 10,
                      'reprocess_input_data': True,
                      'overwrite_output_dir': True,
                      'process_count': 10,
                      'evaluate_during_training_steps': False,
                      'train_batch_size': 8,
                      'eval_batch_size': 8,
                      'max_seq_length': 512,
                      'fp16': False,
                      'early_stopping': True,
                      'save_steps': -1
                      }
}

In [None]:
def read_data(path: str, name: str):
    '''Load `<path>/<name>.csv` into a DataFrame.

    Args:
        path (str): directory that contains the csv file
        name (str): file name without the `.csv` extension (train or test)

    Returns:
        pandas.core.frame.DataFrame holding the contents of the csv file
    '''
    csv_file = f'{path}/{name}.csv'
    return pd.read_csv(csv_file)

In [None]:
def extract_columns(df: pd.core.frame.DataFrame, columns: list = ['Headline', 'articleBody', 'Stance'],
                    new_columns: list = ['text_a', 'text_b', 'labels']):
    '''Select a subset of columns and give them new names.

    Args:
        df (pd.core.frame.DataFrame): DataFrame to take the columns from
        columns (list): names of the columns to keep, in output order
        new_columns (list): replacement names, matched to `columns` by position

    Returns:
        New DataFrame holding only the selected columns under their new names;
        the input frame keeps its original column labels
    '''
    # Selecting with a list yields a separate frame, so relabelling it
    # does not touch the caller's DataFrame.
    renamed = df[columns]
    renamed.columns = new_columns
    return renamed

In [None]:
# def encod_labels(df: pd.core.frame.DataFrame, label_dict: dict):
#     '''Encode label strings to ints
    
#     Args:
#         df (pd.core.frame.DataFrame): DataFrame whose labels are encoded
#         label_dict (dict): label to encoding dictionary
    
#     Returns:
#         df (pd.core.frame.DataFrame) with encoded labels
#     '''
#     encoded_df = df.replace({"labels": label_dict})
#     return encoded_df

In [None]:
def load_data(path: str, name: str, l2e:dict, config: dict):
    '''Read a csv, keep the configured columns and encode the labels.

    Args:
        path (str): parent directory to file
        name (str): type of csv to load (train or test)
        l2e (dict): mapping from label string to integer encoding, applied
            to the `labels` column
        config (dict): data loading parameters. Reads `columns` always, and
            for train also `subset` (fraction of rows to keep), `split`
            (per-row probability of landing in the training part) and the
            optional `seed` (int) that makes the subsample and the split
            reproducible

    Returns:
        For name == 'train': a `(train, val)` tuple of DataFrames.
        Otherwise: a single DataFrame with all rows in file order.
    '''
    df = read_data(path, name)
    processed_df = extract_columns(df, config['columns'])
    encoded_df = processed_df.replace({"labels": l2e})

    if name != 'train':
        return encoded_df

    # Without a seed the global numpy random state is used, exactly as before;
    # with a seed one dedicated generator drives both the subsample and the split.
    seed = config.get('seed')
    rng = None if seed is None else np.random.RandomState(seed)

    n_rows = int(len(encoded_df) * config['subset'])
    sampled = encoded_df.sample(n_rows, random_state=rng)

    draws = np.random.rand(len(sampled)) if rng is None else rng.rand(len(sampled))
    mask = draws < config['split']

    train = sampled[mask]
    val = sampled[~mask]
    return train, val


In [None]:
# Directory the csv files are read from (read_data loads `<path>/<name>.csv`).
path = Path('../input/fnc-1/')

# List the directory contents to check which files are available.
! ls {path}

In [None]:
# Stance label -> integer class id, and the inverse mapping used to decode predictions.
l2e = {'agree': 0, 'disagree':1, 'discuss':2, 'unrelated':3}
e2l = {v:k for k,v in l2e.items()}

# For 'train' load_data returns a (train, val) pair; for any other name a single frame.
train, val = load_data(path, 'train', l2e, config['data'])
test = load_data(path, 'test', l2e, config['data'])

print(l2e, e2l)

In [None]:
# Row counts, in order: train + val, train, val, test.
print(f'{len(train)+len(val)}, {len(train)}, {len(val)}, {len(test)}')

In [None]:
# Preview the first rows of the training frame (columns text_a, text_b, labels).
train.head()

In [None]:
# NOTE(review): this repeats the previous cell; presumably `val` or `test` was
# meant to be previewed here -- confirm.
train.head()

## Training Step

In [None]:
# Build the classifier from the configured type/checkpoint, with one output per
# entry of l2e and the 'training' settings passed as args.
model = ClassificationModel(config['model']['model_type'], config['model']['model_name'], num_labels=len(l2e), args=config['training'])

In [None]:
# Train on the training frame only; `val` is not passed to train_model here.
model.train_model(train)

## Model Evaluation

In [None]:
def process_model_output(model_outputs: np.ndarray, e2l: dict):
    '''Turn per-class model scores into label strings.

    Args:
        model_outputs (np.ndarray): one row of class scores per example
        e2l (dict): mapping from class index to label

    Returns:
        list with the label of the highest-scoring class for every row
    '''
    # Column index of the largest score in each row is the predicted class.
    best_class = np.argmax(model_outputs, axis=1)
    return [e2l[class_id] for class_id in best_class]

In [None]:
# Evaluate on the test frame: the cells below compare these outputs with
# test['labels'] and write them next to the rows of test.csv, so the outputs
# must be test-set predictions in test-file order.
result, model_outputs, wrong_predictions = model.eval_model(test, acc=sklearn.metrics.accuracy_score)

In [None]:
# Display the evaluation metrics returned by eval_model.
result

In [None]:
# Stance names in the insertion order of l2e; used to index the confusion matrix.
LABELS = list(l2e)
LABELS_RELATED = ['unrelated','related']
# The first three labels are treated as the "related" stances by score_submission.
RELATED = LABELS[:3]

In [None]:
def score_submission(gold_labels, test_labels):
    '''Score predicted stances against gold stances and count the confusions.

    Per pair: +0.25 when the stances match, +0.50 more when the matched stance
    is not 'unrelated', and +0.25 when both stances are in RELATED.

    Args:
        gold_labels: iterable of gold stance names
        test_labels: iterable of predicted stance names, paired with
            gold_labels by position

    Returns:
        (score, cm): the total score as a float and a 4x4 list of lists of
        counts indexed as cm[gold][predicted], following the order of LABELS
    '''
    total = 0.0
    matrix = [[0] * 4 for _ in range(4)]

    for gold, guess in zip(gold_labels, test_labels):
        if gold == guess:
            total += 0.25
            if gold != 'unrelated':
                total += 0.50
        if gold in RELATED and guess in RELATED:
            total += 0.25

        matrix[LABELS.index(gold)][LABELS.index(guess)] += 1

    return total, matrix

In [None]:
def print_confusion_matrix(cm):
    '''Print the confusion matrix as a bordered text table.

    The header row lists LABELS; every following row starts with the label of
    that row and then shows its four counts, each cell centred in 11 characters.

    Args:
        cm: list of rows of counts, one row per entry of LABELS
    '''
    row_format = "|{:^11}|{:^11}|{:^11}|{:^11}|{:^11}|"
    header = row_format.format('', *LABELS)
    separator = "-" * len(header)

    lines = [separator, header, separator]
    # The previous hit/total accumulators were never read, so they are dropped.
    for i, row in enumerate(cm):
        lines.append(row_format.format(LABELS[i], *row))
        lines.append(separator)
    print('\n'.join(lines))

In [None]:
def report_score(actual,predicted):
    '''Print the confusion matrix and the score relative to a perfect prediction.

    Args:
        actual: gold stance names
        predicted: predicted stance names, paired with `actual` by position

    Returns:
        The achieved score as a percentage of the score obtained by
        predicting `actual` itself
    '''
    score, cm = score_submission(actual, predicted)
    # Scoring the gold labels against themselves gives the maximum reachable score.
    best_score = score_submission(actual, actual)[0]

    print_confusion_matrix(cm)
    percentage = score*100/best_score
    message = "Score: " + str(score) + " out of " + str(best_score) + "\t(" + str(percentage) + "%)"
    print(message)
    return percentage

In [None]:
# Decode the model outputs and the encoded gold labels of `test` back to stance
# names. model_outputs must come from evaluating the same frame the targets are
# taken from (`test`), row for row, for the comparison below to be meaningful.
prediction = process_model_output(model_outputs, e2l)
targets = test['labels'].map(e2l).tolist()

In [None]:
# report_score expects (actual, predicted): the gold labels go first, so the
# best-possible score and the confusion-matrix rows are based on the gold labels.
report_score(targets, prediction)

## Generate Submission

In [None]:
def save_submission(prediction: list):
    '''Write the predictions to answer.csv in the submission layout.

    Re-reads the test csv from the module-level `path`, keeps the 'Headline'
    and 'Body ID' columns, adds the predictions as a 'Stance' column and saves
    the result without the index.

    Args:
        prediction (list): one stance label per row of the test csv, in file order
    '''
    submission_df = read_data(path, 'test')
    # Take an explicit copy of the column subset so that adding 'Stance' writes
    # to an independent frame (avoids pandas' SettingWithCopyWarning).
    submission_df = submission_df[['Headline', 'Body ID']].copy()
    submission_df['Stance'] = prediction

    submission_df.to_csv('answer.csv', index=False, encoding='utf-8')

In [None]:
# Write answer.csv from the decoded predictions.
save_submission(prediction)