In [1]:
import logging
import os
import pickle
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
from sklearn.model_selection import train_test_split
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from tqdm import tqdm_notebook, trange

from tools import *
import convert_examples_to_features

In [2]:
# Preprocessing: read the text column from the positive and negative CSV files,
# label it (1 = positive, 0 = negative), pool both classes, split the pool into
# train/dev sets and write each set as a header-less TSV with the columns
# id, label, alpha, text. Newlines (\n) and carriage returns (\r) inside the
# text are replaced with spaces.
#
# Preprocessing techniques will vary for different datasets.

# Fixed seed so the train/dev split (and therefore the TSV files written below)
# is identical on every Restart & Run All.
SPLIT_SEED = 42


def _to_bert_frame(labelled_df):
    """Return the four-column frame (id, label, alpha, text) written to the TSVs.

    labelled_df -- DataFrame whose column 0 holds the labels and column 1 the
                   raw text, one row per example.

    'id' is a running row number, 'alpha' is a constant filler column of 'a',
    and every \\n or \\r in the text is replaced by a single space.
    """
    n_rows = len(labelled_df)
    return pd.DataFrame({
        'id': range(n_rows),
        'label': labelled_df[0],
        'alpha': ['a'] * n_rows,
        'text': labelled_df[1].replace(r'[\r\n]', ' ', regex=True),
    })


pos = pd.read_csv('../freeletics/pos1.csv')
neg = pd.read_csv('../freeletics/neg1.csv')

# Column index 2 is taken as the text; the [2] list keeps the result 2-D
# (n_rows, 1) so it can be stacked below.
pos_text = pos.values[:, [2]]
neg_text = neg.values[:, [2]]

y_pos = np.ones((len(pos_text), 1)).astype(int)
y_neg = np.zeros((len(neg_text), 1)).astype(int)

print(len(pos_text), len(neg_text))

x_data = np.vstack((pos_text, neg_text))
y_data = np.vstack((y_pos, y_neg))

X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.25, random_state=SPLIT_SEED)

# Two-column frames: column 0 = label, column 1 = text.
train_df = pd.DataFrame(np.hstack((y_train, X_train)))
test_df = pd.DataFrame(np.hstack((y_test, X_test)))

train_df_bert = _to_bert_frame(train_df)
dev_df_bert = _to_bert_frame(test_df)

# Sanity check: the split must not lose or duplicate examples.
assert len(train_df_bert) + len(dev_df_bert) == len(x_data)

train_df_bert.to_csv('../freeletics/train.tsv', sep='\t', index=False, header=False)
dev_df_bert.to_csv('../freeletics/dev.tsv', sep='\t', index=False, header=False)

2140 879


In [3]:
# Notebook configuration: every path and hyper-parameter a reader may want to
# tune lives in this cell.

# Directory holding the train.tsv / dev.tsv files.
DATA_DIR = "../freeletics/"

# Bert pre-trained model selected in the list: bert-base-uncased,
# bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,
# bert-base-multilingual-cased, bert-base-chinese.
# These are models that can be loaded from the Internet.
BERT_MODEL = 'bert-base-cased'

# The name of the task to train
TASK_NAME = 'freeletics'

# The output directory where the fine-tuned model and checkpoints will be written.
OUTPUT_DIR = f'outputs/{TASK_NAME}/'

# The directory where the evaluation reports will be written to.
REPORTS_DIR = f'reports/{TASK_NAME}_evaluation_report/'

# This is where BERT will look for pre-trained models to load parameters from.
CACHE_DIR = 'cache/'

# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter than this will be padded.
MAX_SEQ_LENGTH = 128

TRAIN_BATCH_SIZE = 24
EVAL_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1
RANDOM_SEED = 42
GRADIENT_ACCUMULATION_STEPS = 1
WARMUP_PROPORTION = 0.1
OUTPUT_MODE = 'classification'

# File names used when saving the fine-tuned model into OUTPUT_DIR.
CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"

# Apply the seed: without this RANDOM_SEED is declared but has no effect, and
# anything drawing from the numpy / torch global generators after this cell
# (e.g. random batch order) would differ between runs.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [4]:
# Text data needs to be converted into examples/features that BERT can use.
# BinaryClassificationProcessor (made available by `from tools import *`) reads
# the training examples from DATA_DIR and exposes the list of labels.

processor = BinaryClassificationProcessor()
train_examples = processor.get_train_examples(DATA_DIR)
train_examples_len = len(train_examples)

label_list = processor.get_labels() # only 0 or 1 for binary classification
num_labels = len(label_list)

# Total number of optimizer updates, used as the schedule length (t_total).
# The training DataLoader keeps the final, smaller batch, so the number of
# batches per epoch is the *ceiling* of examples / batch size; flooring it
# (as int(len / batch / accum) does) under-counts by one whenever the data
# does not divide evenly. An update happens once every
# GRADIENT_ACCUMULATION_STEPS batches.
batches_per_epoch = -(-train_examples_len // TRAIN_BATCH_SIZE)  # ceil division
num_train_optimization_steps = (
    batches_per_epoch // GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS

In [None]:
# OPTIONAL: if you want to have more information on what's happening, activate
# INFO-level logging as follows (`logging` is imported in the first cell).
# basicConfig() does nothing when the root logger already has handlers
# attached, so the level is also set on the root logger directly to make this
# cell effective in that case too.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

In [7]:
# Load the pretrained tokenizer for the same checkpoint the model is built
# from. Using BERT_MODEL (instead of repeating the literal name) keeps tokenizer
# and model in sync when the configuration changes; lower-casing is enabled
# only for "uncased" checkpoints, which is False for 'bert-base-cased'.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case='uncased' in BERT_MODEL)

# Map each label to its integer index in label_list.
label_map = {label: i for i, label in enumerate(label_list)}

# One argument tuple per training example, in the form consumed by the
# conversion step: (example, label_map, max_seq_length, tokenizer, output_mode).
train_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH,
                                  tokenizer, OUTPUT_MODE) for example in train_examples]

In [9]:
# Convert the training examples into BERT features using a pool of worker
# processes.

# Leave one core free, but never ask for fewer than one worker: on a
# single-core machine cpu_count() - 1 is 0 and Pool(0) raises ValueError.
process_count = max(1, cpu_count() - 1)
if __name__ ==  '__main__':
    print(f'Preparing to convert {train_examples_len} examples..')
    print(f'Spawning {process_count} processes..')
    with Pool(process_count) as p:
        # imap yields results in input order, so train_features[i] corresponds
        # to train_examples[i]; tqdm only wraps the iterator for progress.
        train_features = list(tqdm_notebook(
            p.imap(convert_examples_to_features.convert_example_to_feature,
                   train_examples_for_processing),
            total=train_examples_len))

Preparing to convert 2264 examples..
Spawning 7 processes..


HBox(children=(IntProgress(value=0, max=2264), HTML(value='')))




In [10]:
# Build the sequence-classification model from the pretrained BERT_MODEL
# checkpoint (downloaded/cached under CACHE_DIR). It has not been fine-tuned
# at this point.
model = BertForSequenceClassification.from_pretrained(
    BERT_MODEL,
    cache_dir=CACHE_DIR,
    num_labels=num_labels,
)

# Training in this notebook runs on the CPU.
model.to('cpu')

In [12]:
# Prepare the model for fine-tuning: build the optimizer, reset the training
# counters, and wrap the converted training features in a shuffled DataLoader.

# Defined explicitly so this cell does not depend on a `logger` name happening
# to be provided by a star import.
logger = logging.getLogger(__name__)

param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these substrings are excluded from
# weight decay (second group below); all others use weight_decay 0.01.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=LEARNING_RATE,
                     warmup=WARMUP_PROPORTION,
                     t_total=num_train_optimization_steps)

# Counters updated by the training loop.
global_step = 0
nb_tr_steps = 0
tr_loss = 0

logger.info("***** Running training *****")
logger.info("  Num examples = %d", train_examples_len)
logger.info("  Batch size = %d", TRAIN_BATCH_SIZE)
logger.info("  Num steps = %d", num_train_optimization_steps)

# Stack the per-example feature lists into one tensor per input kind.
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

# Labels are integer class ids for classification and floats for regression.
if OUTPUT_MODE == "classification":
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
elif OUTPUT_MODE == "regression":
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
else:
    # Fail here with a clear message instead of a NameError on all_label_ids below.
    raise ValueError(f"Unsupported OUTPUT_MODE: {OUTPUT_MODE!r}")

train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

In [17]:
# Fine-tune the model on the training set. This can take a long time on a CPU;
# a faster system will help if you have one.

device = 'cpu'

# The loss function depends only on OUTPUT_MODE, so build it once instead of
# once per batch.
if OUTPUT_MODE == "classification":
    loss_fct = CrossEntropyLoss()
elif OUTPUT_MODE == "regression":
    loss_fct = MSELoss()
else:
    raise ValueError(f"Unsupported OUTPUT_MODE: {OUTPUT_MODE!r}")

model.train()
for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    # Start every epoch from clean gradients: drops anything left over from an
    # interrupted run of this cell or from an incomplete accumulation window
    # at the end of the previous epoch.
    optimizer.zero_grad()
    for step, batch in enumerate(tqdm_notebook(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        # Note the positional order of the call: ids, segment ids, mask.
        # labels=None, and the result is used as the logits for the loss below.
        logits = model(input_ids, segment_ids, input_mask, labels=None)

        if OUTPUT_MODE == "classification":
            loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
        else:  # "regression" (validated above)
            loss = loss_fct(logits.view(-1), label_ids.view(-1))

        # Scale so the accumulated gradient matches one full-size batch.
        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS

        loss.backward()
        loss_value = loss.item()
        # Overwrite the same output line with the latest batch loss.
        print("\r%f" % loss_value, end='')

        tr_loss += loss_value
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        # Update the weights once every GRADIENT_ACCUMULATION_STEPS batches.
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

HBox(children=(IntProgress(value=0, description='Iteration', max=95, style=ProgressStyle(description_width='in…

0.920260

Epoch: 100%|██████████| 1/1 [33:41<00:00, 2021.59s/it]







In [18]:
# Saving the model is important since it takes so long to train. Each trained
# model gets its own folder under "outputs" (OUTPUT_DIR); this cell writes the
# weights (WEIGHTS_NAME), the config (CONFIG_NAME) and the tokenizer vocabulary
# there. According to the original author's notes this yields config.json,
# pytorch_model.bin and vocab.txt. To evaluate later, create an archive with
#   tar cvzf model_name.tar.gz config.json pytorch_model.bin
# and put the tar.gz file into the cache folder, because the BERT eval code
# loads the fine-tuned model from there to run it on the testing data.

# Create the output folder first: torch.save does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)

torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(OUTPUT_DIR)