<a href="https://colab.research.google.com/github/sebastianSbg/Computational-Intelligence-Lab/blob/master/%E2%80%9CCIL_Project_pre_trained_BERT_ipynb%E2%80%9D_Copy_25_March.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

For use with Google Drive. Kept here for reference.

In [1]:
!pip install transformers



In [2]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import re
import os

from sklearn.model_selection import train_test_split

import tensorflow as tf

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import TFTrainer, TFTrainingArguments

In [3]:
# Load pre-trained tokenizer and classification model

In [4]:
def load_tokenizer(model_name='bert-base-uncased'):
    """Load a pre-trained BERT tokenizer.

    Args:
        model_name: Checkpoint identifier handed to
            ``BertTokenizer.from_pretrained``. Defaults to
            ``'bert-base-uncased'``, the checkpoint used in this notebook.

    Returns:
        The ``BertTokenizer`` built from ``model_name``.
    """
    # https://huggingface.co/transformers/model_doc/bert.html#berttokenizer
    return BertTokenizer.from_pretrained(model_name)

In [5]:
def load_model(model_name="bert-base-uncased", num_labels=2):
    """Load a pre-trained BERT model with a sequence-classification head.

    Args:
        model_name: Checkpoint identifier handed to
            ``TFBertForSequenceClassification.from_pretrained``. Defaults to
            ``"bert-base-uncased"``.
        num_labels: Number of output classes of the classification head.
            Defaults to 2 (positive / negative tweets).

    Returns:
        The ``TFBertForSequenceClassification`` instance. Attention weights
        and hidden states are not included in its outputs.
    """
    # https://huggingface.co/transformers/model_doc/bert.html#tfbertforsequenceclassification
    return TFBertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False,
    )

In [6]:
# Load the dataset from Google Drive
data_path = '/content/drive/MyDrive/twitter-datasets'

# Walk the dataset directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk(data_path):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

/content/drive/MyDrive/twitter-datasets/train_pos_full.txt
/content/drive/MyDrive/twitter-datasets/train_neg_full.txt
/content/drive/MyDrive/twitter-datasets/test_data.txt
/content/drive/MyDrive/twitter-datasets/train_pos.txt
/content/drive/MyDrive/twitter-datasets/train_neg.txt
/content/drive/MyDrive/twitter-datasets/sample_submission.csv


In [7]:
def _read_lines(filename):
    """Return every line (trailing newline kept) of a file inside ``data_path``."""
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(os.path.join(data_path, filename), 'r', encoding='utf-8') as fp:
        return fp.readlines()


# 100k-tweet subsets of each class
train_pos_sub = _read_lines('train_pos.txt')
train_neg_sub = _read_lines('train_neg.txt')

# Full training sets of each class
train_pos_full = _read_lines('train_pos_full.txt')
train_neg_full = _read_lines('train_neg_full.txt')

# Unlabelled tweets to predict for the submission
test = _read_lines('test_data.txt')

In [8]:
# The number of entries in each file
tuple(map(len, (train_pos_sub, train_neg_sub, train_pos_full, train_neg_full, test)))

(100000, 100000, 1250000, 1250000, 10000)

In [9]:
# Use the subsets for this rough exploration
data_pos, data_neg = train_pos_sub, train_neg_sub

In [10]:
# Combine pos & neg, remove <xxxx>
text_list = data_pos + data_neg
# Label 1 = positive tweet, 0 = negative tweet (same order as text_list)
label_list = [1] * len(data_pos) + [0] * len(data_neg)

data = DataFrame({'text': text_list, 'label': label_list})

# Strip placeholder tags such as <user> and <url>.
# regex=True must be explicit: since pandas 2.0 `Series.str.replace` treats
# the pattern as a literal string by default, which would remove nothing.
data['text'] = data['text'].str.replace(r'<.*?>', '', regex=True)

In [11]:
# The maximum, minimum number of words in tweets
# ... and empty entries
# Count whitespace-separated words of every tweet in one vectorized pass.
word_counts = data['text'].str.split().str.len()

# Collect index *labels* rather than enumerate() positions: the list is later
# passed to `data.drop`, which is label-based, so positions would target the
# wrong rows whenever the index is not 0..n-1.
zero_len_idx = word_counts.index[word_counts == 0].tolist()

# Plain Python ints (max_len is later used as the tokenizer's max_length).
# An empty frame yields 0 for both instead of a sentinel value.
min_len = int(word_counts.min()) if len(word_counts) else 0
max_len = int(word_counts.max()) if len(word_counts) else 0

min_len, max_len, len(zero_len_idx)

(0, 62, 17)

In [12]:
# Remove the tweets that contain no words at all.
data = data.drop(index=zero_len_idx)

In [13]:
# Show the first two cleaned tweets.
data.text[0:2].tolist()

[' i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15\n',
 "because your logic is so dumb , i won't even crop out your name or your photo . tsk . \n"]

In [14]:
# Instantiate the pre-trained tokenizer once; it is reused for the training,
# validation and test data below.
tokenizer = load_tokenizer()

In [15]:
# An example of tokenization
# Positional lookup: empty tweets were dropped above, so the label 0 is not
# guaranteed to exist and `data.text[0]` could raise KeyError.
sample = data.text.iloc[0]
print(sample)

# Encode the raw string into input ids, token type ids and attention mask.
encoded_sample = tokenizer(sample)
print(encoded_sample)

# Decode the ids back to text to see the added special tokens.
decoded_sample = tokenizer.decode(encoded_sample['input_ids'])
print(decoded_sample)

 i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me #believe 15

{'input_ids': [101, 1045, 14145, 2080, 6796, 3191, 2026, 5254, 2030, 2025, 1012, 2069, 6796, 1998, 2643, 4282, 2055, 2008, 1010, 2021, 1045, 3246, 2017, 2097, 3582, 2033, 1001, 2903, 2321, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] i dunno justin read my mention or not. only justin and god knows about that, but i hope you will follow me # believe 15 [SEP]


(Maybe we can keep the hashtag as special information; it might be useful, but we are not sure.)

In [16]:
# Shuffle the dataset and split it to train- and validation- set
# Fixed random_state keeps the shuffle reproducible.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

val_frac = 0.2
val_rows = int(len(data) * val_frac)

# Split at an explicit position. Negative slicing breaks when val_rows == 0:
# data[:-0] is empty and data[-0:] is the whole frame, which inverts the split.
split_at = len(data) - val_rows
train = data.iloc[:split_at]
val = data.iloc[split_at:]

In [17]:
def encode_dataset(dataset, tokenizer, max_length=64):
    """Tokenize a labelled frame and wrap it in a ``tf.data.Dataset``.

    Args:
        dataset: Object exposing ``text`` and ``label`` columns/attributes.
        tokenizer: Callable tokenizer applied to the list of texts.
        max_length: Upper bound on the encoded length; longer inputs are
            truncated.

    Returns:
        A ``tf.data.Dataset`` whose elements are ``(features, label)`` pairs,
        where ``features`` is the dict of tokenizer outputs.
    """
    texts = list(dataset.text)
    labels = list(dataset.label)

    encodings = tokenizer(
        texts,
        add_special_tokens=True,
        truncation=True,
        padding=True,
        max_length=max_length,
    )

    features = dict(encodings)
    return tf.data.Dataset.from_tensor_slices((features, labels))

In [18]:
# Encode both splits the same way, using the longest tweet (in words) as max_length.
train_dataset, val_dataset = (
    encode_dataset(split, tokenizer, max_len) for split in (train, val)
)

In [19]:
# Training pipeline: shuffle with a 1000-element buffer, batch by 32 and
# iterate over the data twice per Keras epoch.
train_dataset = (
    train_dataset
    .shuffle(1000)
    .batch(32)
    .repeat(2)
)

# Validation data only needs batching.
val_dataset = val_dataset.batch(32)

In [20]:
# Adam with gradient-norm clipping.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-5,
    epsilon=1e-08,
    clipnorm=1.0,
)

# The loss is computed on raw logits with integer class labels.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
metrics = [accuracy_metric]

In [23]:
# Instantiate pre-trained BERT with a two-label classification head
# (see load_model above).
model = load_model()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Attach the optimizer, loss and metrics configured above.
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

In [25]:
# Fine-tune for 3 Keras epochs, evaluating on the validation split after each.
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
)

Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f02008118d0>

In [26]:
# Persist the fine-tuned model to Google Drive so it can be reloaded later
# without retraining.
model_saved_path = '/content/drive/MyDrive/CIL/model_BERT_pre_trained'
model.save(model_saved_path)





INFO:tensorflow:Assets written to: /content/drive/MyDrive/CIL/model_BERT_pre_trained/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/CIL/model_BERT_pre_trained/assets


In [None]:
# model = tf.keras.models.load_model(model_saved_path)

In [None]:
# Strip <...> placeholder tags from the test tweets, as done for the training text.
tag_pattern = re.compile(r'<.*?>')
test_dataset = [tag_pattern.sub('', tweet) for tweet in test]

In [None]:
# Tokenize the cleaned test tweets with the same settings as the training data.
# `tokenizer` is the instance created earlier in this notebook; the previous
# name `tokenizer_pretrained` is never defined and raised NameError on a
# fresh kernel.
test_dataset = tokenizer(
        list(test_dataset), 
        add_special_tokens=True, 
        truncation=True, 
        padding=True, 
        max_length=max_len
        )

In [None]:
# Feed the complete encoding (input ids, token type ids and attention mask),
# as was done during training. Passing only `input_ids` makes the model attend
# to the padding tokens, which does not match how it was trained.
test_inputs = tf.data.Dataset.from_tensor_slices(dict(test_dataset)).batch(32)

tf_outputs = model.predict(test_inputs)

# The first output holds the logits; softmax turns them into class probabilities.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)

# Most probable class per tweet, as a NumPy array.
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()









In [None]:
# Predicted class index per test tweet (argmax over the class probabilities).
label

array([0, 0, 1, ..., 0, 1, 0])

In [None]:
# Map class 0 to -1 and keep every other label unchanged.
is_class_zero = label == 0
pred = np.where(is_class_zero, -1, label)
pred

array([-1, -1,  1, ..., -1,  1, -1])

In [None]:
# Ids are 1-based and follow the order of the predictions.
pred_dict = dict(Id=range(1, len(pred) + 1), Prediction=pred)

In [None]:
# Tabular form of the predictions; preview the first rows.
pred_df = DataFrame(data=pred_dict)
pred_df.head()

Unnamed: 0,Id,Prediction
0,1,-1
1,2,-1
2,3,1
3,4,1
4,5,-1


In [None]:
# Write the submission file without the DataFrame index column.
pred_df.to_csv(path_or_buf='./submission.csv', index=False)

References:
- https://huggingface.co/transformers/custom_datasets.html#sequence-classification-with-imdb-reviews
- https://colab.research.google.com/drive/1CzEAyAByzXl5rZBYVBVeVcjT5fl3zTfb?usp=sharing
- https://www.tensorflow.org/tutorials/text/text_classification_rnn

We might use an RNN as another baseline.