### Import dependencies

In [21]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

In [22]:
# One-time environment setup, kept commented out so that running all cells
# does not reinstall packages. Uncomment if `import bert` fails.
# !pip install bert-for-tf2
# !pip install sentencepiece

In [23]:
import tensorflow as tf
# Turn on the memory-growth setting for every GPU that TensorFlow reports,
# then read the setting back and print it as a confirmation.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print('gpu', gpu)
    tf.config.experimental.set_memory_growth(gpu, True)
    print('memory growth:' , tf.config.experimental.get_memory_growth(gpu))

gpu PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
memory growth: True


In [24]:
#!pip install tensorflow_hub
import tensorflow_hub as hub

In [25]:
import bert

## Data preprocessing
### Loading files

In [26]:
# Column names to assign, in the order the fields appear in the CSV.
cols = ["sentiment", "id", "date", "query", "user", "text"]

# Location of the training CSV on the local machine.
train_csv_path = "/home/dawidkubicki/Datasets/sentiment_data/data/train.csv"

# header=None makes pandas treat the first line as data, so the names above
# are supplied explicitly; the file is decoded as latin1.
read_options = dict(header=None, names=cols, engine="python", encoding="latin1")
data = pd.read_csv(train_csv_path, **read_options)

In [27]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [28]:
# Discard the tweet metadata columns, leaving the remaining columns untouched.
# Rebinding `data` (instead of inplace=True) together with errors="ignore"
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because the columns are already gone.
data = data.drop(columns=["id", "date", "query", "user"], errors="ignore")

In [29]:
# Preview the frame again after the column drop.
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


### Cleaning

In [30]:
def clean_tweet(tweet):
    """Normalise one raw tweet into text made of ASCII letters and basic punctuation.

    Steps, applied in order:
      1. Parse the text with BeautifulSoup (lxml parser) and keep ``get_text()``.
      2. Replace ``@mentions`` with a space.
      3. Replace http/https URLs with a space.
      4. Replace every character other than ASCII letters and ``. ! ? '``
         with a space.
      5. Collapse runs of spaces into a single space.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned string. A single leading or trailing space may remain;
        the result is not stripped.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Include "_" in the handle pattern: without it "@the_user" was only
    # partially removed and the tail ("user") leaked into the cleaned text.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Consume the URL up to the next whitespace. The previous character class
    # had an "A-z" typo and stopped at "-", "?", "=", "%", ... which left
    # URL fragments behind as spurious words.
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    # Everything that is not a letter or . ! ? ' becomes a space.
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Squeeze the runs of spaces produced by the substitutions above.
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [31]:
# Run the cleaner over every entry of the `text` column, preserving row order.
data_clean = list(map(clean_tweet, data.text))

In [32]:
# Take an explicit copy of the label column before editing it. `.values` can
# hand back a view onto the DataFrame's own storage, in which case the masked
# assignment below would also rewrite `data.sentiment` behind the reader's
# back (or fail outright when that array is read-only).
data_labels = data.sentiment.to_numpy(copy=True)
# Recode label 4 as 1; every other label value is left unchanged.
data_labels[data_labels == 4] = 1

### Tokenization
#### We need to create a BERT layer to get access to the metadata the tokenizer needs (such as the vocabulary file and the lower-casing setting)

In [35]:
# TF Hub handle of the BERT model whose assets configure the tokenizer.
bert_hub_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"

# The layer is loaded frozen (trainable=False); below it is only used to
# reach the assets bundled with the model.
bert_layer = hub.KerasLayer(bert_hub_url, trainable=False)
bert_assets = bert_layer.resolved_object

# Vocabulary asset path and lower-casing flag, both read from the model.
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()

FullTokenizer = bert.bert_tokenization.FullTokenizer
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [36]:
# Sanity check: show how the tokenizer splits a sample sentence into tokens.
tokenizer.tokenize("My dog love strawberries.")

['my', 'dog', 'love', 'straw', '##berries', '.']

In [38]:
# Same sample sentence, with its tokens mapped to their vocabulary ids.
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("My dog love strawberries."))

[2026, 3899, 2293, 13137, 20968, 1012]

In [39]:
def encode_sentence(sent):
    """Tokenize ``sent`` with the notebook-level ``tokenizer`` and return the token ids.

    Args:
        sent: Sentence text to encode.

    Returns:
        Whatever ``tokenizer.convert_tokens_to_ids`` yields for the tokens of
        ``sent``. No other tokens are added here.
    """
    word_pieces = tokenizer.tokenize(sent)
    piece_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    return piece_ids

In [None]:
# Encode every cleaned tweet into its sequence of token ids, in order.
data_inputs = list(map(encode_sentence, data_clean))

### Dataset creation
#### We will create padded batches, padding the sentences of each batch independently so that we add as few padding tokens as possible. To do that, we sort the sentences by length, apply padded_batches and then shuffle.