In [3]:
from IPython.display import clear_output
import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install transformers

[0m

In [1]:
import pandas as pd
import numpy as np

file = "/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv"
df = pd.read_csv(file, encoding='ISO-8859-1', usecols=[0,5], header=None)\
        .sample(frac=0.3, random_state=42)

df.columns = ['label','sentence']
df.label = df.label.apply(lambda x: np.float64(1) if x==4 else np.float64(x))

print("df.shape =",df.shape)
print(f"label distribution :\n{df.label.value_counts()}")
print(df.head())

df.shape = (480000, 2)
label distribution :
1.0    240639
0.0    239361
Name: label, dtype: int64
        label                                           sentence
541200    0.0             @chrishasboobs AHHH I HOPE YOUR OK!!! 
750       0.0  @misstoriblack cool , i have no tweet apps  fo...
766711    0.0  @TiannaChaos i know  just family drama. its la...
285055    0.0  School email won't open  and I have geography ...
705995    0.0                             upper airways problem 


In [2]:
from transformers import AutoTokenizer, TFAutoModel

checkpoint = "google/mobilebert-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModel.from_pretrained(checkpoint, output_hidden_states=True)
clear_output()

Downloading (…)lve/main/config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)"tf_model.h5";:   0%|          | 0.00/164M [00:00<?, ?B/s]

Some layers from the model checkpoint at google/mobilebert-uncased were not used when initializing TFMobileBertModel: ['predictions___cls', 'seq_relationship___cls']
- This IS expected if you are initializing TFMobileBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFMobileBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFMobileBertModel were initialized from the model checkpoint at google/mobilebert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMobileBertModel for predictions without further training.


NameError: name 'clear_output' is not defined

In [4]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

sequences, test_val_sequences = train_test_split(df, test_size=0.3,
                                             stratify=df.label, random_state=44)
val_sequences, test_sequences = train_test_split(test_val_sequences, test_size=0.7,
                                             stratify=test_val_sequences.label, random_state=44)
dataset = {
    "TRAIN": sequences['sentence'].values.tolist(),
    "TEST": test_sequences['sentence'].values.tolist(),
    "VAL": val_sequences['sentence'].values.tolist()
}
targets = {
    "TRAIN": sequences['label'].values.tolist(),
    "TEST": test_sequences['label'].values.tolist(),
    "VAL": val_sequences['label'].values.tolist()
}

In [5]:
def tokenization(data, **kwargs):
    return tokenizer(data, 
                   padding=kwargs.get('padding','longest'), 
                   max_length=kwargs.get('max_length',55),
                   truncation=True, 
                   return_tensors="tf")

In [6]:
def get_model(**kwargs):
    global model
    max_seq_length = kwargs.get('max_seq_length',55)

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained('google/mobilebert-uncased')
    

    input_ids = tf.keras.Input(shape=(max_seq_length,), dtype='int32', name='input_ids')
    attention_mask = tf.keras.Input(shape=(max_seq_length,), dtype='int32', name='attention_mask')

    # Tokenize inputs and pass them through the MobileBERT model
    inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
    outputs = model(inputs)
    pooler_output = outputs['pooler_output']

    # Model Head
    h1 = tf.keras.layers.Dense(128, activation='relu')(pooler_output)
    dropout = tf.keras.layers.Dropout(0.2)(h1)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(dropout)

    # Create and compile the new model
    new_model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    metrics = [tf.keras.metrics.BinaryAccuracy()]
    new_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    return new_model

In [7]:
from sklearn.metrics import classification_report

def test_result(model):    
    test_inputs = tokenization(dataset["TEST"])
    result_proba = model.predict([test_inputs.input_ids, test_inputs.attention_mask])
    result = [1 if x>0.5 else 0 for x in result_proba.ravel()]
    print(classification_report(targets['TEST'],result))
    return result_proba, result

In [8]:
new_model = get_model()
#result_proba_before, result_before = test_result(new_model)

In [9]:
inputs = tokenization(dataset['TRAIN'])
train_targets = tf.convert_to_tensor(targets['TRAIN'])

val_inputs = tokenization(dataset['VAL'])
val_targets = tf.convert_to_tensor(targets['VAL'])

# Train the model
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_binary_accuracy', mode='max', patience=5)

new_model.fit([inputs.input_ids, inputs.attention_mask], train_targets, 
              validation_data = ([val_inputs.input_ids, val_inputs.attention_mask], val_targets),
              epochs=100, batch_size=128, callbacks=[early_stop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


<keras.callbacks.History at 0x7f2b191fb390>

In [12]:
result_proba_after, result_after = test_result(new_model)

              precision    recall  f1-score   support

         0.0       0.83      0.86      0.84     50266
         1.0       0.85      0.83      0.84     50534

    accuracy                           0.84    100800
   macro avg       0.84      0.84      0.84    100800
weighted avg       0.84      0.84      0.84    100800



In [11]:
# SAVE MODEL WEIGHTS
new_model.save_weights(f'sentiment_weights_MobileBert_final.h5')
!zip -r sentiment_weights_MobileBert_final.zip sentiment_weights_MobileBert_final.h5
from IPython.display import FileLink
FileLink(r'sentiment_weights_MobileBert_final.zip')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: sentiment_weights_MobileBert_final.h5 (deflated 8%)
