In [38]:
!pip install transformers



In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Load and preprocess data

In [40]:
import pandas as pd
import numpy as np
import tensorflow as tf
import transformers

In [41]:
df_train = pd.read_csv('./drive/MyDrive/Mini-projects/MSRP/train.tsv', sep='\t', error_bad_lines=False)
df_test = pd.read_csv('./drive/MyDrive/Mini-projects/MSRP/test.tsv', sep='\t', error_bad_lines=False)
df_val = pd.read_csv('./drive/MyDrive/Mini-projects/MSRP/dev.tsv', sep='\t', error_bad_lines=False)



  df_train = pd.read_csv('./drive/MyDrive/Mini-projects/MSRP/train.tsv', sep='\t', error_bad_lines=False)
Skipping line 102: expected 5 fields, saw 6
Skipping line 656: expected 5 fields, saw 6
Skipping line 867: expected 5 fields, saw 6
Skipping line 880: expected 5 fields, saw 6
Skipping line 980: expected 5 fields, saw 6
Skipping line 1439: expected 5 fields, saw 6
Skipping line 1473: expected 5 fields, saw 6
Skipping line 1822: expected 5 fields, saw 6
Skipping line 1952: expected 5 fields, saw 6
Skipping line 2009: expected 5 fields, saw 6
Skipping line 2230: expected 5 fields, saw 6
Skipping line 2506: expected 5 fields, saw 6
Skipping line 2523: expected 5 fields, saw 6
Skipping line 2809: expected 5 fields, saw 6
Skipping line 2887: expected 5 fields, saw 6
Skipping line 2920: expected 5 fields, saw 6
Skipping line 2944: expected 5 fields, saw 6
Skipping line 3241: expected 5 fields, saw 6
Skipping line 3358: expected 5 fields, saw 6
Skipping line 3459: expected 5 fields, saw

In [42]:
df_train

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,702876,702977,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi..."
1,0,2108705,2108831,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...
2,1,1330381,1330521,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an..."
3,0,3344667,3344648,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ..."
4,1,1236820,1236712,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...
...,...,...,...,...,...
3453,1,1466168,1466246,"During the flight, engineers misjudged the ext...","During the flight, engineers underestimated th..."
3454,0,2245085,2245118,The Web site is registered to Parson under his...,The t33kid.com site is registered to Parson at...
3455,1,3237867,3237902,"The woman, Mary Kathryn Miller, 55, was arrest...","Mary Kathryn Miller, 55, of 27 Devon Road, Dar..."
3456,0,2194711,2194792,The Hubble Space Telescope's newest picture of...,The pictures were taken late Tuesday and early...


In [43]:
print(f"Total train samples : {df_train.shape[0]}")
print(f"Total validation samples: {df_val.shape[0]}")
print(f"Total test samples: {df_test.shape[0]}")

Total train samples : 3458
Total validation samples: 480
Total test samples: 1639


In [44]:
print("Train Target Distribution")
print(df_train['Quality'].value_counts(1))

print("Train Target Distribution")
print(df_val['Quality'].value_counts(1))

print("Train Target Distribution")
print(df_test['Quality'].value_counts(1))

Train Target Distribution
1    0.673511
0    0.326489
Name: Quality, dtype: float64
Train Target Distribution
1    0.691667
0    0.308333
Name: Quality, dtype: float64
Train Target Distribution
1    0.663819
0    0.336181
Name: Quality, dtype: float64


In [45]:
y_train = tf.keras.utils.to_categorical(df_train.Quality, num_classes=2)

y_val = tf.keras.utils.to_categorical(df_val.Quality, num_classes=2)

y_test = tf.keras.utils.to_categorical(df_test.Quality, num_classes=2)

In [46]:
print(y_train)

[[0. 1.]
 [1. 0.]
 [0. 1.]
 ...
 [0. 1.]
 [1. 0.]
 [0. 1.]]


In [47]:
max_length = 256  # maximum length of the tokenized input sentence pair : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
batch_size = 32

In [48]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data.

    Args:
        sentence_pairs: Array of premise and hypothesis input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to incude the labels.

    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            "albert-base-v2", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

#Build model

In [49]:
from transformers import TFAlbertForSequenceClassification
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam

seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

# Load BERT model and tokenizer
model_name = "albert-base-v2"
model = TFAlbertForSequenceClassification.from_pretrained(model_name, num_labels=2)
#tokenizer = BertTokenizer.from_pretrained(model_name)

# Set up optimizer
learning_rate = 2e-05
optimizer = Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

# Set up learning rate scheduler
num_epochs = 4
# num_train_steps = len(df_train['Quality']) * num_epochs
# decay_schedule = PolynomialDecay(initial_learning_rate=learning_rate, end_learning_rate=0, decay_steps=num_train_steps)

# Compile the model
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_albert_for_sequence_classification_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 albert (TFAlbertMainLayer)  multiple                  11683584  
                                                                 
 dropout_14 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 11685122 (44.58 MB)
Trainable params: 11685122 (44.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [50]:
train_data = BertSemanticDataGenerator(
    df_train[["#1 String", "#2 String"]].values.astype("str"),
    y_train,
    batch_size=32,
    shuffle=True,
)
valid_data = BertSemanticDataGenerator(
    df_val[["#1 String", "#2 String"]].values.astype("str"),
    y_val,
    batch_size=8,
    shuffle=False,
)

In [51]:
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=4,
    use_multiprocessing=True,
    workers=-1,
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/4






Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 2/4
Epoch 3/4
Epoch 4/4


#Evaluate

In [52]:
test_data = BertSemanticDataGenerator(
    df_test[["#1 String", "#2 String"]].values.astype("str"),
    y_test,
    batch_size=16,
    shuffle=False,
)
model.evaluate(test_data, verbose=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.




[0.4436703622341156, 0.8247548937797546]

In [53]:
X_test = BertSemanticDataGenerator(
    df_test[["#1 String", "#2 String"]].values.astype("str"),
    labels=None, batch_size=len(df_test), shuffle=False, include_targets=False,
    )

from sklearn.metrics import f1_score

y_pred = []

# Loop through the rows of X_test
for i in range(len(X_test)):
    # Make a prediction for the i-th row
    prediction = model.predict(X_test[i])  # Assuming X_test is a DataFrame
    y_pred.append(prediction)
print(y_pred)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[TFSequenceClassifierOutput(loss=None, logits=array([[-0.26103374,  0.9959971 ],
       [-0.12875836,  0.9697072 ],
       [-0.42987508,  1.0616171 ],
       ...,
       [-0.27295467,  1.0452865 ],
       [-0.20854439,  1.0621164 ],
       [-0.07836369,  0.93327206]], dtype=float32), hidden_states=None, attentions=None)]


In [54]:
threshold = 0.5

# Convert y_pred to a NumPy array


# Apply the threshold to obtain binary labels
thresholded_array = (y_pred[0][0] >= threshold).astype(int)

In [55]:
f1 = f1_score(y_test, thresholded_array.reshape(-1, 2), average='weighted')

print("F1 Score:", f1)

F1 Score: 0.8267974500162502
