In [22]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers

In [23]:
max_length = 128
batch_size = 16
epochs = 2
labels = [0,1] # vow

In [24]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [4]:
train = pd.read_csv('train.csv',sep='\t')
dev = pd.read_csv('dev.csv',sep='\t')
test = pd.read_csv('test.csv',sep='\t')

In [5]:
train

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,702876,702977.0,"Amrozi accused his brother,"" whom he called """"...","Referring to him as only """"the witness"""""", Amr..."
1,0,2108705,2108831.0,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...
2,1,1330381,1330521.0,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an..."
3,0,3344667,3344648.0,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ..."
4,1,1236820,1236712.0,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...
...,...,...,...,...,...
3052,1,1466168,1466246.0,"During the flight, engineers misjudged the ext...","During the flight, engineers underestimated th..."
3053,0,2245085,2245118.0,The Web site is registered to Parson under his...,The t33kid.com site is registered to Parson at...
3054,1,3237867,3237902.0,"The woman, Mary Kathryn Miller, 55, was arrest...","Mary Kathryn Miller, 55, of 27 Devon Road, Dar..."
3055,0,2194711,2194792.0,The Hubble Space Telescope's newest picture of...,The pictures were taken late Tuesday and early...


In [6]:
print(f"Sentence1: {train.loc[1, '#1 String']}")
print(f"Sentence2: {train.loc[1, '#2 String']}")
print(f"Similarity: {train.loc[1, 'Quality']}")

Sentence1: Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2.5 billion.
Sentence2: Yucaipa bought Dominick's in 1995 for $693 million and sold it to Safeway for $1.8 billion in 1998.
Similarity: 0


In [7]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data.

    Args:
        sentence_pairs: Array of premise and hypothesis input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to incude the labels.

    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "microsoft/MiniLM-L12-H384-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

In [8]:
train["label"] = train["Quality"].apply(
    lambda x: 0 if x == 0 else 1 
)
y_train = tf.keras.utils.to_categorical(train.label, num_classes=3)

dev["label"] = dev["Quality"].apply(
    lambda x: 0 if x == 0 else 1
)
y_dev = tf.keras.utils.to_categorical(dev.label, num_classes=3)

test["label"] = test["Quality"].apply(
    lambda x: 0 if x == 0 else 1
)
y_test = tf.keras.utils.to_categorical(test.label, num_classes=3)

In [9]:
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Encoded token ids from BERT tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="token_type_ids"
    )
    # Loading pretrained BERT model.
    bert_model = transformers.TFBertModel.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
    # Freeze the BERT model to reuse the pretrained features without modifying them.
    bert_model.trainable = False

    bert_output = bert_model.bert(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )
    sequence_output = bert_output.last_hidden_state
    pooled_output = bert_output.pooler_output
    # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
    bi_lstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )(sequence_output)
    # Applying hybrid pooling approach to bi_lstm sequence output.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)
    concat = tf.keras.layers.concatenate([avg_pool, max_pool])
    dropout = tf.keras.layers.Dropout(0.3)(concat)
    output = tf.keras.layers.Dense(3, activation="softmax")(dropout)
    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="categorical_crossentropy",
        metrics=["acc"],
    )


print(f"Strategy: {strategy}")
model.summary()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Strategy: <tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x000001CABD8C9310>
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_masks (InputLaye  [(None, 128)]                0         []                            
 r)                                                                                               
                                                                                                  
 token_type_ids (InputLayer  [(None, 128)]                0         []                            
 )                                                                                      

In [10]:
train_data = BertSemanticDataGenerator(
    train[["#1 String", "#2 String"]].values.astype("str"),
    y_train,
    batch_size=batch_size,
    shuffle=True,
)
dev_data = BertSemanticDataGenerator(
    dev[["#1 String", "#2 String"]].values.astype("str"),
    y_dev,
    batch_size=batch_size,
    shuffle=False,
)
print(train[["#1 String", "#2 String"]])

                                              #1 String  \
0     Amrozi accused his brother," whom he called ""...   
1     Yucaipa owned Dominick's before selling the ch...   
2     They had published an advertisement on the Int...   
3     Around 0335 GMT, Tab shares were up 19 cents, ...   
4     The stock rose $2.11, or about 11 percent, to ...   
...                                                 ...   
3052  During the flight, engineers misjudged the ext...   
3053  The Web site is registered to Parson under his...   
3054  The woman, Mary Kathryn Miller, 55, was arrest...   
3055  The Hubble Space Telescope's newest picture of...   
3056  He is blocking them until the Air Force assign...   

                                              #2 String  
0     Referring to him as only ""the witness""", Amr...  
1     Yucaipa bought Dominick's in 1995 for $693 mil...  
2     On June 10, the ship's owners had published an...  
3     Tab shares jumped 20 cents, or 4.6%, to set a ...  
4

In [11]:
import os
checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

In [12]:
with tf.device('/CPU:0'):
    history = model.fit(
        train_data,
        validation_data=dev_data,
        epochs=epochs,
        use_multiprocessing=True,
        workers=-1,
        callbacks=[cp_callback],
    )

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/2






Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



Epoch 1: saving model to training_1\cp.ckpt
Epoch 2/2
Epoch 2: saving model to training_1\cp.ckpt


In [13]:
# Unfreeze the bert_model.
bert_model.trainable = True
# Recompile the model to make the change effective.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_masks (InputLaye  [(None, 128)]                0         []                            
 r)                                                                                               
                                                                                                  
 token_type_ids (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                              

Total params: 33590659 (128.14 MB)
Trainable params: 33590659 (128.14 MB)
Non-trainable params: 0 (0.00 Byte)
__________________________________________________________________________________________________


In [14]:
with tf.device('/CPU:0'):
    history = model.fit(
        train_data,
        validation_data=dev_data,
        epochs=epochs,
        use_multiprocessing=True,
        workers=-1,
        callbacks=[cp_callback],
    )

Epoch 1/2
Epoch 1: saving model to training_1\cp.ckpt
Epoch 2/2
Epoch 2: saving model to training_1\cp.ckpt


In [None]:
model.save('my_model.keras')

In [None]:
# import sys
# sys.path.append('./DataPreProcessing.ipynb')
# model_new = tf.keras.models.load_model('C:/Users/ASUS/OneDrive/Documents/ML_Scientist/NaturalLanguageProcessing/MyminiprojectIn_NLP_at_school/my_model.keras')

In [16]:
with tf.device('/CPU:0'):
    test_data = BertSemanticDataGenerator(
        test[['#1 String', '#2 String']].values.astype("str"),
        y_test,
        batch_size=batch_size,
        shuffle=False,
    )
    model.evaluate(test_data, verbose=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.




In [17]:
def check_similarity(sentence1, sentence2):
    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
    test_data = BertSemanticDataGenerator(
        sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
    )

    proba = model.predict(test_data[0])[0]
    idx = np.argmax(proba)
    proba = f"{proba[idx]: .2f}%"
    pred = labels[idx]
    return pred, proba

In [18]:
def testAccuracy(test):
    countR = countW = 0;
    for i in range(500):
        s1 = test.loc[i,'#1 String']
        s2 = test.loc[i,'#2 String']
        res = check_similarity(s1,s2)
        if res[0] == test['Quality'][i]:
            countR+=1
        else:
            countW+=1
    with open("output.txt",'w') as f:
        print("number of right: " + str(countR), file=f)
        print("Number of wrong: " + str(countW), file = f)

In [None]:
 testAccuracy(test)

In [20]:
print(tf.version.VERSION)

2.13.0
