In [1]:
!pip install tensorflow
!pip install tensorflow-text
!pip install transformers

Collecting tensorflow-text
  Downloading tensorflow_text-2.7.3-cp37-cp37m-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 3.9 MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.7.3
Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 42.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 56.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 43.2 MB/s 

In [2]:
# Import the necessary libraries
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras import layers
from tensorflow.keras.models import Model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [3]:
# Get the training data
# The file is tab-separated with no header row, so column names are supplied here.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="warn"` (pandas >= 1.3) keeps the same behaviour: malformed
# rows are skipped and a warning is emitted for each of them.
df = pd.read_csv(
    "sts-train.csv",
    sep="\t",
    on_bad_lines="warn",
    names=["Genre", "File", "Years", "Train", "Similarity", "Sentence 1", "Sentence 2"],
)

df.head()

Unnamed: 0,Genre,File,Years,Train,Similarity,Sentence 1,Sentence 2
0,main-captions,MSRvid,2012test,1,5.0,A plane is taking off.,An air plane is taking off.
1,main-captions,MSRvid,2012test,4,3.8,A man is playing a large flute.,A man is playing a flute.
2,main-captions,MSRvid,2012test,5,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,main-captions,MSRvid,2012test,6,2.6,Three men are playing chess.,Two men are playing chess.
4,main-captions,MSRvid,2012test,9,4.25,A man is playing the cello.,A man seated is playing the cello.


In [5]:
# Keep only the similarity score and the sentence pair.
# Selecting columns (instead of dropping in place) makes this cell safe to re-run:
# an in-place drop raises KeyError the second time because the columns are already gone.
df = df[["Similarity", "Sentence 1", "Sentence 2"]]

# Features are the two sentences; the label is the similarity score.
df_features = df[["Sentence 1", "Sentence 2"]].copy()
df_label = df[["Similarity"]].copy()

# Hold out 33% of the rows for testing; the fixed seed makes the split reproducible.
train, test, train_labels, test_labels = train_test_split(
    df_features, df_label, test_size=0.33, random_state=42
)

train.head()



Unnamed: 0,Sentence 1,Sentence 2
1702,a brown dog running through the dirty muddy grass,The large brown dog is running outside in the ...
5519,Syrian Rebel Groups Battle Each Other in North,Syrian rebel groups battle each other in north
5235,Winter Storm Ion: State-By-State...,Winter Storm Janus: State-by-State...
3596,7 police officers were killed and 4 officers w...,during the clash 7 seven police officers were ...
4041,Osama Bin Laden Wives to Be Jailed,Osama Bin Laden movie to be filmed in India


In [6]:
# Sentence embedding with BERT: every input text is mapped to a 768-dimensional
# vector that can be fed into a downstream neural network.

# Turns raw strings into the tensors the encoder consumes.
bert_preprocess = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)
# Uncased English BERT encoder (12 layers, hidden size 768, 12 attention heads).
bert_encoder = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
)

# Function which returns the embedded vectors for a batch of sentences
def get_sentence_embeding(sentences):
    """Return the BERT pooled-output embedding of each sentence.

    Args:
        sentences: Batch of raw strings, passed unchanged to ``bert_preprocess``.

    Returns:
        The ``'pooled_output'`` entry of the encoder result, one row per
        input sentence.
    """
    encoder_inputs = bert_preprocess(sentences)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']

# Smoke test: embed two near-identical sentences and display the resulting tensor
sample_sentences = ["A plane is taking off", "An air plane is taking off"]
get_sentence_embeding(sample_sentences)




<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.95263135, -0.57028383, -0.82512164, ..., -0.44244722,
        -0.75299037,  0.9545006 ],
       [-0.94485545, -0.6103043 , -0.86615545, ..., -0.4986304 ,
        -0.74868244,  0.9514269 ]], dtype=float32)>

In [7]:
# Bert layers
# Scalar string input: one raw text per example.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
# Single-unit head on top of the pooled BERT output.
# Softmax normalises across units, so with one unit it always outputs exactly 1.0
# and the layer cannot learn anything. Sigmoid yields a score in (0, 1) instead,
# so targets used with this head should lie in [0, 1].
x = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(outputs['pooled_output'])

# Construct the model
model = tf.keras.Model(inputs=[text_input], outputs = [x])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [None]:

# Compile and train the model

# The model takes ONE scalar string per example, but `train` holds two sentence
# columns, so passing the DataFrame directly does not match the input shape.
# Join each pair into a single string per row.
# NOTE(review): this drops the explicit boundary between the two sentences; a
# two-segment BERT preprocessing pipeline would be more faithful — confirm the
# intended modelling approach.
train_text = tf.constant(
    (train["Sentence 1"].astype(str) + " " + train["Sentence 2"].astype(str)).tolist()
)

# Similarity is a graded score, not a binary class. The data preview shows values
# up to 5.0; assumes the scale is 0-5 — TODO confirm against the dataset.
# Scale to [0, 1] so the targets are comparable to a bounded model output.
MAX_SIMILARITY = 5.0
train_targets = train_labels.to_numpy(dtype="float32") / MAX_SIMILARITY

# Regression objective: binary cross-entropy / binary accuracy are not meaningful
# for continuous similarity scores.
model.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse")],
)
model.fit(train_text, train_targets, epochs=10)

