<a href="https://colab.research.google.com/github/deepakjangir15/duplicate_CQA_detection/blob/main/Tensorflow_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Dropout, TextVectorization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import layers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import random

In [None]:
# Load the preprocessed question-pair frame from the mounted Drive folder.
# NOTE(review): unpickling can execute arbitrary code; only load checkpoints you created yourself.
CHECKPOINT_PATH = '/content/drive/MyDrive/unzipped/checkpoint6.pkl'
data = pd.read_pickle(CHECKPOINT_PATH)

In [None]:
# Vectorizer settings: vocabulary cap and number of token ids emitted per question
max_vocab_length, max_length = 50000, 20

text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode='int',
    output_sequence_length=max_length,
)

In [None]:
# Pull the two question columns out of the loaded frame
q1_train, q2_train = data["question1"], data["question2"]

In [None]:
# Hold out 15% of the question pairs (with their labels) as a validation set
split_arrays = train_test_split(
    q1_train.to_numpy(),
    q2_train.to_numpy(),
    data["is_duplicate"].to_numpy(),
    test_size=0.15,    # fraction of samples reserved for validation
    random_state=101,  # fixed seed so the split is reproducible
)
train_q1, val_q1, train_q2, val_q2, train_labels, val_labels = split_arrays

In [None]:
# Sanity-check the split sizes: (train questions, validation questions, train labels)
tuple(split.shape for split in (train_q1, val_q1, train_labels))

((338747,), (59779,), (338747,))

In [None]:
# Wrap each split in tf.data pipelines yielding ((question1, question2), label)

def _pair_datasets(q1, q2, labels, batch_size=32):
  """Build the datasets for one split.

  Returns a tuple (questions, targets, batched):
    questions: dataset of (q1, q2) string pairs
    targets:   dataset of the matching labels
    batched:   the two zipped together, batched and prefetched
  """
  questions = tf.data.Dataset.from_tensor_slices((q1, q2))
  targets = tf.data.Dataset.from_tensor_slices(labels)
  zipped = tf.data.Dataset.zip((questions, targets))
  return questions, targets, zipped.batch(batch_size).prefetch(tf.data.AUTOTUNE)


train_questions_data, train_labels_data, train_questions_dataset = _pair_datasets(
    train_q1, train_q2, train_labels)

val_questions_data, val_labels_data, val_questions_dataset = _pair_datasets(
    val_q1, val_q2, val_labels)

In [None]:
# Show the first (question1, question2) pair of the training data
for first_pair in train_questions_data.take(1):
  print(first_pair)

(<tf.Tensor: shape=(), dtype=string, numpy=b'Facebook fbml add fan button to page facebook facebook-fbml try figure button acebook page right next company name see do page show following screenshot enter image description add application though find great deal information code require actual button place button page anyone point right direction thanks'>, <tf.Tensor: shape=(), dtype=string, numpy=b'How to develop ajax web applications in Scala? ajax oop scala lift web-frameworks look object orient framework develop application cala ideally would like something like without mix client code cala server code short example cala could create vertical layout label list code label abel nothing select list item1 item2 election value label text value page ontent ertical ayout label '>)


In [None]:
# Mean number of whitespace-separated tokens per training question - Question 1

q1_token_total = sum(len(question.split()) for question in train_q1)
round(q1_token_total / len(train_q1))

20

In [None]:
# Mean number of whitespace-separated tokens per training question - Question 2

q2_token_total = sum(len(question.split()) for question in train_q2)
round(q2_token_total / len(train_q2))

20

In [None]:
# Same settings as the vectorizer configuration cell near the top of the notebook;
# re-creating the layer here keeps this cell re-runnable on its own.
max_vocab_length = 50000
max_length = 20

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode='int',
                                    output_sequence_length = max_length)

# Fit the vocabulary on every training question (both sides of each pair).
# The questions are fed as one flat stream of strings in large batches, so
# `adapt` no longer runs one step per (question1, question2) tuple and does not
# rely on a tuple being implicitly converted to a tensor.
train_text_data = tf.data.Dataset.from_tensor_slices(
    np.concatenate([train_q1, train_q2])).batch(1024)
text_vectorizer.adapt(train_text_data)

In [None]:
# Tokenize a hand-written example question (passed as a batch of one)
sample_sentence = "Where are the seven wonders of the world?"
sample_batch = [sample_sentence]
text_vectorizer(sample_batch)

<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[  153,    14,     3,  5669, 38799,    12,     3,   238,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0]])>

In [None]:
# Vectorize a randomly drawn training question and show it next to the raw text
random_sentence = random.choice(train_q1)
print(f'Original text:\n{random_sentence}      \n\nVectorized text:')
text_vectorizer(random_sentence)

Original text:
In iOS, how should I be obtaining the screen's height and width, including width being greater than height in landscape mode? iphone ios objective-c ipad background-image function render background code height creen main creen bound size height width creen main creen bound size width code height width retina device landscape mode height 1024 width image display rotation portrait image turn device side image neatly fill whole screen asis background display horizontall      

Vectorized text:


<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([    9,   356,     8,    64,    10,    55, 12003,     3, 13096,
         307,    13,   310,  3025,   310,   437,  4900,   273,   307,
           9,  3077])>

In [None]:
# Inspect the vocabulary learned by the vectorizer
vocab_words = text_vectorizer.get_vocabulary()
top_5_words, bottom_5_words = vocab_words[:5], vocab_words[-5:]

for message in (f"Number of words in vocab: {len(vocab_words)}",
                f"Top 5 most common words: {top_5_words}",
                f"Bottom 5 least common words: {bottom_5_words}"):
  print(message)

Number of words in vocab: 50000
Top 5 most common words: ['', '[UNK]', 'code', 'the', 'what']
Bottom 5 least common words: ['skd', 'skates', 'sk', 'sizers', 'sixdigit']


In [None]:
# Token-embedding layer used by both question towers of model 1
embedding_config = dict(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer='uniform',
    input_length=max_length,
    name='embedding1',
)
embedding = layers.Embedding(**embedding_config)

embedding

<keras.layers.core.embedding.Embedding at 0x7f3d8c557700>

In [None]:
# Embed a randomly drawn training question and show it next to the raw text
random_sentence = random.choice(train_q1)
print(f'Original text:\n{random_sentence}      \n\nEmbedded text:')

sample_token_ids = text_vectorizer([random_sentence])
sample_embedded_text = embedding(sample_token_ids)
sample_embedded_text

Original text:
how do i translate this english sentence into german      

Embedded text:


<tf.Tensor: shape=(1, 20, 128), dtype=float32, numpy=
array([[[ 0.01545889, -0.03671843,  0.01988218, ...,  0.03288582,
         -0.02877569, -0.00503626],
        [ 0.0130278 ,  0.02627177,  0.04028996, ...,  0.03127203,
          0.03451768, -0.03823571],
        [-0.01714529, -0.0376454 ,  0.01133369, ..., -0.03178798,
          0.04614197, -0.03610648],
        ...,
        [ 0.00931767,  0.03379751,  0.03879407, ..., -0.04418159,
          0.04225722, -0.01176969],
        [ 0.00931767,  0.03379751,  0.03879407, ..., -0.04418159,
          0.04225722, -0.01176969],
        [ 0.00931767,  0.03379751,  0.03879407, ..., -0.04418159,
          0.04225722, -0.01176969]]], dtype=float32)>

# Model 1 - Simple Dense Model

In [None]:
def _average_pooling_tower(input_name):
  """Build one question encoder for model 1.

  Raw strings are vectorized with `text_vectorizer`, embedded with the shared
  `embedding` layer, averaged over the sequence axis, and projected by a
  Dense(128, relu) layer.
  """
  question = layers.Input(shape=[], dtype=tf.string, name=input_name)
  embedded = embedding(text_vectorizer(question))
  pooled = layers.GlobalAveragePooling1D()(embedded)
  features = layers.Dense(128, activation='relu')(pooled)
  return tf.keras.Model(inputs=question, outputs=features)


token_model_q1 = _average_pooling_tower('token_input_q1')
token_model_q2 = _average_pooling_tower('token_input_q2')

In [None]:
# Join the two question encodings into a single feature vector
token_questions_concat = layers.Concatenate(name='token_questions_cat')(
    [token_model_q1.output, token_model_q2.output])

# Classification head: Dropout -> Dense(64) -> Dropout -> sigmoid
hidden = layers.Dropout(0.5)(token_questions_concat)
hidden = layers.Dense(64, activation='relu')(hidden)
hidden = layers.Dropout(0.5)(hidden)
output_layer = layers.Dense(1, activation='sigmoid')(hidden)

model_1 = tf.keras.Model(
    inputs=[token_model_q1.input, token_model_q2.input],
    outputs=output_layer,
    name='model_1_token',
)


In [None]:
# Binary cross-entropy with Adam (default settings), tracking accuracy
model_1.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs, validating on the held-out pairs
fit_kwargs = dict(epochs=5, validation_data=val_questions_dataset)
model_1_history = model_1.fit(train_questions_dataset, **fit_kwargs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Pull the embedding matrix out of the trained model 1
embedding_layer = model_1.get_layer('embedding1')
embed_weights = embedding_layer.get_weights()[0]
print(embed_weights.shape)

(50000, 128)


In [None]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """Summarise classification quality for a set of predictions.

  Args:
    y_true: ground-truth labels (1-D array-like).
    y_pred: predicted labels (1-D array-like).

  Returns:
    dict with "accuracy" expressed as a percentage, plus "precision",
    "recall" and "f1" computed with the "weighted" average.
  """
  precision, recall, f1, _support = precision_recall_fscore_support(
      y_true, y_pred, average="weighted")
  return {"accuracy": accuracy_score(y_true, y_pred) * 100,
          "precision": precision,
          "recall": recall,
          "f1": f1}

In [None]:
# Ground-truth validation labels as a NumPy array.
# `val_labels` is the array `val_labels_data` was built from, so it already holds
# the labels in dataset order; no TF1-style one-shot iterator is needed.

labels = np.asarray(val_labels)

In [None]:
# Predict duplicate probabilities for the validation pairs, then round them to class labels
model_1_pred_probs = model_1.predict(val_questions_dataset)
model_1_preds = tf.round(tf.squeeze(model_1_pred_probs))

model_1_results = calculate_results(labels, model_1_preds)

model_1_results



{'accuracy': 79.71026614697469,
 'precision': 0.7987207787386681,
 'recall': 0.797102661469747,
 'f1': 0.7968363529771887}

# Model 2 - LSTM

In [None]:
# Embedding layer shared by both question towers of model 2
model_2_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer='uniform',
    input_length=max_length,
    name='embedding_2',
)


def _lstm_tower(input_name):
  """Build one question encoder for model 2.

  Raw strings are vectorized with `text_vectorizer`, embedded with the shared
  `model_2_embedding`, passed through a tower-specific LSTM(64), and projected
  by a Dense(128, relu) layer.
  """
  question = layers.Input(shape=[], dtype=tf.string, name=input_name)
  embedded = model_2_embedding(text_vectorizer(question))
  encoded = layers.LSTM(64)(embedded)
  features = layers.Dense(128, activation='relu')(encoded)
  return tf.keras.Model(inputs=question, outputs=features)


token_model_q1 = _lstm_tower('token_input_q1')
token_model_q2 = _lstm_tower('token_input_q2')

# Join the two encodings and classify: Dropout -> Dense(64) -> Dropout -> sigmoid
token_questions_concat = layers.Concatenate(name='token_questions_cat')(
    [token_model_q1.output, token_model_q2.output])
hidden = layers.Dropout(0.5)(token_questions_concat)
hidden = layers.Dense(64, activation='relu')(hidden)
hidden = layers.Dropout(0.5)(hidden)
output_layer = layers.Dense(1, activation='sigmoid')(hidden)

model_2 = tf.keras.Model(
    inputs=[token_model_q1.input, token_model_q2.input],
    outputs=output_layer,
    name='model_2_token',
)

model_2.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs, validating on the held-out pairs
model_2_history = model_2.fit(
    train_questions_dataset,
    validation_data=val_questions_dataset,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Print the layer-by-layer architecture and parameter counts of model_2
model_2.summary()

Model: "model_1_token"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 token_input_q1 (InputLayer)    [(None,)]            0           []                               
                                                                                                  
 token_input_q2 (InputLayer)    [(None,)]            0           []                               
                                                                                                  
 text_vectorization_2 (TextVect  (None, 20)          0           ['token_input_q1[0][0]',         
 orization)                                                       'token_input_q2[0][0]']         
                                                                                                  
 embedding_2 (Embedding)        (None, 20, 128)      6400000     ['text_vectorization_

# Model 3 - GRU

In [None]:
# Embedding layer shared by both question towers of model 3
model_3_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer='uniform',
    input_length=max_length,
    name='embedding_3',
)


def _gru_tower(input_name):
  """Build one question encoder for model 3.

  Raw strings are vectorized with `text_vectorizer`, embedded with the shared
  `model_3_embedding`, passed through a tower-specific GRU(64), and projected
  by a Dense(128, relu) layer.
  """
  question = layers.Input(shape=[], dtype=tf.string, name=input_name)
  embedded = model_3_embedding(text_vectorizer(question))
  encoded = layers.GRU(64)(embedded)
  features = layers.Dense(128, activation='relu')(encoded)
  return tf.keras.Model(inputs=question, outputs=features)


token_model_q1 = _gru_tower('token_input_q1')
token_model_q2 = _gru_tower('token_input_q2')

# Join the two encodings and classify: Dropout -> Dense(64) -> Dropout -> sigmoid
token_questions_concat = layers.Concatenate(name='token_questions_cat')(
    [token_model_q1.output, token_model_q2.output])
hidden = layers.Dropout(0.5)(token_questions_concat)
hidden = layers.Dense(64, activation='relu')(hidden)
hidden = layers.Dropout(0.5)(hidden)
output_layer = layers.Dense(1, activation='sigmoid')(hidden)

model_3 = tf.keras.Model(
    inputs=[token_model_q1.input, token_model_q2.input],
    outputs=output_layer,
    name='model_3_token',
)

model_3.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs, validating on the held-out pairs
model_3_history = model_3.fit(
    train_questions_dataset,
    validation_data=val_questions_dataset,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Print the layer-by-layer architecture and parameter counts of model_3
model_3.summary()

# Model 4 - Bidirectional LSTM

In [None]:
# Embedding layer shared by both question towers of model 4
model_4_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer='uniform',
    input_length=max_length,
    name='embedding_4',
)


def _bilstm_tower(input_name):
  """Build one question encoder for model 4.

  Raw strings are vectorized with `text_vectorizer`, embedded with the shared
  `model_4_embedding`, passed through a tower-specific Bidirectional(LSTM(64)),
  and projected by a Dense(128, relu) layer.
  """
  question = layers.Input(shape=[], dtype=tf.string, name=input_name)
  embedded = model_4_embedding(text_vectorizer(question))
  encoded = layers.Bidirectional(layers.LSTM(64))(embedded)
  features = layers.Dense(128, activation='relu')(encoded)
  return tf.keras.Model(inputs=question, outputs=features)


token_model_q1 = _bilstm_tower('token_input_q1')
token_model_q2 = _bilstm_tower('token_input_q2')

# Join the two encodings and classify: Dropout -> Dense(64) -> Dropout -> sigmoid
token_questions_concat = layers.Concatenate(name='token_questions_cat')(
    [token_model_q1.output, token_model_q2.output])
hidden = layers.Dropout(0.5)(token_questions_concat)
hidden = layers.Dense(64, activation='relu')(hidden)
hidden = layers.Dropout(0.5)(hidden)
output_layer = layers.Dense(1, activation='sigmoid')(hidden)

model_4 = tf.keras.Model(
    inputs=[token_model_q1.input, token_model_q2.input],
    outputs=output_layer,
    name='model_4_token',
)

model_4.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs, validating on the held-out pairs
model_4_history = model_4.fit(
    train_questions_dataset,
    validation_data=val_questions_dataset,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Print the layer-by-layer architecture and parameter counts of model_4
model_4.summary()

Model: "model_4_token"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 token_input_q1 (InputLayer)    [(None,)]            0           []                               
                                                                                                  
 token_input_q2 (InputLayer)    [(None,)]            0           []                               
                                                                                                  
 text_vectorization_2 (TextVect  (None, 20)          0           ['token_input_q1[0][0]',         
 orization)                                                       'token_input_q2[0][0]']         
                                                                                                  
 embedding_4 (Embedding)        (None, 20, 128)      6400000     ['text_vectorization_

# Model 5 - Conv1D

In [None]:
# Embedding layer shared by both question towers of model 5
model_5_embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer='uniform',
    input_length=max_length,
    name='embedding_5',
)


def _conv_tower(input_name):
  """Build one question encoder for model 5.

  Raw strings are vectorized with `text_vectorizer`, embedded with the shared
  `model_5_embedding`, passed through a tower-specific Conv1D (32 filters,
  kernel size 5, relu) followed by global max pooling, and projected by a
  Dense(128, relu) layer.
  """
  question = layers.Input(shape=[], dtype=tf.string, name=input_name)
  embedded = model_5_embedding(text_vectorizer(question))
  convolved = layers.Conv1D(filters=32, kernel_size=5, activation='relu')(embedded)
  pooled = layers.GlobalMaxPool1D()(convolved)
  features = layers.Dense(128, activation='relu')(pooled)
  return tf.keras.Model(inputs=question, outputs=features)


token_model_q1 = _conv_tower('token_input_q1')
token_model_q2 = _conv_tower('token_input_q2')

# Join the two encodings and classify: Dropout -> Dense(64) -> Dropout -> sigmoid
token_questions_concat = layers.Concatenate(name='token_questions_cat')(
    [token_model_q1.output, token_model_q2.output])
hidden = layers.Dropout(0.5)(token_questions_concat)
hidden = layers.Dense(64, activation='relu')(hidden)
hidden = layers.Dropout(0.5)(hidden)
output_layer = layers.Dense(1, activation='sigmoid')(hidden)

model_5 = tf.keras.Model(
    inputs=[token_model_q1.input, token_model_q2.input],
    outputs=output_layer,
    name='model_5_token',
)

model_5.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs, validating on the held-out pairs
model_5_history = model_5.fit(
    train_questions_dataset,
    validation_data=val_questions_dataset,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Print the layer-by-layer architecture and parameter counts of model_5
model_5.summary()

Model: "model_5_token"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 token_input_q1 (InputLayer)    [(None,)]            0           []                               
                                                                                                  
 token_input_q2 (InputLayer)    [(None,)]            0           []                               
                                                                                                  
 text_vectorization_2 (TextVect  (None, 20)          0           ['token_input_q1[0][0]',         
 orization)                                                       'token_input_q2[0][0]']         
                                                                                                  
 embedding_5 (Embedding)        (None, 20, 128)      6400000     ['text_vectorization_

# Model 6 - TF Hub Universal Sentence Encoder

In [None]:
import tensorflow_hub as hub

# TF Hub handle of the Universal Sentence Encoder (v4)
USE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"

# NOTE(review): `embed` is not referenced by any later cell in this notebook;
# consider dropping this standalone load if nothing else needs it.
embed = hub.load(USE_URL)

# Frozen sentence-encoder layer that takes raw strings as input
question_encoder_layer = hub.KerasLayer(
    'https://tfhub.dev/google/universal-sentence-encoder/4',
    input_shape=[],
    dtype=tf.string,
    trainable=False,
    name='USE',
)

In [None]:
def _use_tower(input_name):
  """Build one question encoder for model 6.

  Raw strings go straight into the shared `question_encoder_layer` and the
  resulting sentence embedding is projected by a Dense(128, relu) layer.
  """
  question = layers.Input(shape=[], dtype=tf.string, name=input_name)
  sentence_embedding = question_encoder_layer(question)
  features = layers.Dense(128, activation='relu')(sentence_embedding)
  return tf.keras.Model(inputs=question, outputs=features)


token_model_q1 = _use_tower('token_input_q1')
token_model_q2 = _use_tower('token_input_q2')

# Join the two encodings and classify: Dropout -> Dense(64) -> Dropout -> sigmoid
token_questions_concat = layers.Concatenate(name='token_questions_cat')(
    [token_model_q1.output, token_model_q2.output])
hidden = layers.Dropout(0.5)(token_questions_concat)
hidden = layers.Dense(64, activation='relu')(hidden)
hidden = layers.Dropout(0.5)(hidden)
output_layer = layers.Dense(1, activation='sigmoid')(hidden)

model_6 = tf.keras.Model(
    inputs=[token_model_q1.input, token_model_q2.input],
    outputs=output_layer,
    name='model_6_token',
)

model_6.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs, validating on the held-out pairs
model_6_history = model_6.fit(
    train_questions_dataset,
    validation_data=val_questions_dataset,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Print the layer-by-layer architecture and parameter counts of model_6
model_6.summary()

In [None]:
# Draw model_6's layer graph as a diagram
from tensorflow.keras.utils import plot_model
plot_model(model_6)

# Finding the most wrongly predicted questions

In [None]:
# Score the validation pairs and collect inputs, labels and predictions in one frame.
# NOTE(review): no `model_7` or `val_questions` is defined anywhere in this notebook,
# so this uses model_6 (the last model trained above) - confirm that is the model
# whose errors should be analysed.
val_pred_probs = np.ravel(model_6.predict(val_questions_dataset))
val_preds = np.round(val_pred_probs)

# val_questions_dataset is not shuffled, so predictions line up with val_q1/val_q2/val_labels
val_df = pd.DataFrame({'questions': [f'Q1: {q1}\nQ2: {q2}' for q1, q2 in zip(val_q1, val_q2)],
                       'Is Duplicate?': val_labels,
                       'predicted': val_preds,
                       'pred_prob': val_pred_probs})

val_df.head()

In [None]:
# Keep only the misclassified rows and rank them by how confidently wrong they are.
# For a wrong prediction the error grows with the distance of pred_prob from 0.5,
# so sorting on that distance surfaces confident false positives (prob near 1)
# AND confident false negatives (prob near 0), not just the former.
misclassified = val_df[val_df['Is Duplicate?'] != val_df['predicted']]
wron_predictions = misclassified.sort_values('pred_prob',
                                             key=lambda prob: (prob - 0.5).abs(),
                                             ascending=False)
wron_predictions.head(10)

In [None]:
# Print the ten top-ranked wrong predictions in a readable form
report_columns = ['questions', 'Is Duplicate?', 'predicted', 'pred_prob']
for question, target, pred, pred_prob in wron_predictions.head(10)[report_columns].itertuples(index=False):
  print(f'{question} \nTarget: {target}\nPred: {pred} Pred Prob: {pred_prob}')

# Conclusion

Overall, 8 of the 10 most incorrect predictions made by our models come from the Stack Overflow dataset.