In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [4]:
# Fetch the NLTK resources used later for tokenization and stop-word removal.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Read the scored text pairs and pull out the two text columns plus the target.
df = pd.read_csv("/content/output_dataset.csv")
text1, text2 = (df[column].tolist() for column in ('text1', 'text2'))
similarity_scores = df['similarity_score'].values  # regression target, one value per pair

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# English stop words, loaded lazily on first use so the NLTK corpus is read
# only once rather than once per document.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on the first call only."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize one document for the tokenizer.

    The text is lowercased, stripped of ASCII punctuation, tokenized with
    NLTK's ``word_tokenize`` and filtered against the English stop-word list.

    Args:
        text: The raw document. ``None`` and float NaN (e.g. a missing cell
            in a DataFrame column) are treated as an empty document; any
            other non-string value is converted with ``str()``.

    Returns:
        The remaining tokens joined by single spaces (``''`` for missing input).
    """
    # Missing values would otherwise crash on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    lowered = str(text).lower()
    without_punctuation = lowered.translate(_PUNCTUATION_TABLE)
    tokens = word_tokenize(without_punctuation)

    stop_words = _english_stop_words()
    return ' '.join(word for word in tokens if word not in stop_words)

In [3]:
# Clean both text columns with the shared preprocessing routine.
text1 = list(map(preprocess_text, text1))
text2 = list(map(preprocess_text, text2))

# Vocabulary / sequence limits used by the tokenizer and the model input.
max_words = 10000
max_seq_length = 100  # every sequence is padded or truncated to this length

# Fit a single vocabulary on both columns so the two inputs share word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts([*text1, *text2])

# Convert each text to a sequence of word indices, then pad to a fixed length.
sequences1 = tokenizer.texts_to_sequences(text1)
sequences2 = tokenizer.texts_to_sequences(text2)
X1, X2 = (
    pad_sequences(sequences, maxlen=max_seq_length)
    for sequences in (sequences1, sequences2)
)

In [6]:
# One input per text of the pair, each a padded sequence of max_seq_length entries.
input1 = Input(shape=(max_seq_length,))
input2 = Input(shape=(max_seq_length,))

# Shared encoder: both branches go through the same embedding and LSTM layer
# objects, so their weights are tied.
embedding_layer = Embedding(input_dim=max_words, output_dim=128)
lstm_layer = LSTM(128)

embedded1, embedded2 = embedding_layer(input1), embedding_layer(input2)
output1, output2 = lstm_layer(embedded1), lstm_layer(embedded2)

# Regression head on top of the concatenated branch encodings.
merged_output = Concatenate()([output1, output2])
dense_layer = Dense(64, activation='relu')(merged_output)
dropout_layer = Dropout(0.2)(dense_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)  # single score in (0, 1)

siamese_model = Model(inputs=[input1, input2], outputs=output_layer)
siamese_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mean_squared_error',
    metrics=['mse'],
)

In [7]:
# Train the Siamese network
siamese_model.fit([X1_train, X2_train], y_train, batch_size=64, epochs=3, validation_data=([X1_test, X2_test], y_test))

# Evaluate the model
mse = siamese_model.evaluate([X1_test, X2_test], y_test)
print("Mean Squared Error:", mse)

# Predict similarity scores
predicted_similarity_scores = siamese_model.predict([X1_test, X2_test])

# Print or use the predicted similarity scores
print(predicted_similarity_scores)

# Save the predicted scores to a file
np.savetxt('predicted_scores.txt', predicted_similarity_scores)
from tensorflow.keras.models import load_model

Epoch 1/3
Epoch 2/3
Epoch 3/3
Mean Squared Error: [0.012392329052090645, 0.012392329052090645]
[[0.1315191 ]
 [0.13171788]
 [0.12886137]
 [0.12460367]
 [0.12862718]
 [0.13302852]
 [0.13233845]
 [0.12823024]
 [0.12911345]
 [0.12895036]
 [0.13793676]
 [0.12498121]
 [0.14120688]
 [0.1385339 ]
 [0.13922411]
 [0.12904719]
 [0.13972357]
 [0.1347349 ]
 [0.12789705]
 [0.13100061]
 [0.13793546]
 [0.12463143]
 [0.11908632]
 [0.1312542 ]
 [0.13254954]
 [0.13542788]
 [0.13073653]
 [0.12827288]
 [0.13322833]
 [0.12871464]
 [0.14106461]
 [0.12942328]
 [0.13211535]
 [0.13309014]
 [0.13004059]
 [0.12761243]
 [0.12899968]
 [0.13520421]
 [0.12850815]
 [0.12074081]
 [0.13545322]
 [0.1280843 ]
 [0.13512419]
 [0.13309063]
 [0.12292563]
 [0.1312196 ]
 [0.1383144 ]
 [0.14390397]
 [0.13483743]
 [0.12859939]
 [0.13955791]
 [0.12977266]
 [0.13811237]
 [0.13588002]
 [0.12701182]
 [0.13320252]
 [0.13776499]
 [0.12642725]
 [0.13126412]
 [0.14241934]
 [0.1310478 ]
 [0.1376933 ]
 [0.12643987]
 [0.12509856]
 [0.12680

In [3]:
# Flatten the predictions into a single column and write them to CSV
# without the index column.
score_column = predicted_similarity_scores.flatten()
df_predicted_scores = pd.DataFrame({"Predicted_Similarity_Score": score_column})

df_predicted_scores.to_csv("predicted_scores.csv", index=False)


In [8]:
# `text2` is built with `.tolist()` and later rebuilt as a list, so calling
# `.tolist()` on it again (as an earlier draft of this cell did) fails and no
# conversion is needed. Check that assumption explicitly instead of
# re-assigning the name to itself.
assert isinstance(text2, list), "text2 is expected to be a list of strings"

In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-no

In [None]:
# NOTE(review): `<notebook_name>` is an unfilled placeholder. The shell reads
# the angle brackets as input redirection, so this command fails until the
# placeholder is replaced with the real notebook filename.
!jupyter nbconvert --execute --inplace <notebook_name>.ipynb

/bin/bash: line 1: notebook_name: No such file or directory


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from pprint import pprint


In [None]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
sentences = ['dkbfnjnsdmcm', 'dccdsncmmsdcnldnsljcn', 'djjdsshgfebsygdjnr', 'jdcdcnfn']

# Encode once and keep the result: the loop and the similarity computation
# below both need the embeddings.
sentence_embeddings = model.encode(sentences)

for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentences:", sentence)
    print("Embidding:", embedding)
    print("")

# cosine_similarity expects 2-D inputs, hence the reshape to a single row;
# [0][0] extracts the scalar from the resulting 1x1 matrix.
similarity = cosine_similarity(
    sentence_embeddings[0].reshape(1, -1),
    sentence_embeddings[1].reshape(1, -1),
)[0][0]
pprint('Similarity between {} and {} is {}'.format(sentences[0], sentences[1], similarity))

Sentences: dkbfnjnsdmcm
Embidding: [-7.06908166e-01  2.42975384e-01 -4.77411568e-01 -3.34426403e-01
 -2.21699789e-01 -4.28453147e-01 -7.97638893e-02  2.93154418e-01
  2.83038169e-01  9.82700959e-02  4.60342497e-01 -5.57050705e-01
  2.56028265e-01 -3.84910762e-01 -6.16543889e-02  1.10459290e-01
 -5.41010313e-02  1.96656018e-01  1.96700364e-01  2.69891560e-01
 -4.14550692e-01  9.00356919e-02  1.77400395e-01  3.45328078e-02
  1.88699901e-01 -4.23653647e-02  2.20514506e-01  9.33691934e-02
  5.24440035e-02  1.15114830e-01  3.02167088e-01  8.57698083e-01
  6.35086074e-02  2.12872168e-03  5.98978162e-01  3.49051297e-01
 -4.84215170e-01 -1.00023702e-01 -3.37231398e-01 -2.71778435e-01
  1.80415630e-01 -1.70476332e-01  3.78066719e-01  1.00237191e-01
 -2.06863448e-01 -2.36364529e-01 -1.97861582e-01  2.51999319e-01
 -4.97433096e-01  2.48826951e-01  6.41578197e-01 -2.56100893e-01
  3.98233309e-02 -1.40522269e-03  1.02292299e-01 -9.66504142e-02
 -1.35255367e-01  6.67557061e-01  9.40162390e-02  3.995

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Read the raw text pairs that still need a similarity score.
df = pd.read_csv("/content/DataNeuron_Text_Similarity.csv")

# Pre-trained sentence encoder shared by every row.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


def calculate_similarity(row):
    """Return the cosine similarity between the embeddings of one text pair.

    Args:
        row: A mapping-like row exposing ``'text1'`` and ``'text2'``.

    Returns:
        The cosine similarity of the two sentence embeddings as a scalar.
    """
    first_vector = model.encode(row['text1'])
    second_vector = model.encode(row['text2'])

    # cosine_similarity works on 2-D inputs, so wrap each vector in a list and
    # take the single entry of the resulting 1x1 matrix.
    pairwise = cosine_similarity([first_vector], [second_vector])
    return pairwise[0][0]


# Score every row and keep the result alongside the texts.
df['similarity_score'] = df.apply(calculate_similarity, axis=1)

# Show the frame including the new column.
print(df)


                                                  text1  \
0     broadband challenges tv viewing the number of ...   
1     rap boss arrested over drug find rap mogul mar...   
2     player burn-out worries robinson england coach...   
3     hearts of oak 3-2 cotonsport hearts of oak set...   
4     sir paul rocks super bowl crowds sir paul mcca...   
...                                                 ...   
1389  millions buy mp3 players in us one in 10 adult...   
1390  record year for chilean copper chile s copper ...   
1391  ferguson hails man utd s resolve manchester un...   
1392  franz man seeks government help franz ferdinan...   
1393  thanou bullish over drugs hearing katerina tha...   

                                                  text2  similarity_score  
0     gardener wins double in glasgow britain s jaso...          0.073249  
1     amnesty chief laments war failure the lack of ...          0.181460  
2     hanks greeted at wintry premiere hollywood sta...        

In [None]:
# Persist the scored DataFrame (index column omitted) and report the target file.
output_filename = "output_dataset.csv"
df.to_csv(path_or_buf=output_filename, index=False)

print("DataFrame with similarity scores saved to:", output_filename)

DataFrame with similarity scores saved to: output_dataset.csv
