<a href="https://colab.research.google.com/github/cur10usityDrives/Text-Similarity-Analysis/blob/main/text_similarity_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('words')
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

english_words = set(nltk.corpus.words.words())

# Preprocess the English words corpus
def preprocess_corpus(corpus):
    preprocessed_corpus = []
    for word in corpus:
        # Tokenize the word (treat each word as a "document")
        tokens = [word]
        # Remove stop words
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]
        # Stem words
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
        preprocessed_corpus.append(tokens)
    return preprocessed_corpus

preprocessed_corpus = preprocess_corpus(english_words)

def preprocess_text(text, to_stem=True, to_remove_stop_words=True):
    tokens = word_tokenize(text.lower())
    if to_remove_stop_words:
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
    if to_stem:
        stemmer = PorterStemmer()
        stem_tokens = [stemmer.stem(token) for token in tokens]
        return ' '.join(stem_tokens)
    else:
        return ' '.join(tokens)

def encode_sentence(sentence, encoding_type='one-hot', to_stem=True, to_remove_stop_words=True):
    processed_sentence = preprocess_text(sentence, to_stem=to_stem, to_remove_stop_words=to_remove_stop_words)
    if encoding_type == 'one-hot':
        vectorizer = CountVectorizer(binary=True)
    elif encoding_type == 'bag-of-words':
        vectorizer = CountVectorizer()
    elif encoding_type == 'tf':
        vectorizer = CountVectorizer()
    elif encoding_type == 'tfidf':
        vectorizer = TfidfVectorizer()
    else:
        raise ValueError("Unsupported encoding type")
    encoded_sentence = vectorizer.fit_transform([processed_sentence]).toarray()
    return encoded_sentence.flatten()

def encode_and_compare_sentences(sentence1, sentence2, encoding_type='one-hot', to_stem=True, to_remove_stop_words=True):
    encoded_sentence1 = encode_sentence(sentence1, encoding_type=encoding_type, to_stem=to_stem, to_remove_stop_words=to_remove_stop_words)
    encoded_sentence2 = encode_sentence(sentence2, encoding_type=encoding_type, to_stem=to_stem, to_remove_stop_words=to_remove_stop_words)

    # Pad the shorter sentence with zeros to match the length of the longer one
    max_length = max(len(encoded_sentence1), len(encoded_sentence2))
    encoded_sentence1 = np.pad(encoded_sentence1, (0, max_length - len(encoded_sentence1)))
    encoded_sentence2 = np.pad(encoded_sentence2, (0, max_length - len(encoded_sentence2)))

    similarity = np.dot(encoded_sentence1, encoded_sentence2) / (np.linalg.norm(encoded_sentence1) * np.linalg.norm(encoded_sentence2))
    return similarity

def evaluate_scenarios(scenarios):
    for scenario in scenarios:
        print(f"Scenario: {scenario['name']}")
        pairs = scenario.get('pairs', [])
        for i, pair in enumerate(pairs, 1):
            sentence1 = pair['sentence1']
            sentence2 = pair['sentence2']
            to_stem = scenario.get('to_stem', True)
            to_remove_stop_words = scenario.get('to_remove_stop_words', True)
            encoding_type = scenario['encoding_type']

            # Original form
            similarity_original = encode_and_compare_sentences(sentence1, sentence2, encoding_type=encoding_type, to_stem=False, to_remove_stop_words=False)

            # After stop-word removal/stemming
            sentence1_processed = preprocess_text(sentence1, to_stem=to_stem, to_remove_stop_words=to_remove_stop_words)
            sentence2_processed = preprocess_text(sentence2, to_stem=to_stem, to_remove_stop_words=to_remove_stop_words)
            similarity_processed = encode_and_compare_sentences(sentence1, sentence2, encoding_type=encoding_type, to_stem=to_stem, to_remove_stop_words=to_remove_stop_words)

            print(f"Pair {i}:")
            print(f"Original Similarity: {similarity_original if similarity_original < 1 else round(similarity_original, 2)}")
            print(f"After Stop-word removal/Stemming Similarity: {similarity_processed if similarity_processed < 1 else round(similarity_processed, 2)}")
            print("")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [12]:
# Test scenarios
scenarios = [
    {
        'name': 'Stop-word removal only with ohe',
        'to_stem': False,
        'to_remove_stop_words': True,
        'encoding_type': 'one-hot',
        'pairs': [
            {
                'sentence1': "The cat chased the mouse around the house.",
                'sentence2': "A mouse was not being chased by a cat inside the house."
            },
            {
                'sentence1': "The brown dog is lazy.",
                'sentence2': "This is not a lazy brown dog."
            },
            {
                'sentence1': "The quick brown fox jumps over the lazy dog.",
                'sentence2': "A fast brown fox leaps over a tired dog."
            },
            {
                'sentence1': "This is a very excruciating task targeting the dedicated route.",
                'sentence2': "Excruciating tasks ought to be targeted with dedication."
            }
        ]
    },
    {
        'name': 'Stemming only with ohe',
        'to_stem': True,
        'to_remove_stop_words': False,
        'encoding_type': 'one-hot',
        'pairs': [
            {
                'sentence1': "The cat chased the mouse around the house.",
                'sentence2': "A mouse was not being chased by a cat inside the house."
            },
            {
                'sentence1': "The brown dog is lazy.",
                'sentence2': "This is not a lazy brown dog."
            },
            {
                'sentence1': "The quick brown fox jumps over the lazy dog.",
                'sentence2': "A fast brown fox leaps over a tired dog."
            },
            {
                'sentence1': "This is a very excruciating task targeting the dedicated route.",
                'sentence2': "Excruciating tasks ought to be targeted with dedication."
            }
        ]
    },
    {
        'name': 'Stop-word removal and Stemming with ohe',
        'to_stem': True,
        'to_remove_stop_words': True,
        'encoding_type': 'one-hot',
        'pairs': [
            {
                'sentence1': "The cat chased the mouse around the house.",
                'sentence2': "A mouse was not being chased by a cat inside the house."
            },
            {
                'sentence1': "The brown dog is lazy.",
                'sentence2': "This is not a lazy brown dog."
            },
            {
                'sentence1': "The quick brown fox jumps over the lazy dog.",
                'sentence2': "A fast brown fox leaps over a tired dog."
            },
            {
                'sentence1': "This is a very excruciating task targeting the dedicated route.",
                'sentence2': "Excruciating tasks ought to be targeted with dedication."
            }
        ]
    },
    {
        'name': 'Stop-word removal and Stemming with tfxidf',
        'to_stem': True,
        'to_remove_stop_words': True,
        'encoding_type': 'tfidf',
        'pairs': [
            {
                'sentence1': "The cat chased the mouse around the house.",
                'sentence2': "A mouse was not being chased by a cat inside the house."
            },
            {
                'sentence1': "The brown dog is lazy.",
                'sentence2': "This is not a lazy brown dog."
            },
            {
                'sentence1': "The quick brown fox jumps over the lazy dog.",
                'sentence2': "A fast brown fox leaps over a tired dog."
            },
            {
                'sentence1': "This is a very excruciating task targeting the dedicated route.",
                'sentence2': "Excruciating tasks ought to be targeted with dedication."
            }
        ]
    },
        {
        'name': 'Stop-word removal and Stemming with tf',
        'to_stem': True,
        'to_remove_stop_words': True,
        'encoding_type': 'tf',
        'pairs': [
            {
                'sentence1': "The cat chased the mouse around the house.",
                'sentence2': "A mouse was not being chased by a cat inside the house."
            },
            {
                'sentence1': "The brown dog is lazy.",
                'sentence2': "This is not a lazy brown dog."
            },
            {
                'sentence1': "The quick brown fox jumps over the lazy dog.",
                'sentence2': "A fast brown fox leaps over a tired dog."
            },
            {
                'sentence1': "This is a very excruciating task targeting the dedicated route.",
                'sentence2': "Excruciating tasks ought to be targeted with dedication."
            }
        ]
    },
    {
        'name': 'Stop-word removal and Stemming with bag-of-words',
        'to_stem': True,
        'to_remove_stop_words': True,
        'encoding_type': 'bag-of-words',
        'pairs': [
            {
                'sentence1': "The cat chased the mouse around the house.",
                'sentence2': "A mouse was not being chased by a cat inside the house."
            },
            {
                'sentence1': "The brown dog is lazy.",
                'sentence2': "This is not a lazy brown dog."
            },
            {
                'sentence1': "The quick brown fox jumps over the lazy dog.",
                'sentence2': "A fast brown fox leaps over a tired dog."
            },
            {
                'sentence1': "This is a very excruciating task targeting the dedicated route.",
                'sentence2': "Excruciating tasks ought to be targeted with dedication."
            }
        ]
    }
    # Add more scenarios as needed
]

evaluate_scenarios(scenarios)

Scenario: Stop-word removal only with ohe
Pair 1:
Original Similarity: 0.7745966692414834
After Stop-word removal/Stemming Similarity: 0.9999999999999998

Pair 2:
Original Similarity: 0.9128709291752769
After Stop-word removal/Stemming Similarity: 1.0

Pair 3:
Original Similarity: 0.9354143466934852
After Stop-word removal/Stemming Similarity: 1.0

Pair 4:
Original Similarity: 0.9428090415820632
After Stop-word removal/Stemming Similarity: 0.9999999999999998

Scenario: Stemming only with ohe
Pair 1:
Original Similarity: 0.7745966692414834
After Stop-word removal/Stemming Similarity: 0.7745966692414834

Pair 2:
Original Similarity: 0.9128709291752769
After Stop-word removal/Stemming Similarity: 0.9128709291752769

Pair 3:
Original Similarity: 0.9354143466934852
After Stop-word removal/Stemming Similarity: 0.9354143466934852

Pair 4:
Original Similarity: 0.9428090415820632
After Stop-word removal/Stemming Similarity: 0.9428090415820632

Scenario: Stop-word removal and Stemming with ohe
P