In [17]:
import numpy as np
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Function to preprocess text and extract features
def extract_features(text):
    # Process the text with spaCy
    doc = nlp(text.lower())

    # Tokenize sentences
    sentences = list(doc.sents)
    sentence_count = len(sentences)

    # Tokenize words and remove stopwords
    words_filtered = [token.text for token in doc if token.is_alpha and not token.is_stop]

    # Calculate lexical diversity
    lexical_diversity = len(set(words_filtered)) / len(words_filtered) if len(words_filtered) > 0 else 0

    # Calculate average sentence length
    avg_sentence_length = len(words_filtered) / sentence_count if sentence_count > 0 else 0

    # Part-of-speech tagging to get noun, verb, adjective counts
    noun_count = sum(1 for token in doc if token.pos_ == 'NOUN')
    verb_count = sum(1 for token in doc if token.pos_ == 'VERB')
    adj_count = sum(1 for token in doc if token.pos_ == 'ADJ')

    # Total count of content words
    total_content_words = len(words_filtered)

    # Calculate percentage of nouns, verbs, and adjectives
    noun_percentage = noun_count / total_content_words if total_content_words > 0 else 0
    verb_percentage = verb_count / total_content_words if total_content_words > 0 else 0
    adj_percentage = adj_count / total_content_words if total_content_words > 0 else 0
    other_percentage = 1 - (noun_percentage + verb_percentage + adj_percentage)

    # Readability score (using average sentence length and average token length)
    syllables_per_word = np.mean([len(token) for token in words_filtered])  # Approximation of syllables per word
    readability_score = avg_sentence_length * syllables_per_word

    # Return all features in a dictionary
    features = {
        'readability_score': readability_score,
        'lexical_diversity': lexical_diversity,
        'sentence_count': sentence_count,
        'avg_sentence_length': avg_sentence_length,
        'noun_percentage': noun_percentage,
        'verb_percentage': verb_percentage,
        'adj_percentage': adj_percentage,
        'other_percentage': other_percentage
    }

    return features

# Hardcoded feature data for pdf1, pdf2, pdf3 (same as in the previous step)
pdf_features = {
    'pdf1': {
        'readability_score': 14.8,
        'lexical_diversity': 0.72,
        'sentence_count': 32,
        'avg_sentence_length': 18.4,
        'noun_percentage': 0.40,
        'verb_percentage': 0.22,
        'adj_percentage': 0.16,
        'other_percentage': 0.22
    },
    'pdf2': {
        'readability_score': 13.5,
        'lexical_diversity': 0.68,
        'sentence_count': 28,
        'avg_sentence_length': 20.2,
        'noun_percentage': 0.35,
        'verb_percentage': 0.25,
        'adj_percentage': 0.20,
        'other_percentage': 0.20
    },
    'pdf3': {
        'readability_score': 12.5,
        'lexical_diversity': 0.56,
        'sentence_count': 45,
        'avg_sentence_length': 15.3,
        'noun_percentage': 0.38,
        'verb_percentage': 0.30,
        'adj_percentage': 0.22,
        'other_percentage': 0.10
    }
}

# Function to calculate similarity between two PDFs
def calculate_similarity(pdf1_features, pdf2_features):
    # Extract features into arrays for comparison
    features1 = np.array([
        pdf1_features['readability_score'],
        pdf1_features['lexical_diversity'],
        pdf1_features['sentence_count'],
        pdf1_features['avg_sentence_length'],
        pdf1_features['noun_percentage'],
        pdf1_features['verb_percentage'],
        pdf1_features['adj_percentage'],
        pdf1_features['other_percentage']
    ]).reshape(1, -1)

    features2 = np.array([
        pdf2_features['readability_score'],
        pdf2_features['lexical_diversity'],
        pdf2_features['sentence_count'],
        pdf2_features['avg_sentence_length'],
        pdf2_features['noun_percentage'],
        pdf2_features['verb_percentage'],
        pdf2_features['adj_percentage'],
        pdf2_features['other_percentage']
    ]).reshape(1, -1)

    # Calculate cosine similarity
    similarity = cosine_similarity(features1, features2)
    return similarity[0][0]

# Function to compare pdf4 with pdf1, pdf2, and pdf3
def compare_with_existing_pdfs(pdf4_text, pdf_features):
    # Extract features for pdf4
    pdf4_features = extract_features(pdf4_text)

    results = {}
    for pdf_name, features in pdf_features.items():
        similarity_score = calculate_similarity(pdf4_features, features)
        results[pdf_name] = similarity_score
    return results

# Input the raw text for pdf4
pdf4_text = """
    [Your raw text here for pdf4. It could be a large block of text extracted from the PDF file.]
"""

# Compare pdf4 with pdf1, pdf2, and pdf3
similarity_results = compare_with_existing_pdfs(pdf4_text, pdf_features)

# Display the results
print("Similarity results for pdf4:")
for pdf_name, similarity_score in similarity_results.items():
    print(f"Similarity with {pdf_name}: {similarity_score:.2f}")

# Determine likelihood of similar authors
threshold = 0.8  # A threshold to decide if the documents are by the same author
for pdf_name, similarity_score in similarity_results.items():
    if similarity_score > threshold:
        print(f"Likelihood of similar authors: High for {pdf_name}")
    else:
        print(f"Likelihood of similar authors: Low for {pdf_name}")

Similarity results for pdf4:
Similarity with pdf1: 0.54
Similarity with pdf2: 0.55
Similarity with pdf3: 0.41
Likelihood of similar authors: Low for pdf1
Likelihood of similar authors: Low for pdf2
Likelihood of similar authors: Low for pdf3


***UNIT TESTING***

In [18]:
import unittest

class TestPDFComparison(unittest.TestCase):

    def test_extract_features(self):
        sample_text = "The quick brown fox jumps over the lazy dog."
        features = extract_features(sample_text)
        self.assertIsInstance(features, dict)
        self.assertIn('readability_score', features)
        self.assertGreater(features['readability_score'], 0)

    def test_calculate_similarity(self):
        pdf1_features = pdf_features['pdf1']
        pdf2_features = pdf_features['pdf2']
        similarity = calculate_similarity(pdf1_features, pdf2_features)
        self.assertGreaterEqual(similarity, 0)
        self.assertLessEqual(similarity, 1)

    def test_compare_with_existing_pdfs(self):
        pdf4_text = "This is a sample text for PDF 4."
        results = compare_with_existing_pdfs(pdf4_text, pdf_features)
        self.assertIsInstance(results, dict)
        for key, value in results.items():
            self.assertIn(key, pdf_features.keys())
            self.assertGreaterEqual(value, 0)
            self.assertLessEqual(value, 1)

# This will run the tests only if the script is executed directly, not when imported
def run_tests():
    suite = unittest.TestLoader().loadTestsFromTestCase(TestPDFComparison)
    unittest.TextTestRunner().run(suite)

run_tests()

...
----------------------------------------------------------------------
Ran 3 tests in 0.033s

OK


***PERFORMANCE TESTING***

In [19]:
import time

def test_performance():
    pdf4_text = " ".join(["This is a test sentence."] * 1000)
    start_time = time.time()
    features = extract_features(pdf4_text)
    print("Feature extraction time:", time.time() - start_time)

    start_time = time.time()
    results = compare_with_existing_pdfs(pdf4_text, pdf_features)
    print("Comparison time:", time.time() - start_time)

if __name__ == '__main__':
    test_performance()

Feature extraction time: 0.946648120880127
Comparison time: 1.3886218070983887


***REGRESSION TESTING***

In [20]:
def test_regression():

    expected_similarity = {
        'pdf1': 0.85,
        'pdf2': 0.77,
        'pdf3': 0.80
    }

    pdf4_text = """
        This is a new document that will be compared against the previous ones. The purpose of this comparison is to identify similarities and differences in the writing style, tone, and structure. It contains multiple sentences, each with its own unique structure and content.
    """

    pdf4_features = extract_features(pdf4_text)
    print(f"Extracted features for pdf4: {pdf4_features}")

    pdf1_features = pdf_features['pdf1']
    print(f"Extracted features for pdf1: {pdf1_features}")

    results = compare_with_existing_pdfs(pdf4_text, pdf_features)

    for pdf, score in expected_similarity.items():
        actual_score = results[pdf]
        print(f"Expected similarity for {pdf}: {score:.2f}, Actual: {actual_score:.2f}")

if __name__ == '__main__':
    test_regression()

Extracted features for pdf4: {'readability_score': 50.333333333333336, 'lexical_diversity': 0.95, 'sentence_count': 3, 'avg_sentence_length': 6.666666666666667, 'noun_percentage': 0.65, 'verb_percentage': 0.15, 'adj_percentage': 0.25, 'other_percentage': -0.050000000000000044}
Extracted features for pdf1: {'readability_score': 14.8, 'lexical_diversity': 0.72, 'sentence_count': 32, 'avg_sentence_length': 18.4, 'noun_percentage': 0.4, 'verb_percentage': 0.22, 'adj_percentage': 0.16, 'other_percentage': 0.22}
Expected similarity for pdf1: 0.85, Actual: 0.48
Expected similarity for pdf2: 0.77, Actual: 0.48
Expected similarity for pdf3: 0.80, Actual: 0.35


***INTEGRATION TESTING***

In [21]:
def test_integration():
    pdf4_text = "Integration test text to evaluate the system end-to-end."
    results = compare_with_existing_pdfs(pdf4_text, pdf_features)
    assert isinstance(results, dict), "Integration test failed: Results should be a dictionary"
    for pdf_name, similarity_score in results.items():
        assert 0 <= similarity_score <= 1, f"Invalid similarity score for {pdf_name}: {similarity_score}"

if __name__ == '__main__':
    test_integration()

***USABILITY TESTING***

In [22]:
def test_usability():
    # Test 1: Empty input
    empty_text = ""
    empty_features = extract_features(empty_text)
    print("Features for empty input:", empty_features)

    # Test 2: Normal input
    normal_text = """
    This is a test document. It contains several sentences with different word structures and parts of speech.
    The purpose of this document is to check the functionality of the feature extraction.
    """
    normal_features = extract_features(normal_text)
    print("Features for normal input:", normal_features)

    # Test 3: Similarity calculation on extracted features
    pdf_features = {
        'pdf1': {
            'readability_score': 14.8,
            'lexical_diversity': 0.72,
            'sentence_count': 32,
            'avg_sentence_length': 18.4,
            'noun_percentage': 0.40,
            'verb_percentage': 0.22,
            'adj_percentage': 0.16,
            'other_percentage': 0.22
        }
    }
    similarity_result = compare_with_existing_pdfs(normal_text, pdf_features)
    print("Similarity results for normal input:", similarity_result)

if __name__ == '__main__':
    test_usability()

Features for empty input: {'readability_score': nan, 'lexical_diversity': 0, 'sentence_count': 0, 'avg_sentence_length': 0, 'noun_percentage': 0, 'verb_percentage': 0, 'adj_percentage': 0, 'other_percentage': 1}
Features for normal input: {'readability_score': 37.666666666666664, 'lexical_diversity': 0.9333333333333333, 'sentence_count': 3, 'avg_sentence_length': 5.0, 'noun_percentage': 0.8, 'verb_percentage': 0.13333333333333333, 'adj_percentage': 0.13333333333333333, 'other_percentage': -0.06666666666666665}
Similarity results for normal input: {'pdf1': 0.49208162344561046}


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
