In [18]:
import numpy as np
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

# Load the spaCy English pipeline ("en_core_web_sm") once at module level;
# extract_features() reuses this shared `nlp` object for every text it processes.
nlp = spacy.load("en_core_web_sm")

# Function to preprocess text and extract features
def extract_features(text):
    """Compute simple stylometric features for a piece of raw text.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. "Content words" are the tokens that are alphabetic and are not
    stop words.

    Args:
        text: Raw document text. ``None``, empty and whitespace-only input
            are all treated as "no text".

    Returns:
        dict with the keys ``readability_score``, ``lexical_diversity``,
        ``sentence_count``, ``avg_sentence_length``, ``noun_percentage``,
        ``verb_percentage``, ``adj_percentage`` and ``other_percentage``.
        Every value is 0 when there is no text; when the text has sentences
        but no content words, only ``sentence_count`` is non-zero.
    """
    # Single definition of the "nothing to measure" result.
    empty_features = {
        'readability_score': 0,
        'lexical_diversity': 0,
        'sentence_count': 0,
        'avg_sentence_length': 0,
        'noun_percentage': 0,
        'verb_percentage': 0,
        'adj_percentage': 0,
        'other_percentage': 0
    }

    # Bail out before paying for the spaCy pipeline when there is no text.
    if text is None or not text.strip():
        return empty_features

    # Process the text with spaCy
    doc = nlp(text.lower())
    sentence_count = len(list(doc.sents))

    # Content words: alphabetic tokens that are not stop words
    words_filtered = [token.text for token in doc if token.is_alpha and not token.is_stop]

    # Text made only of digits/punctuation/stop words: keep the sentence
    # count, zero everything else.
    if not words_filtered:
        return {**empty_features, 'sentence_count': sentence_count}

    total_content_words = len(words_filtered)

    # Share of distinct content words among all content words
    lexical_diversity = len(set(words_filtered)) / total_content_words

    # Content words per sentence
    avg_sentence_length = total_content_words / sentence_count if sentence_count > 0 else 0

    # One pass over the document to tally part-of-speech tags
    pos_counts = Counter(token.pos_ for token in doc)
    noun_percentage = pos_counts['NOUN'] / total_content_words
    verb_percentage = pos_counts['VERB'] / total_content_words
    adj_percentage = pos_counts['ADJ'] / total_content_words

    # POS tags are counted over every token while the denominator is the
    # content-word count, so the three ratios can add up to more than 1.
    # Clamp so the remainder is never reported as a negative share.
    other_percentage = max(0.0, 1 - (noun_percentage + verb_percentage + adj_percentage))

    # Readability proxy: words per sentence times mean word length in
    # characters (character length stands in for a syllable count).
    avg_word_length = np.mean([len(word) for word in words_filtered])
    readability_score = avg_sentence_length * avg_word_length

    return {
        'readability_score': readability_score,
        'lexical_diversity': lexical_diversity,
        'sentence_count': sentence_count,
        'avg_sentence_length': avg_sentence_length,
        'noun_percentage': noun_percentage,
        'verb_percentage': verb_percentage,
        'adj_percentage': adj_percentage,
        'other_percentage': other_percentage
    }

# Hardcoded reference features for pdf1, pdf2 and pdf3, written as one row per
# document; the column order is the tuple of feature names passed to zip().
pdf_features = {
    doc_name: dict(zip(
        (
            'readability_score',
            'lexical_diversity',
            'sentence_count',
            'avg_sentence_length',
            'noun_percentage',
            'verb_percentage',
            'adj_percentage',
            'other_percentage',
        ),
        row,
    ))
    for doc_name, row in (
        ('pdf1', (14.8, 0.72, 32, 18.4, 0.40, 0.22, 0.16, 0.22)),
        ('pdf2', (13.5, 0.68, 28, 20.2, 0.35, 0.25, 0.20, 0.20)),
        ('pdf3', (12.5, 0.56, 45, 15.3, 0.38, 0.30, 0.22, 0.10)),
    )
}

# Function to calculate similarity between two PDFs
def calculate_similarity(pdf1_features, pdf2_features):
    """Return the cosine similarity between two feature dictionaries.

    Both arguments must provide the eight feature keys listed below; the
    values are laid out in that fixed order as 1 x 8 row vectors before being
    handed to ``cosine_similarity``.
    """
    ordered_keys = (
        'readability_score',
        'lexical_diversity',
        'sentence_count',
        'avg_sentence_length',
        'noun_percentage',
        'verb_percentage',
        'adj_percentage',
        'other_percentage',
    )

    def as_row(feature_map):
        # One document becomes a single-row 2-D array.
        return np.array([feature_map[key] for key in ordered_keys]).reshape(1, -1)

    score_matrix = cosine_similarity(as_row(pdf1_features), as_row(pdf2_features))
    # The result is a 1 x 1 matrix; unwrap its only entry.
    return score_matrix[0][0]

# Function to compare pdf4 with pdf1, pdf2, and pdf3
def compare_with_existing_pdfs(pdf4_text, pdf_features):
    """Score raw text against every reference document.

    Features are extracted from ``pdf4_text`` once, then compared with each
    entry of ``pdf_features`` (a mapping of document name to feature dict).

    Returns:
        dict mapping each reference document name to its similarity score,
        in the iteration order of ``pdf_features``.
    """
    candidate = extract_features(pdf4_text)
    return {
        label: calculate_similarity(candidate, reference)
        for label, reference in pdf_features.items()
    }

# Input the raw text for pdf4
pdf4_text = """wilpbits-Pilomic

using where C, A, B are num matrices, the ith row of the matrix c fal 12in is obtained by taking a lineit Combination of the rows of B where the combining coefficients come from the ith row of A, show that the inverse of a lowar Triangulit matrix is a low.h Triangular matrix. only the idea that equation C= AB
Ans Given Data C= AB, where A, B, Core myn matrice भ Considr C.; = = Ask Buj kal we have to prove that if A is lower Triangular matrine, then is also lower Triangular matrix. D Now considal 3x3 matria, je n=3


Janagara Veeranjaneyaly-2023ac05675@wilpabits-Pianiac.in Now consider A3x3 of Lower Triangular matrie a ་་་ 0 0 A₂ f 921 9220 932933 031 b12 b13 3,2 Let B₂ b21 b22 b23 b31 632 33 D33 where B is the inverse of A. we know that, LL': I from the given equation C= AB by obsering above two equations AB= I I is the identity matode 0 0 all ཎ་༥ biz b13 100] 0 921 9122 b₂, baz b23: 010 932933 1931 by by by 0 0 1 abi. A

Janagam Veeranjaneyaly-2023ac05 675 @wifp.bill-pilaniacon a₁ b₁₁ 911613 911612 92, but ang bal 92112+922 622 92163992623 b11+ 90262, 1933134 93, 42 +932420 +933632 931-131932+237 +933 By equality of matrycy 911013-0 -a₁, b₁₁ = 01 91142= b13=0 112=0 =) 9113 Again from egality of matricy Diary 192, 613 +922 b23 = 0 = 921 (0) + 922 623 = 0 [0-13=0] R X 922623=0 from the matrix A, we observe that 922 70, Since A is upper Triangular mabre. 20 b23 = 0 when we observe the Elements of matraß b₁₂ = 0, b13=0, b2320 [proved] How the matrix B, become о bil B 2 This is a lower Triangular matrix 621 22 0 b31 632 633 632633

Janagar Veeranjaney414_2023ac05675@wiepobity-filan iloac.in Conclusiont Inverse of a lower Triangular matrix is also a lower Triangular matria 2 If an integer in is an Eigen Value of square matrix where all Elements of a matrix belong to the set of integers, then prove that determinant of that matrix is nk, where k is an integer, And = Given data o is an Eigen value of square matria we have to prove that Determinant of the matrix is nk, where K is an integer. Given a matrix A, Let an Eigenvector = x and its corresponding Eigen valoric => Ax-xx Now, the characteristic polynomial of matrix A P(X) =|A=λI| (or) | XI-A| The roots of this polynomial equation are Eigen valves → (1-λ) (1-№2) (1-3) --(-)=(\) where 1, 12, g--- are, roots (or) Eigen values for matrix A. Scanned with OKEN Scanner

Janagam Veeranjaneyal 4_2023ac05675@wilp-bits-pilariasin from P(A) = />I- A) Let λ=0 → P(0)= |OI-Al 800726171A1 we know that | KA|= K|A| -P(0)= (-))" | A A) Now P(^)= (^-^1) (^-^3) --- (^_^) → R (0) = (-^₁) (-^2)---(^_^) >> P()= (-1) 12 from and ② तेल হৈ (-1) 10/2 (1) di |A|= 7, 12 λm. Xi Consider λ = n; 1- Assume that k≤ λ₂-x3-^m, It is an integer. These are product of roots of polynomial with integer coefficient. • |A)=nk, where k is integer. Conclusiont if n is an Eigen valve of square matrix then determinant of that Matrix is nks, where I is an integer,

Janagam Veeranjaneya14-20239605675@will. Bits-Pilani.ac.in. If A is a invertible square matrix of order n', Then prove or dis prove that rank (A) = rank (AB) Any Given that of A is a invertible square matrix o order m It means that |A/70 Consider matrix B, which is compatible to multiple with multiplication of AB. Since A is invertible and nxx matria rank (A)=n) a we know that, rank (AB) <min (rank(A) Y|<(B) rank (AB) 2 mm (m, rant(), (. )(A)=-2) →rank (AB) ≤ min (m, rank (B)) = raak(B) 1rm\(AB) < rank (B) 29-① Consider AB Considr AB R =(AB) A (AB)AY = A (BAT) A (BAY) AB (AB) AB The rank of a matrice is invariant ander multiplication by an invertable matrix rank (AB) = Yam (B) → ε9-2

Janagam Veeranjaneyal4-2023ac05675@wilp.bils-pilanj.ac.in from ε9-0 & 09-① rank (AB) = rank (B) Since A is invertible and romja (A) = m, -rank (A)=ram (B) Conclusions for An invertible square matrix. A and matrix B, ranje (A)= ranje (AB) only if B also ham ranke n The above statement is not always true conless B also has rank n. Q Ans Griven Data In the given matrix A, a=1, b=~ A²= 8I, I is identify matione if x is Eigen value of A, & X is Eigenvect of A Then PA-λEDX=0 - AX-> I· x=0. A X = X X = A.Ax2 A. dx → A2x28x + Hemce matrix. A has Eigen value of 12 consider A²= 81 8 7: ±2√2
Jamagam Veeramjamejuly-2023ac05675@wilpobits-Pilani-acting 2√2 Imax =-1 -2√2 > min Conclusions a) Eigen Valey of this matora: 2√2-2√2 (b) тря 2-1 min Q⑤Given the V X, Y, Z be three vectory in a vected space X+Y+220 +22-x-y ⇒x=-y-21 = 42-2-2 Span {u y) = Spam { "₁2) A vector in span {y} can be written as ax+by Now ax+by 2 ax+b (-x-2) [Y=-x-2] =(a+b)x-bz .: Span {n,y} C&Ram {1,2} → 0 Similarly, Span {Z} can be written of Now cx+dzz ox+ [-x-y] =(c-d)x-dy Span {x2} @ span{ x,y} → E from and Spam {xy] = Spain {24237

Jon Veereya142e5615@isp. b-vilni.ac.in Similarly we spom{y} spend y = } Similarly we com prove that Span {x,y} = spm [42] 9 Span {1,2} 2 Spar 14,2} by combining all these (nd) Spandry} = span {1,2} = span{yz} Suppose that { V₁, √ √ √ Spany V Left V GV VEV + C x √ → ③ 82 C, 4+62√27- Notice that V; = √, + (3; -8), 1= 1,2 ---- Now from ③ F = c, d, + 2 [ & + (1 - 1)] + [ & + ( z − & ) ] + - - + cn [√ + (√)] = √ = (c + 2 + G + - G) + k ( 1 ) + ( d ) of p C√(√1-18) - неже Hence Zd–di, dz – d,, ..., √x-v₁) Spam V Assume that {F., D2, --- daß are Linearly independent it meay that 7 a_[^{-1}] + 93 [ √}-{] + - + ^ [hd] = 0 & √ 493 & 4. + anty = (92-793-+-+) since 1, &, --- Band L. I, the Coefficients must 9220 be zero 9220, 9320,.

Janagam Veeranjaneya14-2023ae05675@willp. Bitz-Pianione in Hence {Vard,, dd,, - --, di- d₁} is Linearly indeparts Conclusiont 0 if {d, dr., ., da} are L. I and span V they {Vend,, dz – d., – der V; } also span and are L-I

"""

# Score pdf4 against each of the reference documents
similarity_results = compare_with_existing_pdfs(pdf4_text, pdf_features)

# First report the raw scores, rounded to two decimals
print("Similarity results for pdf4:")
for pdf_name in similarity_results:
    similarity_score = similarity_results[pdf_name]
    print(f"Similarity with {pdf_name}: {similarity_score:.2f}")

# Then report a verdict per document: scores strictly above the threshold
# count as "High" likelihood of a shared author, everything else as "Low"
threshold = 0.8
for pdf_name, similarity_score in similarity_results.items():
    print(
        f"Likelihood of similar authors: High for {pdf_name}"
        if similarity_score > threshold
        else f"Likelihood of similar authors: Low for {pdf_name}"
    )

Similarity results for pdf4:
Similarity with pdf1: 0.89
Similarity with pdf2: 0.86
Similarity with pdf3: 0.86
Likelihood of similar authors: High for pdf1
Likelihood of similar authors: High for pdf2
Likelihood of similar authors: High for pdf3


In [13]:
#UNIT TESTING

import unittest
import numpy as np
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

# Assuming the functions are imported from the module where the code is located
# from your_module import extract_features, calculate_similarity, compare_with_existing_pdfs

class TestTextProcessingFunctions(unittest.TestCase):
    """Unit tests for extract_features, calculate_similarity and
    compare_with_existing_pdfs.

    The functions under test and the OCR sample ``pdf4_text`` are expected to
    be defined already in the surrounding namespace.
    """

    def setUp(self):
        # Fresh copy of the reference feature vectors for every test so no
        # test can leak mutations into another.
        self.pdf_features = {
            'pdf1': {
                'readability_score': 14.8,
                'lexical_diversity': 0.72,
                'sentence_count': 32,
                'avg_sentence_length': 18.4,
                'noun_percentage': 0.40,
                'verb_percentage': 0.22,
                'adj_percentage': 0.16,
                'other_percentage': 0.22
            },
            'pdf2': {
                'readability_score': 13.5,
                'lexical_diversity': 0.68,
                'sentence_count': 28,
                'avg_sentence_length': 20.2,
                'noun_percentage': 0.35,
                'verb_percentage': 0.25,
                'adj_percentage': 0.20,
                'other_percentage': 0.20
            },
            'pdf3': {
                'readability_score': 12.5,
                'lexical_diversity': 0.56,
                'sentence_count': 45,
                'avg_sentence_length': 15.3,
                'noun_percentage': 0.38,
                'verb_percentage': 0.30,
                'adj_percentage': 0.22,
                'other_percentage': 0.10
            }
        }

    # Test extract_features function
    def test_extract_features(self):
        """Features of the OCR sample match the recorded reference values."""
        # Reuse the OCR sample defined at module level instead of embedding a
        # second verbatim copy of the same text in the test.
        text = pdf4_text

        expected_features = {
            'readability_score': 46.24,
            'lexical_diversity': 0.48,
            'sentence_count': 45.00,
            'avg_sentence_length': 9.91,
            'noun_percentage': 0.59,
            'verb_percentage': 0.14,
            'adj_percentage': 0.10,
            'other_percentage': 0.17
        }

        features = extract_features(text)

        # One sub-test per feature so a failure names the offending key
        # instead of stopping at the first mismatch.
        for key, value in expected_features.items():
            with self.subTest(feature=key):
                self.assertIn(key, features)
                # Expected values are recorded to two decimals
                self.assertAlmostEqual(features[key], value, places=2)

    # Test calculate_similarity function
    def test_calculate_similarity(self):
        """Similarity of two reference vectors is a float in [0, 1]."""
        similarity = calculate_similarity(
            self.pdf_features['pdf1'], self.pdf_features['pdf2']
        )

        # Check if similarity is between 0 and 1
        self.assertGreaterEqual(similarity, 0)
        self.assertLessEqual(similarity, 1)

        # Check if similarity is a float
        self.assertIsInstance(similarity, float)

    # Test compare_with_existing_pdfs function
    def test_compare_with_existing_pdfs(self):
        """Every reference document gets a score in [0, 1]."""
        sample_text = "This is a new PDF text to compare against existing PDFs."

        results = compare_with_existing_pdfs(sample_text, self.pdf_features)

        # The result must be keyed by exactly the reference document names
        # (stronger than comparing lengths only).
        self.assertEqual(set(results), set(self.pdf_features))

        # Check if the similarity score is within a valid range (0 to 1)
        for pdf_name, score in results.items():
            with self.subTest(pdf=pdf_name):
                self.assertGreaterEqual(score, 0)
                self.assertLessEqual(score, 1)

    # Test edge cases like empty input
    def test_empty_input(self):
        """Empty text yields every feature equal to 0."""
        features = extract_features("")

        for key in (
            'readability_score',
            'lexical_diversity',
            'sentence_count',
            'avg_sentence_length',
            'noun_percentage',
            'verb_percentage',
            'adj_percentage',
            'other_percentage',
        ):
            with self.subTest(feature=key):
                self.assertEqual(features[key], 0)

# Run every TestCase visible in this namespace when executed as the main module.
if __name__ == "__main__":
    # NOTE(review): `sys` is imported here but not used in this block.
    import sys
    # A placeholder argv keeps unittest from parsing the host process's own
    # command-line arguments; exit=False stops unittest.main() from calling
    # sys.exit() once the run finishes, so the session keeps running.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

.............
----------------------------------------------------------------------
Ran 13 tests in 0.956s

OK


In [14]:
#REGRESSION TESTING

import unittest
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Assuming the functions and the pdf_features dictionary are already defined from the provided code

class TestPDFSimilarityFunctions(unittest.TestCase):
    """Regression tests pinning the behaviour of the similarity pipeline.

    The functions under test (extract_features, calculate_similarity,
    compare_with_existing_pdfs) are expected to be defined already in the
    surrounding namespace.
    """

    def setUp(self):
        # Reference feature vectors, rebuilt for every test
        self.pdf1 = {
            'readability_score': 14.8,
            'lexical_diversity': 0.72,
            'sentence_count': 32,
            'avg_sentence_length': 18.4,
            'noun_percentage': 0.40,
            'verb_percentage': 0.22,
            'adj_percentage': 0.16,
            'other_percentage': 0.22
        }
        self.pdf2 = {
            'readability_score': 13.5,
            'lexical_diversity': 0.68,
            'sentence_count': 28,
            'avg_sentence_length': 20.2,
            'noun_percentage': 0.35,
            'verb_percentage': 0.25,
            'adj_percentage': 0.20,
            'other_percentage': 0.20
        }
        self.pdf3 = {
            'readability_score': 12.5,
            'lexical_diversity': 0.56,
            'sentence_count': 45,
            'avg_sentence_length': 15.3,
            'noun_percentage': 0.38,
            'verb_percentage': 0.30,
            'adj_percentage': 0.22,
            'other_percentage': 0.10
        }

        self.pdf_features = {
            'pdf1': self.pdf1,
            'pdf2': self.pdf2,
            'pdf3': self.pdf3
        }

    def test_calculate_similarity(self):
        """Test that cosine similarity calculation between two PDFs returns expected result"""
        similarity = calculate_similarity(self.pdf1, self.pdf2)
        # Hand-computed from the two vectors above:
        # 1468.2406 / (sqrt(1582.4008) * sqrt(1375.0174)) ~= 0.99537
        self.assertAlmostEqual(similarity, 0.9954, places=3)

    def test_extract_features(self):
        """Test the feature extraction from text"""
        # Sample text for testing
        text = "This is a test sentence for feature extraction."
        features = extract_features(text)

        # Check that extracted features are not empty
        self.assertTrue(len(features) > 0)

        # Check specific feature outputs (based on known feature behavior)
        self.assertGreater(features['readability_score'], 0)
        self.assertGreater(features['lexical_diversity'], 0)

    def test_compare_with_existing_pdfs(self):
        """Test the comparison of pdf4 with pdf1, pdf2, pdf3"""
        sample_text = "Another sample document to compare."

        similarity_results = compare_with_existing_pdfs(sample_text, self.pdf_features)

        # Exactly the three reference documents must be scored
        self.assertEqual(set(similarity_results), {'pdf1', 'pdf2', 'pdf3'})

        # Check that similarity scores are in a valid range [0, 1]
        for pdf_name, similarity_score in similarity_results.items():
            with self.subTest(pdf=pdf_name):
                self.assertGreaterEqual(similarity_score, 0)
                self.assertLessEqual(similarity_score, 1)

    def test_similarity_threshold(self):
        """Test that the likelihood of similar authors is correctly evaluated"""
        similarity_results = {
            'pdf1': 0.85,
            'pdf2': 0.78,
            'pdf3': 0.90,
            'edge': 0.80
        }
        # Independently stated expectations. A score equal to the threshold
        # is "Low" because the rule uses a strict ">" comparison.
        expected_levels = {
            'pdf1': 'High',
            'pdf2': 'Low',
            'pdf3': 'High',
            'edge': 'Low'
        }

        threshold = 0.8
        for pdf_name, similarity_score in similarity_results.items():
            # Same decision rule as the reporting script: strictly above the
            # threshold means "High". Comparing against expected_levels makes
            # the assertion able to fail, unlike comparing a string to itself.
            level = 'High' if similarity_score > threshold else 'Low'
            with self.subTest(pdf=pdf_name):
                self.assertEqual(level, expected_levels[pdf_name])

# Run every TestCase visible in this namespace when executed as the main module.
if __name__ == "__main__":
    # NOTE(review): `sys` is imported here but not used in this block.
    import sys
    # A placeholder argv keeps unittest from parsing the host process's own
    # command-line arguments; exit=False stops unittest.main() from calling
    # sys.exit() once the run finishes, so the session keeps running.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

.............
----------------------------------------------------------------------
Ran 13 tests in 1.044s

OK


In [15]:
#INTEGRATION TESTING

import unittest
import numpy as np

class TestIntegration(unittest.TestCase):

    def setUp(self):
        """Build the fixtures shared by every test method of this class.

        Sets ``self.pdf_features`` (hand-written feature dicts for three
        reference PDFs) and ``self.pdf4_text`` (the raw text that the tests
        feed to the feature-extraction and comparison functions).
        """
        # Reference feature dicts keyed by PDF name; each one uses the same
        # eight keys that extract_features() returns.
        self.pdf_features = {
            'pdf1': {
                'readability_score': 14.8,
                'lexical_diversity': 0.72,
                'sentence_count': 32,
                'avg_sentence_length': 18.4,
                'noun_percentage': 0.40,
                'verb_percentage': 0.22,
                'adj_percentage': 0.16,
                'other_percentage': 0.22
            },
            'pdf2': {
                'readability_score': 13.5,
                'lexical_diversity': 0.68,
                'sentence_count': 28,
                'avg_sentence_length': 20.2,
                'noun_percentage': 0.35,
                'verb_percentage': 0.25,
                'adj_percentage': 0.20,
                'other_percentage': 0.20
            },
            'pdf3': {
                'readability_score': 12.5,
                'lexical_diversity': 0.56,
                'sentence_count': 45,
                'avg_sentence_length': 15.3,
                'noun_percentage': 0.38,
                'verb_percentage': 0.30,
                'adj_percentage': 0.22,
                'other_percentage': 0.10
            }
        }

        # Raw text standing in for "pdf4". NOTE(review): this looks like
        # unedited OCR output of scanned notes (it contains a "Scanned with
        # OKEN Scanner" line) - keep it byte-for-byte, since any edit changes
        # the extracted features.
        self.pdf4_text = """wilpbits-Pilomic

using where C, A, B are num matrices, the ith row of the matrix c fal 12in is obtained by taking a lineit Combination of the rows of B where the combining coefficients come from the ith row of A, show that the inverse of a lowar Triangulit matrix is a low.h Triangular matrix. only the idea that equation C= AB
Ans Given Data C= AB, where A, B, Core myn matrice भ Considr C.; = = Ask Buj kal we have to prove that if A is lower Triangular matrine, then is also lower Triangular matrix. D Now considal 3x3 matria, je n=3


Janagara Veeranjaneyaly-2023ac05675@wilpabits-Pianiac.in Now consider A3x3 of Lower Triangular matrie a ་་་ 0 0 A₂ f 921 9220 932933 031 b12 b13 3,2 Let B₂ b21 b22 b23 b31 632 33 D33 where B is the inverse of A. we know that, LL': I from the given equation C= AB by obsering above two equations AB= I I is the identity matode 0 0 all ཎ་༥ biz b13 100] 0 921 9122 b₂, baz b23: 010 932933 1931 by by by 0 0 1 abi. A

Janagam Veeranjaneyaly-2023ac05 675 @wifp.bill-pilaniacon a₁ b₁₁ 911613 911612 92, but ang bal 92112+922 622 92163992623 b11+ 90262, 1933134 93, 42 +932420 +933632 931-131932+237 +933 By equality of matrycy 911013-0 -a₁, b₁₁ = 01 91142= b13=0 112=0 =) 9113 Again from egality of matricy Diary 192, 613 +922 b23 = 0 = 921 (0) + 922 623 = 0 [0-13=0] R X 922623=0 from the matrix A, we observe that 922 70, Since A is upper Triangular mabre. 20 b23 = 0 when we observe the Elements of matraß b₁₂ = 0, b13=0, b2320 [proved] How the matrix B, become о bil B 2 This is a lower Triangular matrix 621 22 0 b31 632 633 632633

Janagar Veeranjaney414_2023ac05675@wiepobity-filan iloac.in Conclusiont Inverse of a lower Triangular matrix is also a lower Triangular matria 2 If an integer in is an Eigen Value of square matrix where all Elements of a matrix belong to the set of integers, then prove that determinant of that matrix is nk, where k is an integer, And = Given data o is an Eigen value of square matria we have to prove that Determinant of the matrix is nk, where K is an integer. Given a matrix A, Let an Eigenvector = x and its corresponding Eigen valoric => Ax-xx Now, the characteristic polynomial of matrix A P(X) =|A=λI| (or) | XI-A| The roots of this polynomial equation are Eigen valves → (1-λ) (1-№2) (1-3) --(-)=(\) where 1, 12, g--- are, roots (or) Eigen values for matrix A. Scanned with OKEN Scanner

Janagam Veeranjaneyal 4_2023ac05675@wilp-bits-pilariasin from P(A) = />I- A) Let λ=0 → P(0)= |OI-Al 800726171A1 we know that | KA|= K|A| -P(0)= (-))" | A A) Now P(^)= (^-^1) (^-^3) --- (^_^) → R (0) = (-^₁) (-^2)---(^_^) >> P()= (-1) 12 from and ② तेल হৈ (-1) 10/2 (1) di |A|= 7, 12 λm. Xi Consider λ = n; 1- Assume that k≤ λ₂-x3-^m, It is an integer. These are product of roots of polynomial with integer coefficient. • |A)=nk, where k is integer. Conclusiont if n is an Eigen valve of square matrix then determinant of that Matrix is nks, where I is an integer,

Janagam Veeranjaneya14-20239605675@will. Bits-Pilani.ac.in. If A is a invertible square matrix of order n', Then prove or dis prove that rank (A) = rank (AB) Any Given that of A is a invertible square matrix o order m It means that |A/70 Consider matrix B, which is compatible to multiple with multiplication of AB. Since A is invertible and nxx matria rank (A)=n) a we know that, rank (AB) <min (rank(A) Y|<(B) rank (AB) 2 mm (m, rant(), (. )(A)=-2) →rank (AB) ≤ min (m, rank (B)) = raak(B) 1rm\(AB) < rank (B) 29-① Consider AB Considr AB R =(AB) A (AB)AY = A (BAT) A (BAY) AB (AB) AB The rank of a matrice is invariant ander multiplication by an invertable matrix rank (AB) = Yam (B) → ε9-2

Janagam Veeranjaneyal4-2023ac05675@wilp.bils-pilanj.ac.in from ε9-0 & 09-① rank (AB) = rank (B) Since A is invertible and romja (A) = m, -rank (A)=ram (B) Conclusions for An invertible square matrix. A and matrix B, ranje (A)= ranje (AB) only if B also ham ranke n The above statement is not always true conless B also has rank n. Q Ans Griven Data In the given matrix A, a=1, b=~ A²= 8I, I is identify matione if x is Eigen value of A, & X is Eigenvect of A Then PA-λEDX=0 - AX-> I· x=0. A X = X X = A.Ax2 A. dx → A2x28x + Hemce matrix. A has Eigen value of 12 consider A²= 81 8 7: ±2√2
Jamagam Veeramjamejuly-2023ac05675@wilpobits-Pilani-acting 2√2 Imax =-1 -2√2 > min Conclusions a) Eigen Valey of this matora: 2√2-2√2 (b) тря 2-1 min Q⑤Given the V X, Y, Z be three vectory in a vected space X+Y+220 +22-x-y ⇒x=-y-21 = 42-2-2 Span {u y) = Spam { "₁2) A vector in span {y} can be written as ax+by Now ax+by 2 ax+b (-x-2) [Y=-x-2] =(a+b)x-bz .: Span {n,y} C&Ram {1,2} → 0 Similarly, Span {Z} can be written of Now cx+dzz ox+ [-x-y] =(c-d)x-dy Span {x2} @ span{ x,y} → E from and Spam {xy] = Spain {24237

Jon Veereya142e5615@isp. b-vilni.ac.in Similarly we spom{y} spend y = } Similarly we com prove that Span {x,y} = spm [42] 9 Span {1,2} 2 Spar 14,2} by combining all these (nd) Spandry} = span {1,2} = span{yz} Suppose that { V₁, √ √ √ Spany V Left V GV VEV + C x √ → ③ 82 C, 4+62√27- Notice that V; = √, + (3; -8), 1= 1,2 ---- Now from ③ F = c, d, + 2 [ & + (1 - 1)] + [ & + ( z − & ) ] + - - + cn [√ + (√)] = √ = (c + 2 + G + - G) + k ( 1 ) + ( d ) of p C√(√1-18) - неже Hence Zd–di, dz – d,, ..., √x-v₁) Spam V Assume that {F., D2, --- daß are Linearly independent it meay that 7 a_[^{-1}] + 93 [ √}-{] + - + ^ [hd] = 0 & √ 493 & 4. + anty = (92-793-+-+) since 1, &, --- Band L. I, the Coefficients must 9220 be zero 9220, 9320,.

Janagam Veeranjaneya14-2023ae05675@willp. Bitz-Pianione in Hence {Vard,, dd,, - --, di- d₁} is Linearly indeparts Conclusiont 0 if {d, dr., ., da} are L. I and span V they {Vend,, dz – d., – der V; } also span and are L-I

"""

    def test_extract_features(self):
        """Feature extraction returns a dict that carries a positive readability score."""
        extracted = extract_features(self.pdf4_text)
        self.assertIsInstance(extracted, dict)
        # Both headline metrics have to be present in the result.
        for required_key in ('readability_score', 'lexical_diversity'):
            self.assertIn(required_key, extracted)
        self.assertGreater(extracted['readability_score'], 0)

    def test_calculate_similarity(self):
        """Similarity of pdf4 against the stored pdf1 features is a float in [0, 1]."""
        candidate = extract_features(self.pdf4_text)
        reference = self.pdf_features['pdf1']
        score = calculate_similarity(candidate, reference)
        self.assertIsInstance(score, float)
        # Lower bound first, then upper bound.
        self.assertGreaterEqual(score, 0)
        self.assertLessEqual(score, 1)

    def test_compare_with_existing_pdfs(self):
        """Comparing pdf4 with the stored PDFs yields a bounded score for each of them."""
        scores_by_pdf = compare_with_existing_pdfs(self.pdf4_text, self.pdf_features)
        self.assertIsInstance(scores_by_pdf, dict)
        for expected_name in ('pdf1', 'pdf2', 'pdf3'):
            self.assertIn(expected_name, scores_by_pdf)
        self.assertGreater(len(scores_by_pdf), 0)

        # Every reported score must sit inside the closed interval [0, 1].
        for value in scores_by_pdf.values():
            self.assertGreaterEqual(value, 0)
            self.assertLessEqual(value, 1)

    def test_threshold_check(self):
        """Verify every similarity score can be classified against the 0.8 threshold.

        Each score returned for pdf4 must be a float inside [0, 1]; only then
        is the "High"/"Low" decision (``score > threshold``) well defined.
        Comparing a string with an identical copy of itself, as this test
        used to do, can never fail, so the checks below assert on the scores.
        """
        similarity_results = compare_with_existing_pdfs(self.pdf4_text, self.pdf_features)
        threshold = 0.8
        # An empty result would let the loop below pass without checking anything.
        self.assertGreater(len(similarity_results), 0)

        for pdf_name, similarity_score in similarity_results.items():
            likelihood = "High" if similarity_score > threshold else "Low"
            verdict = f"Likelihood of similar authors: {likelihood} for {pdf_name}"
            # subTest reports each PDF separately and shows its verdict on failure.
            with self.subTest(verdict=verdict):
                self.assertIsInstance(similarity_score, float)
                # NaN compares False against everything, so it would quietly be
                # labelled "Low"; the range checks make it fail loudly instead.
                self.assertGreaterEqual(similarity_score, 0)
                self.assertLessEqual(similarity_score, 1)

    def test_full_integration(self):
        """End-to-end run: raw text in, non-empty dict of float scores in [0, 1] out."""
        outcome = compare_with_existing_pdfs(self.pdf4_text, self.pdf_features)
        self.assertIsInstance(outcome, dict)
        self.assertGreater(len(outcome), 0)

        # Range check on each score.
        for score in outcome.values():
            self.assertGreaterEqual(score, 0)
            self.assertLessEqual(score, 1)

        # Output format check: every score is a float.
        every_score_is_float = all(isinstance(score, float) for score in outcome.values())
        self.assertTrue(every_score_is_float)

if __name__ == "__main__":
    import sys
    # A fixed argv stops unittest from parsing the host process's real
    # sys.argv (the first element is only treated as the program name), and
    # exit=False keeps unittest.main() from calling sys.exit() afterwards.
    # With no explicit module argument, tests are loaded from __main__.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

.............
----------------------------------------------------------------------
Ran 13 tests in 0.935s

OK


In [16]:
#PERFORMANCE TESTING

import time
import random
import string

# Function to generate random text (for testing purposes)
def generate_random_text(length=1000):
    """Return `length` characters drawn at random from a-z plus the space character."""
    alphabet = string.ascii_lowercase + ' '
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

# Function to measure the time of feature extraction
def measure_feature_extraction_time(text):
    """Return the seconds taken by a single ``extract_features(text)`` call.

    Args:
        text: The text handed to ``extract_features``.

    Returns:
        Elapsed time in fractional seconds (float).
    """
    # perf_counter() is monotonic and uses the highest-resolution clock
    # available, so short intervals are not distorted by system clock
    # adjustments the way time.time() differences can be.
    start_time = time.perf_counter()
    extract_features(text)  # result is discarded; only the duration matters
    return time.perf_counter() - start_time

# Function to measure the time of similarity calculation
def measure_similarity_calculation_time(pdf4_text, pdf_features):
    """Return the seconds taken by one ``compare_with_existing_pdfs`` call.

    Args:
        pdf4_text: Text of the document being compared.
        pdf_features: Mapping passed through unchanged as the second
            argument of ``compare_with_existing_pdfs``.

    Returns:
        Elapsed time in fractional seconds (float).
    """
    # perf_counter() is monotonic and uses the highest-resolution clock
    # available, so short intervals are not distorted by system clock
    # adjustments the way time.time() differences can be.
    start_time = time.perf_counter()
    compare_with_existing_pdfs(pdf4_text, pdf_features)  # result is discarded
    return time.perf_counter() - start_time

# Function to measure performance of multiple runs
def run_performance_tests(pdf4_text, pdf_features, num_runs=5):
    """Time feature extraction and PDF comparison `num_runs` times each and print the means.

    All extraction runs are done first and averaged, then all comparison
    runs; the two averages are printed with six decimal places.
    """
    extraction_samples = [
        measure_feature_extraction_time(pdf4_text) for _ in range(num_runs)
    ]
    mean_extraction = sum(extraction_samples) / len(extraction_samples)

    comparison_samples = [
        measure_similarity_calculation_time(pdf4_text, pdf_features)
        for _ in range(num_runs)
    ]
    mean_comparison = sum(comparison_samples) / len(comparison_samples)

    print(f"Average feature extraction time: {mean_extraction:.6f} seconds")
    print(f"Average similarity calculation time: {mean_comparison:.6f} seconds")

# Build a 5000-character input for the timing run. The text is random
# lowercase letters and spaces (see generate_random_text), not natural language.
large_text = generate_random_text(length=5000)  # Large text to simulate a PDF with 5000 characters

# Run performance tests: 10 timed runs each of extraction and comparison.
# NOTE(review): `pdf_features` is not defined in this cell - presumably a
# global left behind by an earlier cell; confirm before running standalone.
print("Running performance tests...")
run_performance_tests(large_text, pdf_features, num_runs=10)

# NOTE(review): `unittest` is not imported in this cell either; this call
# re-runs the TestCase classes already loaded in __main__.
if __name__ == "__main__":
    import sys
    # exit=False keeps unittest.main() from calling sys.exit(); the fixed argv
    # stops it from parsing the host process's real command line.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

Running performance tests...


.

Average feature extraction time: 0.026686 seconds
Average similarity calculation time: 0.028437 seconds


............
----------------------------------------------------------------------
Ran 13 tests in 0.927s

OK


In [23]:
#FUNCTIONALITY TESTING

import unittest

class TestFeatureExtraction(unittest.TestCase):
    """Checks extract_features() against recorded values for one sample text."""

    def test_extract_features(self):
        """Each expected feature must match the extracted value to two decimal places."""
        # Sample input. Keep it byte-for-byte: the expected numbers below were
        # recorded for exactly this text.
        test_text = """wilpbits-Pilomic

using where C, A, B are num matrices, the ith row of the matrix c fal 12in is obtained by taking a lineit Combination of the rows of B where the combining coefficients come from the ith row of A, show that the inverse of a lowar Triangulit matrix is a low.h Triangular matrix. only the idea that equation C= AB
Ans Given Data C= AB, where A, B, Core myn matrice भ Considr C.; = = Ask Buj kal we have to prove that if A is lower Triangular matrine, then is also lower Triangular matrix. D Now considal 3x3 matria, je n=3


Janagara Veeranjaneyaly-2023ac05675@wilpabits-Pianiac.in Now consider A3x3 of Lower Triangular matrie a ་་་ 0 0 A₂ f 921 9220 932933 031 b12 b13 3,2 Let B₂ b21 b22 b23 b31 632 33 D33 where B is the inverse of A. we know that, LL': I from the given equation C= AB by obsering above two equations AB= I I is the identity matode 0 0 all ཎ་༥ biz b13 100] 0 921 9122 b₂, baz b23: 010 932933 1931 by by by 0 0 1 abi. A

Janagam Veeranjaneyaly-2023ac05 675 @wifp.bill-pilaniacon a₁ b₁₁ 911613 911612 92, but ang bal 92112+922 622 92163992623 b11+ 90262, 1933134 93, 42 +932420 +933632 931-131932+237 +933 By equality of matrycy 911013-0 -a₁, b₁₁ = 01 91142= b13=0 112=0 =) 9113 Again from egality of matricy Diary 192, 613 +922 b23 = 0 = 921 (0) + 922 623 = 0 [0-13=0] R X 922623=0 from the matrix A, we observe that 922 70, Since A is upper Triangular mabre. 20 b23 = 0 when we observe the Elements of matraß b₁₂ = 0, b13=0, b2320 [proved] How the matrix B, become о bil B 2 This is a lower Triangular matrix 621 22 0 b31 632 633 632633

Janagar Veeranjaney414_2023ac05675@wiepobity-filan iloac.in Conclusiont Inverse of a lower Triangular matrix is also a lower Triangular matria 2 If an integer in is an Eigen Value of square matrix where all Elements of a matrix belong to the set of integers, then prove that determinant of that matrix is nk, where k is an integer, And = Given data o is an Eigen value of square matria we have to prove that Determinant of the matrix is nk, where K is an integer. Given a matrix A, Let an Eigenvector = x and its corresponding Eigen valoric => Ax-xx Now, the characteristic polynomial of matrix A P(X) =|A=λI| (or) | XI-A| The roots of this polynomial equation are Eigen valves → (1-λ) (1-№2) (1-3) --(-)=(\) where 1, 12, g--- are, roots (or) Eigen values for matrix A. Scanned with OKEN Scanner

Janagam Veeranjaneyal 4_2023ac05675@wilp-bits-pilariasin from P(A) = />I- A) Let λ=0 → P(0)= |OI-Al 800726171A1 we know that | KA|= K|A| -P(0)= (-))" | A A) Now P(^)= (^-^1) (^-^3) --- (^_^) → R (0) = (-^₁) (-^2)---(^_^) >> P()= (-1) 12 from and ② तेल হৈ (-1) 10/2 (1) di |A|= 7, 12 λm. Xi Consider λ = n; 1- Assume that k≤ λ₂-x3-^m, It is an integer. These are product of roots of polynomial with integer coefficient. • |A)=nk, where k is integer. Conclusiont if n is an Eigen valve of square matrix then determinant of that Matrix is nks, where I is an integer,

Janagam Veeranjaneya14-20239605675@will. Bits-Pilani.ac.in. If A is a invertible square matrix of order n', Then prove or dis prove that rank (A) = rank (AB) Any Given that of A is a invertible square matrix o order m It means that |A/70 Consider matrix B, which is compatible to multiple with multiplication of AB. Since A is invertible and nxx matria rank (A)=n) a we know that, rank (AB) <min (rank(A) Y|<(B) rank (AB) 2 mm (m, rant(), (. )(A)=-2) →rank (AB) ≤ min (m, rank (B)) = raak(B) 1rm\(AB) < rank (B) 29-① Consider AB Considr AB R =(AB) A (AB)AY = A (BAT) A (BAY) AB (AB) AB The rank of a matrice is invariant ander multiplication by an invertable matrix rank (AB) = Yam (B) → ε9-2

Janagam Veeranjaneyal4-2023ac05675@wilp.bils-pilanj.ac.in from ε9-0 & 09-① rank (AB) = rank (B) Since A is invertible and romja (A) = m, -rank (A)=ram (B) Conclusions for An invertible square matrix. A and matrix B, ranje (A)= ranje (AB) only if B also ham ranke n The above statement is not always true conless B also has rank n. Q Ans Griven Data In the given matrix A, a=1, b=~ A²= 8I, I is identify matione if x is Eigen value of A, & X is Eigenvect of A Then PA-λEDX=0 - AX-> I· x=0. A X = X X = A.Ax2 A. dx → A2x28x + Hemce matrix. A has Eigen value of 12 consider A²= 81 8 7: ±2√2
Jamagam Veeramjamejuly-2023ac05675@wilpobits-Pilani-acting 2√2 Imax =-1 -2√2 > min Conclusions a) Eigen Valey of this matora: 2√2-2√2 (b) тря 2-1 min Q⑤Given the V X, Y, Z be three vectory in a vected space X+Y+220 +22-x-y ⇒x=-y-21 = 42-2-2 Span {u y) = Spam { "₁2) A vector in span {y} can be written as ax+by Now ax+by 2 ax+b (-x-2) [Y=-x-2] =(a+b)x-bz .: Span {n,y} C&Ram {1,2} → 0 Similarly, Span {Z} can be written of Now cx+dzz ox+ [-x-y] =(c-d)x-dy Span {x2} @ span{ x,y} → E from and Spam {xy] = Spain {24237

Jon Veereya142e5615@isp. b-vilni.ac.in Similarly we spom{y} spend y = } Similarly we com prove that Span {x,y} = spm [42] 9 Span {1,2} 2 Spar 14,2} by combining all these (nd) Spandry} = span {1,2} = span{yz} Suppose that { V₁, √ √ √ Spany V Left V GV VEV + C x √ → ③ 82 C, 4+62√27- Notice that V; = √, + (3; -8), 1= 1,2 ---- Now from ③ F = c, d, + 2 [ & + (1 - 1)] + [ & + ( z − & ) ] + - - + cn [√ + (√)] = √ = (c + 2 + G + - G) + k ( 1 ) + ( d ) of p C√(√1-18) - неже Hence Zd–di, dz – d,, ..., √x-v₁) Spam V Assume that {F., D2, --- daß are Linearly independent it meay that 7 a_[^{-1}] + 93 [ √}-{] + - + ^ [hd] = 0 & √ 493 & 4. + anty = (92-793-+-+) since 1, &, --- Band L. I, the Coefficients must 9220 be zero 9220, 9320,.

Janagam Veeranjaneya14-2023ae05675@willp. Bitz-Pianione in Hence {Vard,, dd,, - --, di- d₁} is Linearly indeparts Conclusiont 0 if {d, dr., ., da} are L. I and span V they {Vend,, dz – d., – der V; } also span and are L-I

"""
        # Recorded reference values, written to two decimal places.
        expected_features = {
            'readability_score': 46.24,
            'lexical_diversity': 0.48,
            'sentence_count': 45.00,
            'avg_sentence_length': 9.91,
            'noun_percentage': 0.59,
            'verb_percentage': 0.14,
            'adj_percentage': 0.10,
            'other_percentage': 0.17
        }

        features = extract_features(test_text)

        # Assert that each feature in the output matches the expected value.
        # places=2 means the difference, rounded to two decimals, must be zero.
        for key, value in expected_features.items():
            self.assertAlmostEqual(features[key], value, places=2)

class TestSimilarityCalculation(unittest.TestCase):
    """Checks calculate_similarity() on one known pair of feature dicts."""

    def test_calculate_similarity(self):
        """The pdf1/pdf2 feature pair must score 0.992 to two decimal places."""
        first_profile = {
            'readability_score': 14.8,
            'lexical_diversity': 0.72,
            'sentence_count': 32,
            'avg_sentence_length': 18.4,
            'noun_percentage': 0.40,
            'verb_percentage': 0.22,
            'adj_percentage': 0.16,
            'other_percentage': 0.22,
        }
        second_profile = {
            'readability_score': 13.5,
            'lexical_diversity': 0.68,
            'sentence_count': 28,
            'avg_sentence_length': 20.2,
            'noun_percentage': 0.35,
            'verb_percentage': 0.25,
            'adj_percentage': 0.20,
            'other_percentage': 0.20,
        }

        actual = calculate_similarity(first_profile, second_profile)

        # Reference value for this pair of profiles.
        target = 0.992
        self.assertAlmostEqual(actual, target, places=2)

class TestCompareWithExistingPdfs(unittest.TestCase):
    """Checks compare_with_existing_pdfs() against recorded similarity scores."""

    def test_compare_with_existing_pdfs(self):
        """Scores for pdf1-pdf3 must match the recorded values to two decimal places."""
        # Sample input for "pdf4". Keep it byte-for-byte: the expected scores
        # below were recorded for exactly this text.
        pdf4_text = """wilpbits-Pilomic

using where C, A, B are num matrices, the ith row of the matrix c fal 12in is obtained by taking a lineit Combination of the rows of B where the combining coefficients come from the ith row of A, show that the inverse of a lowar Triangulit matrix is a low.h Triangular matrix. only the idea that equation C= AB
Ans Given Data C= AB, where A, B, Core myn matrice भ Considr C.; = = Ask Buj kal we have to prove that if A is lower Triangular matrine, then is also lower Triangular matrix. D Now considal 3x3 matria, je n=3


Janagara Veeranjaneyaly-2023ac05675@wilpabits-Pianiac.in Now consider A3x3 of Lower Triangular matrie a ་་་ 0 0 A₂ f 921 9220 932933 031 b12 b13 3,2 Let B₂ b21 b22 b23 b31 632 33 D33 where B is the inverse of A. we know that, LL': I from the given equation C= AB by obsering above two equations AB= I I is the identity matode 0 0 all ཎ་༥ biz b13 100] 0 921 9122 b₂, baz b23: 010 932933 1931 by by by 0 0 1 abi. A

Janagam Veeranjaneyaly-2023ac05 675 @wifp.bill-pilaniacon a₁ b₁₁ 911613 911612 92, but ang bal 92112+922 622 92163992623 b11+ 90262, 1933134 93, 42 +932420 +933632 931-131932+237 +933 By equality of matrycy 911013-0 -a₁, b₁₁ = 01 91142= b13=0 112=0 =) 9113 Again from egality of matricy Diary 192, 613 +922 b23 = 0 = 921 (0) + 922 623 = 0 [0-13=0] R X 922623=0 from the matrix A, we observe that 922 70, Since A is upper Triangular mabre. 20 b23 = 0 when we observe the Elements of matraß b₁₂ = 0, b13=0, b2320 [proved] How the matrix B, become о bil B 2 This is a lower Triangular matrix 621 22 0 b31 632 633 632633

Janagar Veeranjaney414_2023ac05675@wiepobity-filan iloac.in Conclusiont Inverse of a lower Triangular matrix is also a lower Triangular matria 2 If an integer in is an Eigen Value of square matrix where all Elements of a matrix belong to the set of integers, then prove that determinant of that matrix is nk, where k is an integer, And = Given data o is an Eigen value of square matria we have to prove that Determinant of the matrix is nk, where K is an integer. Given a matrix A, Let an Eigenvector = x and its corresponding Eigen valoric => Ax-xx Now, the characteristic polynomial of matrix A P(X) =|A=λI| (or) | XI-A| The roots of this polynomial equation are Eigen valves → (1-λ) (1-№2) (1-3) --(-)=(\) where 1, 12, g--- are, roots (or) Eigen values for matrix A. Scanned with OKEN Scanner

Janagam Veeranjaneyal 4_2023ac05675@wilp-bits-pilariasin from P(A) = />I- A) Let λ=0 → P(0)= |OI-Al 800726171A1 we know that | KA|= K|A| -P(0)= (-))" | A A) Now P(^)= (^-^1) (^-^3) --- (^_^) → R (0) = (-^₁) (-^2)---(^_^) >> P()= (-1) 12 from and ② तेल হৈ (-1) 10/2 (1) di |A|= 7, 12 λm. Xi Consider λ = n; 1- Assume that k≤ λ₂-x3-^m, It is an integer. These are product of roots of polynomial with integer coefficient. • |A)=nk, where k is integer. Conclusiont if n is an Eigen valve of square matrix then determinant of that Matrix is nks, where I is an integer,

Janagam Veeranjaneya14-20239605675@will. Bits-Pilani.ac.in. If A is a invertible square matrix of order n', Then prove or dis prove that rank (A) = rank (AB) Any Given that of A is a invertible square matrix o order m It means that |A/70 Consider matrix B, which is compatible to multiple with multiplication of AB. Since A is invertible and nxx matria rank (A)=n) a we know that, rank (AB) <min (rank(A) Y|<(B) rank (AB) 2 mm (m, rant(), (. )(A)=-2) →rank (AB) ≤ min (m, rank (B)) = raak(B) 1rm\(AB) < rank (B) 29-① Consider AB Considr AB R =(AB) A (AB)AY = A (BAT) A (BAY) AB (AB) AB The rank of a matrice is invariant ander multiplication by an invertable matrix rank (AB) = Yam (B) → ε9-2

Janagam Veeranjaneyal4-2023ac05675@wilp.bils-pilanj.ac.in from ε9-0 & 09-① rank (AB) = rank (B) Since A is invertible and romja (A) = m, -rank (A)=ram (B) Conclusions for An invertible square matrix. A and matrix B, ranje (A)= ranje (AB) only if B also ham ranke n The above statement is not always true conless B also has rank n. Q Ans Griven Data In the given matrix A, a=1, b=~ A²= 8I, I is identify matione if x is Eigen value of A, & X is Eigenvect of A Then PA-λEDX=0 - AX-> I· x=0. A X = X X = A.Ax2 A. dx → A2x28x + Hemce matrix. A has Eigen value of 12 consider A²= 81 8 7: ±2√2
Jamagam Veeramjamejuly-2023ac05675@wilpobits-Pilani-acting 2√2 Imax =-1 -2√2 > min Conclusions a) Eigen Valey of this matora: 2√2-2√2 (b) тря 2-1 min Q⑤Given the V X, Y, Z be three vectory in a vected space X+Y+220 +22-x-y ⇒x=-y-21 = 42-2-2 Span {u y) = Spam { "₁2) A vector in span {y} can be written as ax+by Now ax+by 2 ax+b (-x-2) [Y=-x-2] =(a+b)x-bz .: Span {n,y} C&Ram {1,2} → 0 Similarly, Span {Z} can be written of Now cx+dzz ox+ [-x-y] =(c-d)x-dy Span {x2} @ span{ x,y} → E from and Spam {xy] = Spain {24237

Jon Veereya142e5615@isp. b-vilni.ac.in Similarly we spom{y} spend y = } Similarly we com prove that Span {x,y} = spm [42] 9 Span {1,2} 2 Spar 14,2} by combining all these (nd) Spandry} = span {1,2} = span{yz} Suppose that { V₁, √ √ √ Spany V Left V GV VEV + C x √ → ③ 82 C, 4+62√27- Notice that V; = √, + (3; -8), 1= 1,2 ---- Now from ③ F = c, d, + 2 [ & + (1 - 1)] + [ & + ( z − & ) ] + - - + cn [√ + (√)] = √ = (c + 2 + G + - G) + k ( 1 ) + ( d ) of p C√(√1-18) - неже Hence Zd–di, dz – d,, ..., √x-v₁) Spam V Assume that {F., D2, --- daß are Linearly independent it meay that 7 a_[^{-1}] + 93 [ √}-{] + - + ^ [hd] = 0 & √ 493 & 4. + anty = (92-793-+-+) since 1, &, --- Band L. I, the Coefficients must 9220 be zero 9220, 9320,.

Janagam Veeranjaneya14-2023ae05675@willp. Bitz-Pianione in Hence {Vard,, dd,, - --, di- d₁} is Linearly indeparts Conclusiont 0 if {d, dr., ., da} are L. I and span V they {Vend,, dz – d., – der V; } also span and are L-I

"""

        # Hardcoded pdf_features data: one feature dict per reference PDF.
        pdf_features = {
            'pdf1': {
                'readability_score': 14.8,
                'lexical_diversity': 0.72,
                'sentence_count': 32,
                'avg_sentence_length': 18.4,
                'noun_percentage': 0.40,
                'verb_percentage': 0.22,
                'adj_percentage': 0.16,
                'other_percentage': 0.22
            },
            'pdf2': {
                'readability_score': 13.5,
                'lexical_diversity': 0.68,
                'sentence_count': 28,
                'avg_sentence_length': 20.2,
                'noun_percentage': 0.35,
                'verb_percentage': 0.25,
                'adj_percentage': 0.20,
                'other_percentage': 0.20
            },
            'pdf3': {
                'readability_score': 12.5,
                'lexical_diversity': 0.56,
                'sentence_count': 45,
                'avg_sentence_length': 15.3,
                'noun_percentage': 0.38,
                'verb_percentage': 0.30,
                'adj_percentage': 0.22,
                'other_percentage': 0.10
            }
        }

        # Recorded similarity of pdf4_text against each reference PDF.
        expected_results = {
            'pdf1': 0.89,
            'pdf2': 0.86,
            'pdf3': 0.86
        }

        results = compare_with_existing_pdfs(pdf4_text, pdf_features)

        # Assert that the results match the expected similarity values.
        # places=2 means the difference, rounded to two decimals, must be zero.
        for pdf_name, similarity_score in expected_results.items():
            self.assertAlmostEqual(results[pdf_name], similarity_score, places=2)

if __name__ == "__main__":
    import sys
    # A fixed argv stops unittest from parsing the host process's real
    # sys.argv (the first element is only treated as the program name), and
    # exit=False keeps unittest.main() from calling sys.exit() afterwards.
    # With no explicit module argument, tests are loaded from __main__.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

................
----------------------------------------------------------------------
Ran 16 tests in 1.239s

OK


In [28]:
#USABILITY TESTING

import unittest

class TestUsability(unittest.TestCase):
    """Usability-oriented checks for ``compare_with_existing_pdfs``.

    Every test passes raw text plus a table of pre-computed reference feature
    dictionaries (``pdf1``, ``pdf2``, ``pdf3``) to ``compare_with_existing_pdfs``
    and inspects the mapping of similarity scores that comes back.
    """

    def setUp(self):
        """Create a fresh sample text and reference feature table per test."""
        # Sample text for usability testing. It appears to be OCR output of a
        # scanned document ("Scanned with OKEN Scanner" occurs in it).
        # The literal is a raw string because the text contains "\)" and "\(",
        # which are invalid escape sequences in a normal string literal
        # (SyntaxWarning on Python 3.12+). The runtime value is unchanged.
        self.test_text = r"""wilpbits-Pilomic

using where C, A, B are num matrices, the ith row of the matrix c fal 12in is obtained by taking a lineit Combination of the rows of B where the combining coefficients come from the ith row of A, show that the inverse of a lowar Triangulit matrix is a low.h Triangular matrix. only the idea that equation C= AB
Ans Given Data C= AB, where A, B, Core myn matrice भ Considr C.; = = Ask Buj kal we have to prove that if A is lower Triangular matrine, then is also lower Triangular matrix. D Now considal 3x3 matria, je n=3


Janagara Veeranjaneyaly-2023ac05675@wilpabits-Pianiac.in Now consider A3x3 of Lower Triangular matrie a ་་་ 0 0 A₂ f 921 9220 932933 031 b12 b13 3,2 Let B₂ b21 b22 b23 b31 632 33 D33 where B is the inverse of A. we know that, LL': I from the given equation C= AB by obsering above two equations AB= I I is the identity matode 0 0 all ཎ་༥ biz b13 100] 0 921 9122 b₂, baz b23: 010 932933 1931 by by by 0 0 1 abi. A

Janagam Veeranjaneyaly-2023ac05 675 @wifp.bill-pilaniacon a₁ b₁₁ 911613 911612 92, but ang bal 92112+922 622 92163992623 b11+ 90262, 1933134 93, 42 +932420 +933632 931-131932+237 +933 By equality of matrycy 911013-0 -a₁, b₁₁ = 01 91142= b13=0 112=0 =) 9113 Again from egality of matricy Diary 192, 613 +922 b23 = 0 = 921 (0) + 922 623 = 0 [0-13=0] R X 922623=0 from the matrix A, we observe that 922 70, Since A is upper Triangular mabre. 20 b23 = 0 when we observe the Elements of matraß b₁₂ = 0, b13=0, b2320 [proved] How the matrix B, become о bil B 2 This is a lower Triangular matrix 621 22 0 b31 632 633 632633

Janagar Veeranjaney414_2023ac05675@wiepobity-filan iloac.in Conclusiont Inverse of a lower Triangular matrix is also a lower Triangular matria 2 If an integer in is an Eigen Value of square matrix where all Elements of a matrix belong to the set of integers, then prove that determinant of that matrix is nk, where k is an integer, And = Given data o is an Eigen value of square matria we have to prove that Determinant of the matrix is nk, where K is an integer. Given a matrix A, Let an Eigenvector = x and its corresponding Eigen valoric => Ax-xx Now, the characteristic polynomial of matrix A P(X) =|A=λI| (or) | XI-A| The roots of this polynomial equation are Eigen valves → (1-λ) (1-№2) (1-3) --(-)=(\) where 1, 12, g--- are, roots (or) Eigen values for matrix A. Scanned with OKEN Scanner

Janagam Veeranjaneyal 4_2023ac05675@wilp-bits-pilariasin from P(A) = />I- A) Let λ=0 → P(0)= |OI-Al 800726171A1 we know that | KA|= K|A| -P(0)= (-))" | A A) Now P(^)= (^-^1) (^-^3) --- (^_^) → R (0) = (-^₁) (-^2)---(^_^) >> P()= (-1) 12 from and ② तेल হৈ (-1) 10/2 (1) di |A|= 7, 12 λm. Xi Consider λ = n; 1- Assume that k≤ λ₂-x3-^m, It is an integer. These are product of roots of polynomial with integer coefficient. • |A)=nk, where k is integer. Conclusiont if n is an Eigen valve of square matrix then determinant of that Matrix is nks, where I is an integer,

Janagam Veeranjaneya14-20239605675@will. Bits-Pilani.ac.in. If A is a invertible square matrix of order n', Then prove or dis prove that rank (A) = rank (AB) Any Given that of A is a invertible square matrix o order m It means that |A/70 Consider matrix B, which is compatible to multiple with multiplication of AB. Since A is invertible and nxx matria rank (A)=n) a we know that, rank (AB) <min (rank(A) Y|<(B) rank (AB) 2 mm (m, rant(), (. )(A)=-2) →rank (AB) ≤ min (m, rank (B)) = raak(B) 1rm\(AB) < rank (B) 29-① Consider AB Considr AB R =(AB) A (AB)AY = A (BAT) A (BAY) AB (AB) AB The rank of a matrice is invariant ander multiplication by an invertable matrix rank (AB) = Yam (B) → ε9-2

Janagam Veeranjaneyal4-2023ac05675@wilp.bils-pilanj.ac.in from ε9-0 & 09-① rank (AB) = rank (B) Since A is invertible and romja (A) = m, -rank (A)=ram (B) Conclusions for An invertible square matrix. A and matrix B, ranje (A)= ranje (AB) only if B also ham ranke n The above statement is not always true conless B also has rank n. Q Ans Griven Data In the given matrix A, a=1, b=~ A²= 8I, I is identify matione if x is Eigen value of A, & X is Eigenvect of A Then PA-λEDX=0 - AX-> I· x=0. A X = X X = A.Ax2 A. dx → A2x28x + Hemce matrix. A has Eigen value of 12 consider A²= 81 8 7: ±2√2
Jamagam Veeramjamejuly-2023ac05675@wilpobits-Pilani-acting 2√2 Imax =-1 -2√2 > min Conclusions a) Eigen Valey of this matora: 2√2-2√2 (b) тря 2-1 min Q⑤Given the V X, Y, Z be three vectory in a vected space X+Y+220 +22-x-y ⇒x=-y-21 = 42-2-2 Span {u y) = Spam { "₁2) A vector in span {y} can be written as ax+by Now ax+by 2 ax+b (-x-2) [Y=-x-2] =(a+b)x-bz .: Span {n,y} C&Ram {1,2} → 0 Similarly, Span {Z} can be written of Now cx+dzz ox+ [-x-y] =(c-d)x-dy Span {x2} @ span{ x,y} → E from and Spam {xy] = Spain {24237

Jon Veereya142e5615@isp. b-vilni.ac.in Similarly we spom{y} spend y = } Similarly we com prove that Span {x,y} = spm [42] 9 Span {1,2} 2 Spar 14,2} by combining all these (nd) Spandry} = span {1,2} = span{yz} Suppose that { V₁, √ √ √ Spany V Left V GV VEV + C x √ → ③ 82 C, 4+62√27- Notice that V; = √, + (3; -8), 1= 1,2 ---- Now from ③ F = c, d, + 2 [ & + (1 - 1)] + [ & + ( z − & ) ] + - - + cn [√ + (√)] = √ = (c + 2 + G + - G) + k ( 1 ) + ( d ) of p C√(√1-18) - неже Hence Zd–di, dz – d,, ..., √x-v₁) Spam V Assume that {F., D2, --- daß are Linearly independent it meay that 7 a_[^{-1}] + 93 [ √}-{] + - + ^ [hd] = 0 & √ 493 & 4. + anty = (92-793-+-+) since 1, &, --- Band L. I, the Coefficients must 9220 be zero 9220, 9320,.

Janagam Veeranjaneya14-2023ae05675@willp. Bitz-Pianione in Hence {Vard,, dd,, - --, di- d₁} is Linearly indeparts Conclusiont 0 if {d, dr., ., da} are L. I and span V they {Vend,, dz – d., – der V; } also span and are L-I

"""

        # Hardcoded reference features for pdf1, pdf2 and pdf3. Rebuilt for
        # every test so that no test can observe another test's mutations.
        self.pdf_features = {
            'pdf1': {
                'readability_score': 14.8,
                'lexical_diversity': 0.72,
                'sentence_count': 32,
                'avg_sentence_length': 18.4,
                'noun_percentage': 0.40,
                'verb_percentage': 0.22,
                'adj_percentage': 0.16,
                'other_percentage': 0.22
            },
            'pdf2': {
                'readability_score': 13.5,
                'lexical_diversity': 0.68,
                'sentence_count': 28,
                'avg_sentence_length': 20.2,
                'noun_percentage': 0.35,
                'verb_percentage': 0.25,
                'adj_percentage': 0.20,
                'other_percentage': 0.20
            },
            'pdf3': {
                'readability_score': 12.5,
                'lexical_diversity': 0.56,
                'sentence_count': 45,
                'avg_sentence_length': 15.3,
                'noun_percentage': 0.38,
                'verb_percentage': 0.30,
                'adj_percentage': 0.22,
                'other_percentage': 0.10
            }
        }

    def test_input_interactivity(self):
        """
        Test if users can easily input the raw text (pdf4_text) for comparison.
        """
        # A predefined string stands in for user input here; any other str
        # can be substituted.
        pdf4_text = self.test_text
        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)).
        self.assertIsInstance(pdf4_text, str)

    def test_output_clarity(self):
        """
        Test if the similarity results are clear and easy to interpret.

        The result must be a dict whose scores all lie in the closed interval
        [0, 1]; each score is then turned into a "High"/"Low" likelihood
        message using a 0.8 threshold.
        """
        similarity_results = compare_with_existing_pdfs(self.test_text, self.pdf_features)
        self.assertIsInstance(similarity_results, dict)

        threshold = 0.8
        message_prefix = "Likelihood of similar authors: "
        for pdf_name, similarity_score in similarity_results.items():
            # subTest keeps checking the remaining PDFs after a failure and
            # names the offending PDF in the report.
            with self.subTest(pdf=pdf_name):
                # Only the numeric range is verified; the concrete numeric
                # type of the score is not asserted.
                self.assertTrue(
                    0 <= similarity_score <= 1,
                    f"Similarity score for {pdf_name} is out of range.",
                )

                level = "High" if similarity_score > threshold else "Low"
                result = f"{message_prefix}{level} for {pdf_name}"
                # NOTE(review): the message is built right here, so this
                # assertion only pins the expected wording; it does not
                # exercise any production formatting code.
                self.assertIn(message_prefix, result)

    def test_edge_cases(self):
        """
        Test edge cases for input, like empty text and very large text.
        """
        # Edge case: empty text must yield an empty result mapping.
        empty_text = ""
        similarity_results_empty = compare_with_existing_pdfs(empty_text, self.pdf_features)
        self.assertEqual(similarity_results_empty, {}, "Similarity results should be empty for empty input.")

        # Edge case: very large text (the sample repeated 1000 times).
        # NOTE(review): this is several million characters, which looks larger
        # than spaCy's default nlp.max_length (1,000,000) — confirm that
        # compare_with_existing_pdfs really analyses the whole input rather
        # than hiding an error.
        large_text = self.test_text * 1000
        similarity_results_large = compare_with_existing_pdfs(large_text, self.pdf_features)
        self.assertGreater(len(similarity_results_large), 0, "Similarity results should not be empty for large input.")

    def test_comparison_accuracy(self):
        """
        Test that the similarity scores make sense. For example, pdf4 should have high similarity with pdf1
        if they are written in a similar style.

        The asserted bounds are: score > 0.4 for the "similar" sentence and
        score < 0.5 for the "different" sentence.
        """
        # Text that is expected to resemble pdf1.
        pdf4_text_similar_to_pdf1 = "This document is written in a similar style to pdf1 with appropriate lexical diversity."
        similarity_results_similar = compare_with_existing_pdfs(pdf4_text_similar_to_pdf1, self.pdf_features)

        # The asserted lower bound for the "similar" text is 0.4.
        self.assertGreater(similarity_results_similar['pdf1'], 0.4, "Similarity with pdf1 should be high.")

        # Text that is expected not to resemble pdf1 (a very different writing style).
        pdf4_text_different = "This is a radically different document with unique structure and vocabulary."
        similarity_results_different = compare_with_existing_pdfs(pdf4_text_different, self.pdf_features)

        # The asserted upper bound for the "different" text is 0.5.
        # NOTE(review): a score between 0.4 and 0.5 satisfies both the "high"
        # and the "low" assertion, so the two bounds do not separate the cases.
        self.assertLess(similarity_results_different['pdf1'], 0.5, "Similarity with pdf1 should be low.")

# Run the unittest test cases defined so far when this code is executed
# directly (the captured output below reports 20 tests for this run).
if __name__ == "__main__":
    # NOTE(review): `sys` is not referenced anywhere in this block — possibly a
    # leftover; confirm no later cell relies on it before removing.
    import sys
    # A placeholder argv keeps unittest from parsing the host process's own
    # command-line arguments, and exit=False stops unittest.main() from calling
    # sys.exit() when the run finishes, so execution continues afterwards.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

....................
----------------------------------------------------------------------
Ran 20 tests in 1.444s

OK
