In [None]:
# IF USING GOOGLE COLAB
# Mount Google Drive only when the Colab runtime is available. Outside Colab
# the `google.colab` package is missing, so the import fails and this cell
# becomes a no-op instead of aborting a "Restart & Run All".
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
    drive.mount('/content/drive')

In [None]:
import os
import PyPDF2
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# `nltk` is already imported earlier in this cell, so it is not re-imported.
# Fetch the data the tokenizers and the stop-word list need. Recent NLTK
# releases (3.9+) load `punkt_tab` for sent_tokenize/word_tokenize instead of
# the older pickled `punkt` models, so both are requested.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

In [None]:
def extract_text(directory):
    """Concatenate the text of every PDF found directly inside ``directory``.

    Only entries whose name ends in ``.pdf`` are read; sub-directories are not
    traversed. Files are visited in sorted name order so the result is
    reproducible (``os.listdir`` gives no ordering guarantee). Pages that
    yield no text are skipped.

    Args:
        directory: Path of the folder to scan.

    Returns:
        One string holding the text of all pages, joined by single spaces.
        An empty string when no PDF text was found.
    """
    page_texts = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.pdf'):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as f:
            # PdfFileReader / getNumPages / getPage / extractText were removed
            # in PyPDF2 3.0; PdfReader, .pages and extract_text() replace them.
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
    return ' '.join(page_texts)


In [None]:
def preprocess_text(text):
    """Split ``text`` into sentences and normalise each one.

    Every sentence is lower-cased and word-tokenized; tokens that are not
    purely alphabetic or that are English stop words are dropped. Sentences
    left with no tokens are omitted from the result.

    Args:
        text: Raw text to process.

    Returns:
        List of cleaned sentences, each a space-joined string of kept tokens.
    """
    raw_sentences = sent_tokenize(text)
    english_stops = set(stopwords.words('english'))

    def _clean(sentence):
        # Keep alphabetic, non-stop-word tokens of the lower-cased sentence.
        tokens = word_tokenize(sentence.lower())
        return ' '.join(
            tok for tok in tokens if tok.isalpha() and tok not in english_stops
        )

    cleaned = (_clean(sentence) for sentence in raw_sentences)
    return [sentence for sentence in cleaned if sentence]


In [None]:
def compute_tfidf_vectors(sentences):
    """Fit a default ``TfidfVectorizer`` on ``sentences`` and return its output.

    Args:
        sentences: Iterable of text documents (one string per sentence).

    Returns:
        Whatever ``TfidfVectorizer().fit_transform(sentences)`` produces; the
        fitted vectorizer itself is not returned.
    """
    return TfidfVectorizer().fit_transform(sentences)


In [None]:
def _read_pdf_sentences(pdf_dir):
    """Return the sentences of every ``.pdf`` file directly inside ``pdf_dir``."""
    sentences = []
    # Sorted so sentence order (and therefore the report) is reproducible.
    for filename in sorted(os.listdir(pdf_dir)):
        if not filename.endswith('.pdf'):
            continue
        with open(os.path.join(pdf_dir, filename), 'rb') as f:
            for page in PyPDF2.PdfReader(f).pages:
                text = page.extract_text()
                if text:  # skip pages with no extractable text
                    sentences.extend(sent_tokenize(text))
    return sentences


def find_similar_sentences(pdf_dir, output_file, threshold=0.7, stop_words=None):
    """Write every pair of similar sentences found in a folder of PDFs.

    All sentences from the PDFs in ``pdf_dir`` are TF-IDF vectorized and
    compared pairwise with cosine similarity. Each pair ``(i, j)`` with
    ``i < j`` whose score is at least ``threshold`` is appended to
    ``output_file`` as a score line, the two sentences and a blank line.

    Args:
        pdf_dir: Folder containing the ``.pdf`` files (not searched
            recursively).
        output_file: Path of the text report; it is overwritten.
        threshold: Minimum cosine similarity for a pair to be reported.
        stop_words: Stop-word setting forwarded to ``TfidfVectorizer``.
            ``None`` (the default) selects the built-in ``'english'`` list;
            pass an explicit list to override it (an empty list disables
            stop-word filtering).

    Returns:
        None. The result is the content written to ``output_file``.

    Raises:
        ValueError: Propagated from ``TfidfVectorizer.fit_transform`` when
            the sentences contain no usable vocabulary (for example only
            stop words).
    """
    # The original built an English-stop-word vectorizer here and then
    # discarded it; apply that default to the vectorizer actually used.
    if stop_words is None:
        stop_words = 'english'

    sentences = _read_pdf_sentences(pdf_dir)

    if len(sentences) < 2:
        # Nothing to compare. Still produce an (empty) report rather than
        # letting the vectorizer fail on an empty corpus.
        with open(output_file, 'w', encoding='utf-8'):
            pass
        return

    vectorizer = TfidfVectorizer(stop_words=stop_words)
    tfidf_matrix = vectorizer.fit_transform(sentences)
    similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # UTF-8 so non-ASCII text extracted from PDFs never breaks the write on
    # platforms whose default encoding is narrower.
    with open(output_file, 'w', encoding='utf-8') as f:
        for i in range(len(sentences)):
            for j in range(i + 1, len(sentences)):
                score = similarities[i][j]
                if score >= threshold:
                    f.write(f'Similarity Score: {score}\n')
                    f.write(f'Sentence 1: {sentences[i]}\n')
                    f.write(f'Sentence 2: {sentences[j]}\n')
                    f.write('\n')

In [None]:
# Compare the sentences of all PDFs under /myPdfs (default threshold) and
# write the matching pairs to a report inside the same folder.
find_similar_sentences(
    pdf_dir='/myPdfs',
    output_file='/myPdfs/output.txt',
)