In [13]:
import sys
import os
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
# Add the parent directory (relative to the current working directory) to the
# import path; this has to run before the `utils` import below.
sys.path.append('../')
# from utils.nlp_trainers import LDATrainer
from utils.nlp_utils import process_data
# Absolute path of the parent directory, used as the base for the PDF paths.
# NOTE(review): the name `dir` shadows the Python builtin `dir()`; consider
# renaming it together with every cell that reads it.
dir = os.path.abspath("../")

In [14]:
import networkx as nx
import spacy
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Shared spaCy pipeline used by preprocess_spacy and query_graph.
nlp = spacy.load("en_core_web_sm")
# Shared bag-of-words vectorizer: it is fitted inside textrank_graph_lda_spacy
# and reused by query_graph, so queries only make sense after that fit has run.
vectorizer = CountVectorizer()

In [19]:
def preprocess_spacy(texts):
    """Split texts into sentences and remove stop words from each sentence.

    Each text is passed through the module-level spaCy pipeline ``nlp`` to
    find sentence boundaries. Every sentence is then parsed again on its own
    and its non-stop-word tokens are joined with single spaces.

    Args:
        texts: Iterable of raw text strings.

    Returns:
        A flat list with one string per detected sentence, in input order.
        A sentence made up entirely of stop words yields an empty string.
    """
    cleaned = []
    for raw_text in texts:
        for span in nlp(raw_text).sents:
            # Parse the sentence text as a standalone document before filtering.
            sentence_doc = nlp(span.text)
            kept = (tok.text for tok in sentence_doc if not tok.is_stop)
            cleaned.append(' '.join(kept))
    return cleaned

def textrank_graph_lda_spacy(texts, n_components=10, random_state=None):
    """Rank texts with PageRank over a graph of LDA topic similarities.

    The texts are vectorised with the module-level ``vectorizer`` (which is
    refitted here as a side effect), an LDA model is fitted on the resulting
    document-term matrix, and the pairwise dot products of the per-text topic
    distributions are used as edge weights of an undirected graph.

    Args:
        texts: Iterable of text strings; it is materialised into a list so
            that it can be traversed more than once.
        n_components: Number of LDA topics.
        random_state: Seed forwarded to ``LatentDirichletAllocation``. The
            default ``None`` keeps the unseeded behaviour; pass an int for
            reproducible topics and rankings.

    Returns:
        A tuple ``(graph, ranked_sentences, lda, lda_matrix)`` where
        ``ranked_sentences`` is a list of ``(score, text)`` pairs sorted by
        descending PageRank score, ``lda`` is the fitted model and
        ``lda_matrix`` is its topic distribution for each input text.
    """
    # A generator would be exhausted by the vectorizer before the ranking step.
    texts = list(texts)

    # Compute the document-term matrix
    dtm = vectorizer.fit_transform(texts)

    # Fit the LDA model
    lda = LatentDirichletAllocation(n_components=n_components, random_state=random_state)
    lda_matrix = lda.fit_transform(dtm)

    # Compute the similarity matrix
    similarity_matrix = lda_matrix @ lda_matrix.T
    # Zero the diagonal: a text's similarity with itself would otherwise become
    # a weighted self-loop that lets each node retain rank regardless of how it
    # relates to the other texts.
    np.fill_diagonal(similarity_matrix, 0.0)

    # Create the graph
    graph = nx.from_numpy_array(similarity_matrix)

    # Compute the PageRank scores
    scores = nx.pagerank(graph)

    # Sort the texts by their scores, highest first
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(texts)), reverse=True)

    return graph, ranked_sentences, lda, lda_matrix

In [16]:
# PDFs to ingest, resolved against the base directory stored in `dir`.
file_paths = [os.path.join(dir, "storage/test_pdfs/transformers_vasvani.pdf"),
              os.path.join(dir, "storage/test_pdfs/CS_781_Project.pdf"),
              os.path.join(dir, "storage/test_responses/test_html2pdf_out.pdf"),
              os.path.join(dir, "storage/test_responses/15_06_2023_19_09_38.pdf"),
              os.path.join(dir, "storage/test_responses/15_06_2023_22_05_23.pdf"),
              os.path.join(dir, "storage/test_responses/gpt2.pdf"),]

# Build the splitter once: its configuration does not depend on the file being
# processed, so constructing it inside the loop only repeats the setup work.
# NOTE(review): confirm that `chunk_size` is the setting this splitter actually
# uses to bound chunk length — TODO verify against the langchain version in use.
text_splitter = SentenceTransformersTokenTextSplitter(chunk_size=2048, chunk_overlap=50)

# Collect the text of every chunk from every PDF, in file order.
text_list = []
for file_path in file_paths:
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()
    texts = text_splitter.split_documents(documents)
    text_list.extend(text.page_content for text in texts)

text_list_processed = process_data(text_list)
len(text_list_processed)

133

In [17]:
# Split the processed chunks into sentences and drop stop words.
# NOTE: this rebinds the name to its own output, so re-running the cell
# preprocesses text that has already been preprocessed.
text_list_processed = preprocess_spacy(text_list_processed)

In [20]:
# Fit the vectorizer and LDA model on the preprocessed sentences and rank them.
# `lda` and `lda_matrix` are read later as globals by query_graph.
graph, ranked_sentences, lda, lda_matrix = textrank_graph_lda_spacy(text_list_processed)

In [None]:
# Sanity check: number of nodes in the similarity graph built above.
graph.number_of_nodes()

187

In [23]:
def query_graph(texts, query, k=5):
    """Return the ``k`` texts whose LDA topic mix is closest to ``query``.

    Relies on module-level state: the spaCy pipeline ``nlp``, the fitted
    ``vectorizer``, and the ``lda`` model with its ``lda_matrix``.

    Args:
        texts: Sequence of texts indexed by row position of ``lda_matrix``.
        query: Free-text query string.
        k: Maximum number of results to return.

    Returns:
        A tuple ``(top_k_sentences, scores)``: the selected texts in order of
        decreasing cosine similarity, and the matching rows of the similarity
        array (one row per selected text).
    """
    # Strip stop words from the query.
    kept_tokens = [tok.text for tok in nlp(query) if not tok.is_stop]
    cleaned_query = ' '.join(kept_tokens)

    # Project the query into the LDA topic space.
    bag_of_words = vectorizer.transform([cleaned_query])
    query_topics = lda.transform(bag_of_words)

    # Cosine similarity between every stored topic vector and the query.
    similarities = cosine_similarity(lda_matrix, query_topics)

    # Row indices ordered from most to least similar, truncated to k.
    ascending = np.argsort(similarities, axis=0)
    top_indices = ascending[::-1].flatten()[:k]

    top_k_sentences = []
    for idx in top_indices:
        top_k_sentences.append(texts[idx])

    return top_k_sentences, similarities[top_indices]

In [29]:
# Example query: the ten sentences closest to the question in LDA topic space.
query_graph(text_list_processed, "Difference between gpt2 and original transformers", k=10)

(['glue , especially unclear additional',
  'd wear denim , chinos , , recommend wearing variables involved t 3 dress nal',
  'bit traditional , mentioned , s low budget item expensive de nitely issue trying work casual sneakermaker , s ok',
  'know people opposed wearing sneakers turtleneck , wanted true , went slim tting turtleneck color similar favorite color day overall , s classic turtleneck , ashy',
  'ymax ylabel hold entire abstraction condition hold approximations deeppoly reduces problem checking l perturbations , observe phenomenon experiments 2 2 exact solver reluplex exact solver 1 approximation directly feed l1 constraints solver , feeding l1 norm constraints need 2pixels equations practically feasible observe region mere 16 pixels causes memory issues standard computer auxilliary 2',
  'd suggest wear sneakers khaki chino suit white linen suit sure ditch tie wear tee polo wearing tie sneakers bit small model completion t 2 high end t eye storm s hybrid asics streetwear l