In [3]:
import sys
import os
from langchain.document_loaders import PyMuPDFLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langchain.embeddings.openai import OpenAIEmbeddings
sys.path.append('../')
from langchain.vectorstores import Chroma
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
# Absolute path of the parent directory ("../"); later cells join data paths onto it.
# NOTE(review): the name `dir` shadows Python's builtin `dir()` for the rest of the
# notebook. A name such as `project_root` would be safer, but the PDF-loading cell
# reads `dir`, so both would have to change together.
dir = os.path.abspath("../")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Locate the sample PDF underneath the base directory, then read it with PyMuPDF.
file_path = os.path.join(
    dir,
    "storage/test_pdfs/transformers_vasvani.pdf",
)
loader = PyMuPDFLoader(file_path)
documents = loader.load()

In [5]:
# Split the loaded documents into chunks and keep a newline-joined copy of all chunk text.
# NOTE(review): SentenceTransformersTokenTextSplitter appears to size its chunks via a
# `tokens_per_chunk` argument; `chunk_size=2048` is presumably forwarded to the base
# text splitter and may have no effect on the token-based split — confirm against the
# langchain documentation before relying on 2048-token chunks.
text_splitter = SentenceTransformersTokenTextSplitter(chunk_size=2048, chunk_overlap=50)
texts = text_splitter.split_documents(documents)
# One string containing every chunk's text, separated by newlines.
full_doc = "\n".join([text.page_content for text in texts])

In [6]:
text_list = [text.page_content for text in texts]
doc_list = [doc.page_content for doc in documents]

In [7]:
import re
def process_data(data, remove_new_lines=True):
    """Normalise a list of raw text strings.

    Each string goes through the same steps, in this order:

    1. Drop every whitespace-delimited token that contains ``@`` (e-mail addresses),
       together with one trailing whitespace character.
    2. Optionally collapse each run of whitespace (newlines included) into one space.
    3. Remove single quotes.
    4. Replace each run of characters other than ASCII letters, digits, commas and
       newlines with a single space. This also strips periods and non-ASCII letters.

    Args:
        data: Iterable of strings to clean.
        remove_new_lines: When True (default) newlines are folded into spaces in
            step 2; when False they survive, because step 4 keeps ``\\n``.

    Returns:
        A list with one cleaned string per input string, in the same order.
    """
    # Raw strings: '\S' and '\s' inside a normal string literal are invalid escape
    # sequences and trigger a SyntaxWarning on recent Python versions.
    email_pattern = r"\S*@\S*\s?"
    whitespace_pattern = r"\s+"
    quote_pattern = r"'"
    disallowed_pattern = r"[^a-zA-Z0-9,\n]+"

    # Remove e-mail-like tokens.
    ans = [re.sub(email_pattern, "", sent) for sent in data]

    # Fold whitespace runs (including newlines) into single spaces.
    if remove_new_lines:
        ans = [re.sub(whitespace_pattern, " ", sent) for sent in ans]

    # Remove distracting single quotes.
    ans = [re.sub(quote_pattern, "", sent) for sent in ans]

    # Replace everything outside the allowed character set with a space.
    ans = [re.sub(disallowed_pattern, " ", sent) for sent in ans]
    return ans

In [8]:
doc_list_processed = process_data(doc_list)
text_list_processed = process_data(text_list)
full_doc_processed = " ".join(doc_list_processed)

In [10]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute a class-based TF-IDF matrix for a list of (joined) documents.

    Args:
        documents: List of strings; each string is treated as one class/topic.
        m: Numerator of the IDF term, i.e. ``idf = log(m / total_term_count)``.
        ngram_range: Forwarded to ``CountVectorizer``.

    Returns:
        A tuple ``(tf_idf, count)`` where ``tf_idf`` has shape
        ``(n_terms, n_documents)`` and ``count`` is the fitted ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    # Dense term counts, shape (n_documents, n_terms).
    t = count.transform(documents).toarray()
    # Total number of counted terms per document.
    w = t.sum(axis=1)
    # Term frequency per document. A document left with no terms after stop-word
    # removal has w == 0; its column is kept at 0 rather than becoming NaN.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)
    # Total count of each term across all documents (always >= 1 for fitted terms).
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

### Vectordb

In [11]:
texts[0].page_content

'attention is all you need ashish vaswani∗ google brain avaswani @ google. com noam shazeer∗ google brain noam @ google. com niki parmar∗ google research nikip @ google. com jakob uszkoreit∗ google research usz @ google. com llion jones∗ google research llion @ google. com aidan n. gomez∗ † university of toronto aidan @ cs. toronto. edu łukasz kaiser∗ google brain lukaszkaiser @ google. com illia polosukhin∗ ‡ illia. polosukhin @ gmail. com abstract the dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. the best performing models also connect the encoder and decoder through an attention mechanism. we propose a new simple network architecture, the transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring signiﬁcantly le

In [68]:
# vectordb = Chroma.from_documents(documents=texts, embedding=SentenceTransformerEmbeddings())
# Clean every chunk in place so that the vector store indexes the processed text.
for text in texts:
    text.page_content = process_data([text.page_content])[0]

# SECURITY: an API key must never be hard-coded in a notebook. The key that was
# previously committed on this line has to be treated as leaked and revoked.
# Take the key from the environment; prompt (without echo) only if it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [98]:
vectordb = Chroma.from_documents(documents= texts, embedding=OpenAIEmbeddings())

In [99]:
vectordb.similarity_search_with_score("write a blog post on this", k=5)

[(Document(lc_kwargs={'page_content': 'attention is all you need ashish vaswani google brain avaswani google com noam shazeer google brain noam google com niki parmar google research nikip google com jakob uszkoreit google research usz google com llion jones google research llion google com aidan n gomez university of toronto aidan cs toronto edu ukasz kaiser google brain lukaszkaiser google com illia polosukhin illia polosukhin gmail com abstract the dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder the best performing models also connect the encoder and decoder through an attention mechanism we propose a new simple network architecture, the transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring signi cantly less ti

### Zero-shot classification – doesn't work without fine-tuning

In [107]:
from transformers import pipeline
mname="facebook/bart-large-mnli"
classifier = pipeline("zero-shot-classification", model=mname)

In [120]:
classifier("Make a quiz with 3 questions based on these pdfs. Add images to make it pretty", ["look in the whole document", "look for a specific part of document"])

{'sequence': 'Make a quiz with 3 questions based on these pdfs. Add images to make it pretty',
 'labels': ['look for a specific part of document',
  'look in the whole document'],
 'scores': [0.7734959721565247, 0.22650405764579773]}

### Clustering

In [92]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode([text.page_content for text in texts], show_progress_bar=True)

Batches: 100%|██████████| 2/2 [00:00<00:00,  2.86it/s]


In [95]:
import umap
umap_embeddings = umap.UMAP(n_neighbors=15, 
                            n_components=5, 
                            metric='cosine').fit_transform(embeddings)

In [98]:
tf_idf, count = c_tf_idf([doc.page_content for doc in documents], m=len(documents))

### BERTopic

In [69]:
from bertopic import BERTopic
from bertopic.dimensionality import BaseDimensionalityReduction
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired, OpenAI
import openai
# ctfidf_model = ClassTfidfTransformer()
# topic_model = BERTopic(ctfidf_model=ctfidf_model )
# set openai api key

kbm = KeyBERTInspired()
openai.api_key = os.environ["OPENAI_API_KEY"]
gpt = OpenAI()
# need to fit this to some data...

In [137]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA
from bertopic.vectorizers import OnlineCountVectorizer, ClassTfidfTransformer

# Prepare sub-models that support online learning
# NOTE(review): `umap_model` (IncrementalPCA) is created here but never used in this
# cell — the BERTopic constructor below receives `BaseDimensionalityReduction()`
# instead. Confirm which dimensionality reduction was intended.
umap_model = IncrementalPCA(n_components=10)
cluster_model = MiniBatchKMeans(n_clusters=5, random_state=0)
vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=.01)
# NOTE(review): `n_gram_range` is presumably ignored when a custom `vectorizer_model`
# is supplied — confirm in the BERTopic documentation.
topic_model = BERTopic(representation_model=kbm,
                    #    ctfidf_model= ClassTfidfTransformer(),
                       umap_model=BaseDimensionalityReduction(),
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model, 
                       n_gram_range=(1, 3),)
# Incrementally fit the topic model on the cleaned chunk texts.
topic_model.partial_fit(text_list_processed)

<bertopic._bertopic.BERTopic at 0x29bcf9b20>

In [138]:
len(topic_model.get_topics()), topic_model.generate_topic_labels()

(5,
 ['0_attention_neural_memory',
  '1_models_optimizer_model',
  '2_attentions_attention_layer5',
  '3_attention_encoder_softmax',
  '4_transformer_encoding_parser'])

In [139]:
topic_distr, _ = topic_model.approximate_distribution(text_list_processed)

In [140]:
topic_model.visualize_distribution(topic_distr[1])

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [104]:
topic_model.find_topics("write a blog post on how transformers are trained", top_n=5)

([1, 4, 0, 3, 2], [0.35789427, 0.33362287, 0.32280946, 0.317675, 0.2204592])

In [155]:
# rank text_processed_list based on similarity with query
topic_model.find_topics("write a blog post on how the causal attention works", top_n=5)

([0, 2, 1, 3, 4], [0.3414302, 0.34117335, 0.27029964, 0.22633791, 0.1809462])

In [141]:
topic_model.find_topics(text_list_processed[1], top_n=5)

([0, 3, 4, 1, 2], [0.5033019, 0.44644505, 0.4347789, 0.38372797, 0.37768108])

In [142]:
topic_distr.shape

(36, 5)

In [151]:
T = topic_model.get_document_info(text_list_processed)
docs_per_topics = T.groupby(["Topic"]).apply(lambda x: x.index).to_dict()

In [152]:
docs_per_topics

{0: Int64Index([0, 1, 2, 3, 4, 14, 24, 25, 26, 27, 28, 29, 30, 31, 32], dtype='int64'),
 1: Int64Index([16, 19, 20], dtype='int64'),
 2: Int64Index([33, 34, 35], dtype='int64'),
 3: Int64Index([5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17], dtype='int64'),
 4: Int64Index([18, 21, 22, 23], dtype='int64')}

### LDA

In [1]:
import spacy
#loading the english language small model of spacy
en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words
stopwords = stopwords.union({'from', 'subject', 're', 'edu', 'use', 'et', 'al', 'eos'})

def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim's ``simple_preprocess``.

    Every item is converted with ``str`` first, and one token list is yielded per
    item. ``deacc=True`` is handed to gensim unchanged (see the gensim docs for
    its exact effect).
    """
    yield from (
        gensim.utils.simple_preprocess(str(item), deacc=True)
        for item in sentences
    )

In [9]:
# remove stopwords
import gensim
text_list_processed_gensim = list(sent_to_words(text_list_processed))
words = [[word for word in text if word not in stopwords] for text in text_list_processed_gensim]
len(words), len(text_list_processed_gensim), len(text_list_processed)

(36, 36, 36)

In [10]:
words

[['attention',
  'need',
  'ashish',
  'vaswani',
  'google',
  'brain',
  'avaswani',
  'google',
  'com',
  'noam',
  'shazeer',
  'google',
  'brain',
  'noam',
  'google',
  'com',
  'niki',
  'parmar',
  'google',
  'research',
  'nikip',
  'google',
  'com',
  'jakob',
  'uszkoreit',
  'google',
  'research',
  'usz',
  'google',
  'com',
  'llion',
  'jones',
  'google',
  'research',
  'llion',
  'google',
  'com',
  'aidan',
  'gomez',
  'university',
  'toronto',
  'aidan',
  'cs',
  'toronto',
  'ukasz',
  'kaiser',
  'google',
  'brain',
  'lukaszkaiser',
  'google',
  'com',
  'illia',
  'polosukhin',
  'illia',
  'polosukhin',
  'gmail',
  'com',
  'abstract',
  'dominant',
  'sequence',
  'transduction',
  'models',
  'based',
  'complex',
  'recurrent',
  'convolutional',
  'neural',
  'networks',
  'include',
  'encoder',
  'decoder',
  'best',
  'performing',
  'models',
  'connect',
  'encoder',
  'decoder',
  'attention',
  'mechanism',
  'propose',
  'new',
  'simp

In [11]:
# apply tfidf
from gensim import corpora
from gensim import models
dictionary = corpora.Dictionary(words)
corpus = [dictionary.doc2bow(text) for text in words]

In [127]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
# print top 10 words of tfidf
from pprint import pprint
for doc in corpus_tfidf:
    pprint([(dictionary[w], s) for w , s in sorted(doc, key=lambda x: x[1], reverse=True)[:10]])
    break

[('google', 0.5669326680520711),
 ('com', 0.43503834052287116),
 ('brain', 0.1864450030812305),
 ('polosukhin', 0.15410455943296217),
 ('toronto', 0.15410455943296217),
 ('bleu', 0.12733917080866194),
 ('research', 0.12733917080866194),
 ('aidan', 0.12429666872082032),
 ('illia', 0.12429666872082032),
 ('llion', 0.12429666872082032)]


In [128]:
from pprint import pprint
# Number of LDA topics to fit.
num_topics = 5
# Build the LDA model. A fixed random_state keeps the fitted topics repeatable across
# kernel restarts; without it every run produces different topics.
# NOTE(review): with several worker processes results may still differ slightly
# between runs — confirm if exact reproducibility matters.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=dictionary,
                                       num_topics=num_topics,
                                       random_state=0)

In [129]:
# Print the top keywords of each fitted topic (the model has `num_topics` = 5 topics, not 10)
pprint(lda_model.print_topics())
# Apply the fitted model to the whole bag-of-words corpus.
doc_lda = lda_model[corpus]

[(0,
  '0.014*"model" + 0.013*"arxiv" + 0.013*"attention" + 0.010*"models" + '
  '0.009*"training" + 0.009*"translation" + 0.009*"neural" + '
  '0.009*"transformer" + 0.008*"english" + 0.008*"sequence"'),
 (1,
  '0.032*"attention" + 0.014*"layer" + 0.011*"decoder" + 0.011*"encoder" + '
  '0.011*"sequence" + 0.011*"output" + 0.010*"layers" + 0.010*"model" + '
  '0.010*"self" + 0.010*"models"'),
 (2,
  '0.015*"attention" + 0.014*"input" + 0.013*"layer" + 0.011*"pad" + '
  '0.010*"model" + 0.008*"models" + 0.008*"opinion" + 0.008*"sequence" + '
  '0.008*"perfect" + 0.008*"missing"'),
 (3,
  '0.014*"attention" + 0.010*"output" + 0.009*"sequence" + 0.008*"layer" + '
  '0.007*"self" + 0.007*"function" + 0.007*"values" + 0.006*"neural" + '
  '0.006*"models" + 0.006*"model"'),
 (4,
  '0.014*"attention" + 0.012*"model" + 0.009*"training" + 0.008*"transformer" '
  '+ 0.007*"english" + 0.006*"models" + 0.006*"neural" + 0.006*"arxiv" + '
  '0.006*"dot" + 0.006*"translation"')]


In [None]:
[lda_model[i] for i in corpus]

In [134]:
topic_data = [" ".join([dictionary[i] for i, _ in lda_model.get_topic_terms(j)]) for j in range(num_topics)]
smart_query = " ".join(topic_data)

In [None]:
vectordb.similarity_search_with_score("Explain the causal attention with images in a blog post", k=5)
# Generate quiz blog post questions pretty image margin

In [None]:
vectordb.similarity_search_with_score(smart_query, k=5)

In [316]:
vectordb.similarity_search_with_score(smart_query, k=5)[0][0].page_content in text_list_processed

True

In [299]:
from gensim import similarities
index = similarities.MatrixSimilarity(lda_model[corpus])
import numpy as np

In [211]:
def hellinger(lda_vec1, lda_vec2, lda_model):
    """Hellinger distance between two sparse LDA topic vectors.

    Both vectors are expanded to dense arrays of length ``lda_model.num_topics``
    and the distance ``sqrt(0.5 * sum((sqrt(p) - sqrt(q)) ** 2))`` is returned.
    """
    n_topics = lda_model.num_topics
    root_p, root_q = (
        np.sqrt(gensim.matutils.sparse2full(sparse_vec, n_topics))
        for sparse_vec in (lda_vec1, lda_vec2)
    )
    squared_gap = (root_p - root_q) ** 2
    return np.sqrt(0.5 * squared_gap.sum())

def get_most_similar_documents(processed_query, corpus, lda_model, top_n=5, id2word=None):
    """Rank corpus documents by Hellinger distance to a query in LDA topic space.

    Args:
        processed_query: List of tokens (already cleaned and stop-word filtered).
        corpus: Sequence of bag-of-words documents.
        lda_model: Fitted LDA model used to infer topic vectors.
        top_n: How many of the closest documents to return (default 5).
        id2word: Object providing ``doc2bow`` for the query tokens. Defaults to
            ``lda_model.id2word`` so the function no longer depends on a notebook
            global being defined.

    Returns:
        ``(idx, dists)``: indices into ``corpus`` of the ``top_n`` closest documents,
        ordered from smallest to largest distance, and their distances.
    """
    vocab = lda_model.id2word if id2word is None else id2word
    query_bow = vocab.doc2bow(processed_query)
    query_lda = lda_model[query_bow]
    # Distance from the query to every document; a smaller value means more similar.
    dists = np.array([hellinger(query_lda, lda_model[doc_bow], lda_model) for doc_bow in corpus])
    idx = np.argsort(dists)[:top_n]
    return idx, dists[idx]

def process_query(query):
    """Turn a free-text query into a list of stop-word-free tokens.

    The query is cleaned with ``process_data``, tokenized with gensim's
    ``simple_preprocess`` (``deacc=True``) and filtered against ``stopwords``.
    """
    cleaned = process_data([query])[0]
    tokens = gensim.utils.simple_preprocess(str(cleaned), deacc=True)
    return [token for token in tokens if token not in stopwords]

def process_and_rank(query, corpus, lda_model):
    """Tokenize ``query``, print the tokens, and rank ``corpus`` against them.

    Returns whatever ``get_most_similar_documents`` returns for the processed query.
    """
    tokens = process_query(query)
    print(tokens)
    ranking = get_most_similar_documents(tokens, corpus, lda_model)
    return ranking

In [219]:
# Alternative queries — enable exactly one at a time.
# query = "Generate a quiz with 3 questions based on these pdfs. Add images to make it pretty"
# query = "Explain the causal attention with images in a blog post"
# query = "How do I calculate the attention weights in the transformer model"
query = "Explain how the transformer model works"
# Clean the query, drop stop words, and rank the chunks by LDA topic similarity.
idx, dists = process_and_rank(query, corpus, lda_model)
[(i, text_list_processed[i]) for i in idx], dists

['explain', 'transformer', 'model', 'works']


([(11,
   'usual learned linear transfor mation and softmax function to convert the decoder output to predicted next token probabilities in our model, we share the same weight matrix between the two embedding layers and the pre softmax linear transformation, similar to 30 in the embedding layers, we multiply those weights by dmodel 3 5 positional encoding since our model contains no recurrence and no convolution, in order for the model to make use of the order of the sequence, we must inject some information about the relative or absolute position of the 5'),
  (32,
   'et al google s neural machine translation system bridging the gap between human and machine translation arxiv preprint arxiv 1609 08144, 2016 39 jie zhou, ying cao, xuguang wang, peng li, and wei xu deep recurrent models with fast forward connections for neural machine translation corr, abs 1606 04199, 2016 40 muhua zhu, yue zhang, wenliang chen, min zhang, and jingbo zhu fast and accurate shift reduce constituent parsi

In [175]:
topic_data

['model arxiv attention models training translation neural transformer english sequence',
 'attention layer decoder encoder sequence output layers model self models',
 'attention input layer pad model models opinion sequence perfect missing',
 'attention output sequence layer self function values neural models model',
 'attention model training transformer english models neural arxiv dot translation']

In [289]:
# call ctfidf on lda topics 
tf_idf, count = c_tf_idf(topic_data, num_topics)
tf_idf # topics x vector

array([[0.05108256, 0.        , 0.        , 0.05108256, 0.05108256],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.16094379],
       [0.09162907, 0.        , 0.        , 0.        , 0.09162907],
       [0.        , 0.09162907, 0.09162907, 0.        , 0.        ],
       [0.02231436, 0.02231436, 0.02231436, 0.        , 0.02231436],
       [0.09162907, 0.09162907, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.05108256, 0.        , 0.05108256, 0.05108256, 0.        ],
       [0.05108256, 0.        , 0.        , 0.05108256, 0.05108256],
       [0.02231436, 0.02231436, 0.        , 0.02231436, 0.02231436],
       [0.        , 0.16094379, 0.        , 0.        , 0.        ],
       [0.        , 0.16094379, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.16094379, 0.        ],
       [0.        , 0.05108256, 0.

### Junk

In [285]:
import pandas as pd
# NOTE(review): `data` is not defined by any visible cell (it only appears as a
# parameter name inside `process_data`), so this line presumably fails on a fresh
# kernel — confirm which list of documents was meant.
df = pd.DataFrame(data, columns=["Doc"])

In [1]:
df

NameError: name 'df' is not defined

In [None]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring words for every topic.

    Args:
        tf_idf: 2-D array indexed as ``[term, topic]`` (it is transposed internally).
        count: Fitted vectorizer exposing the vocabulary via
            ``get_feature_names_out()`` (or the legacy ``get_feature_names()``).
        docs_per_topic: Object whose ``Topic`` attribute lists the topic labels in
            the same order as the topic axis of ``tf_idf``.
        n: Number of words to keep per topic.

    Returns:
        Dict mapping each topic label to a list of ``(word, score)`` tuples sorted
        from highest to lowest score.
    """
    # `get_feature_names` was removed from scikit-learn in 1.2; prefer the
    # replacement and fall back only for old installations.
    if hasattr(count, "get_feature_names_out"):
        words = count.get_feature_names_out()
    else:
        words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T
    # argsort is ascending, so the last n columns hold the best-scoring terms.
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {
        label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1]
        for i, label in enumerate(labels)
    }
    return top_n_words

def extract_topic_sizes(df):
    """Count documents per topic.

    Expects ``Topic`` and ``Doc`` columns. Returns a frame with ``Topic`` and
    ``Size`` columns, ordered from the largest topic to the smallest.
    """
    docs_per_label = df.groupby(['Topic'])['Doc'].count()
    sizes = docs_per_label.reset_index().rename(columns={"Doc": "Size"})
    return sizes.sort_values("Size", ascending=False)

# NOTE(review): `docs_per_topic` and `docs_df` are not defined in any visible cell.
# An earlier cell defines `docs_per_topics` (a dict, which has no `.Topic` attribute)
# and another defines `df`, so this cell presumably raises NameError on a fresh
# kernel — confirm the intended inputs before running.
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=1)
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)