In [1]:
import pandas as pd
import langchain
from langchain.vectorstores import FAISS
from langchain.document_loaders import (
    TextLoader,
    NotebookLoader,
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader,
    UnstructuredWordDocumentLoader,
    
)
from langchain.text_splitter import CharacterTextSplitter
from tqdm import tqdm


from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
import faiss
from pathlib import Path

from langchain.text_splitter import RecursiveCharacterTextSplitter
import utils
from collections import Counter
import re
import json
import torch
from langchain.embeddings import OpenAIEmbeddings


In [2]:
from loadingDocuments import read_data

# Read data

In [3]:
# Location of the question/answer evaluation set (CSV), relative to the working directory.
qa_dataset_path = Path('eval_data/evaluation_dataset.csv')
# Bare last expression: whatever read_data returns is rendered as the cell output.
# Nothing is assigned here, so this cell is only a preview of the data.
read_data(qa_dataset_path)

Unnamed: 0,Question,Answer,Source
0,What is PyTorch?,It’s a Python based scientific computing packa...,notebook 3_1
1,What is the MNIST dataset?,MNIST is a dataset that is often used for benc...,notebook 3_4
2,Which optimizers are mentioned in the exercise...,Optimizer and learning rate:\nSGD + Momentum: ...,notebook 3_4
3,Describe the model given in the exercise noteb...,The provided code defines a PyTorch neural net...,notebook 3_3
4,What is the initial assignment in exercise not...,The first task is to use Kaiming He initializa...,notebook 3_4
5,What do we expect to learn from week4?,"In this lab, we will learn how to create your ...","notebook 4_1, notebook 4_2"
6,What is CIFAR-10 dataset?,The images in CIFAR-10 are RGB images (3 chann...,notebook 4_2
7,What are convolutional neural networks?,The standard ConvNets are organised into layer...,notebook 4_1
8,Can you provide some suggestions to improve th...,Tell us something like increase the depth of t...,notebook 4_2
9,What do RNN and LSTM stand for?,RNN stands for Reccurent Neural Network and LS...,notebook 5_1


# Experimenting with splits

## test/split course description

In [4]:
# Load every .txt file found (recursively) under knowledgeBase/GeneralInformation.
#
# The directory is assembled from path components instead of the literal
# ".\knowledgeBase\GeneralInformation": in a normal (non-raw) string "\k" and "\G"
# are invalid escape sequences (a warning on current Python versions) and the
# backslash separator only works on Windows. pathlib picks the right separator.
path = Path("knowledgeBase") / "GeneralInformation"
# glob() makes no ordering promise; sorting keeps the document order (and therefore
# the order of chunks added to the vector store later) stable between runs.
files = sorted(str(x) for x in path.glob('**/*.txt'))
print(files)
total_info_docs = []
for file in files:
    loader = TextLoader(file)
    # Only the first Document returned by the loader is kept for each file.
    total_info_docs.append(loader.load()[0])

['knowledgeBase\\GeneralInformation\\CourseOutline.txt', 'knowledgeBase\\GeneralInformation\\CoursePlan.txt', 'knowledgeBase\\GeneralInformation\\LearningObjectives.txt']


In [5]:
# Report the size of each general-information document as measured by
# utils.get_tokens_count, to judge how much splitting each one will need.
for doc in total_info_docs:
    source = doc.metadata['source']
    n_tokens = utils.get_tokens_count(doc.page_content)
    print(f"Doc: {source}  has {n_tokens} tokens")


Doc: knowledgeBase\GeneralInformation\CourseOutline.txt  has 274 tokens
Doc: knowledgeBase\GeneralInformation\CoursePlan.txt  has 4477 tokens
Doc: knowledgeBase\GeneralInformation\LearningObjectives.txt  has 174 tokens


In [6]:
# Chunk size and overlap. Both are measured with utils.get_tokens_count
# (not len), because that is the length function handed to the splitter below.
chunk_tokens, overlap_tokens = 350, 50

splitter_options = dict(
    chunk_size=chunk_tokens,
    chunk_overlap=overlap_tokens,
    length_function=utils.get_tokens_count,  # len
    add_start_index=True,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [7]:
# Split each general-information document on its own and collect all chunks.
total_info_chunks = []
for doc in total_info_docs:
    chunks = text_splitter.split_documents([doc])
    # 'id' is the chunk's position inside its own source document, so the
    # numbering restarts at 0 for every document.
    for idx in range(len(chunks)):
        chunks[idx].metadata['id'] = idx
    total_info_chunks += chunks
    print(f"Info doc {doc.metadata['source']} has {len(chunks)} chunks")

Info doc knowledgeBase\GeneralInformation\CourseOutline.txt has 1 chunks
Info doc knowledgeBase\GeneralInformation\CoursePlan.txt has 18 chunks
Info doc knowledgeBase\GeneralInformation\LearningObjectives.txt has 1 chunks


## test/split notebooks

In [8]:
# Course notebooks to index, listed explicitly (weeks 3, 4, 5, 7 and 8).
# The paths are raw strings with Windows-style backslash separators, relative to
# the working directory.
# NOTE(review): the 8.1 introduction notebook is commented out and therefore not
# indexed; the reason is not recorded here - confirm whether that is intentional.
notebook_paths = [
    r"knowledgeBase\Week3-Feed-forward-NeuralNetworks-in-PyTorch\Notebooks\3.1-what-is-pytorch.ipynb",
    r"knowledgeBase\Week3-Feed-forward-NeuralNetworks-in-PyTorch\Notebooks\3.2-automatic-differentiation.ipynb",
    r"knowledgeBase\Week3-Feed-forward-NeuralNetworks-in-PyTorch\Notebooks\3.3-FFN-Half-Moon.ipynb",
    r"knowledgeBase\Week3-Feed-forward-NeuralNetworks-in-PyTorch\Notebooks\3.4-EXE-FFN-MNIST.ipynb",
    r"knowledgeBase\Week4-Convolutional-Neural-Networks\Notebooks\4.1-CNN-Introduction.ipynb",
    r"knowledgeBase\Week4-Convolutional-Neural-Networks\Notebooks\4.2-EXE-CNN-CIFAR-10.ipynb",
    r"knowledgeBase\Week4-Convolutional-Neural-Networks\Notebooks\4.3-CNN-transfer.ipynb",
    r"knowledgeBase\Week5-Transformers-and-recurrent-neural-networks\Notebooks\5_1_EXE_deep_learning_with_transformers.ipynb",
    r"knowledgeBase\Week5-Transformers-and-recurrent-neural-networks\Notebooks\5_2_Recurrent_Neural_Networks_Nanograd.ipynb",
    r"knowledgeBase\Week5-Transformers-and-recurrent-neural-networks\Notebooks\5_3-Recurrent-Neural-Networks-Numpy.ipynb",
    r"knowledgeBase\Week7-Un-and-semi-supervised-learning\Notebooks\7.1-autoencoder.ipynb",
    r"knowledgeBase\Week7-Un-and-semi-supervised-learning\Notebooks\7.2-EXE-variational-autoencoder.ipynb",
    r"knowledgeBase\Week7-Un-and-semi-supervised-learning\Notebooks\7.3-generative-adversarial-networks.ipynb",
    r"knowledgeBase\Week7-Un-and-semi-supervised-learning\Notebooks\7.4-SUPP-flow-models.ipynb",
    #r"knowledgeBase\Week8-Reinforcement-learning\Notebooks\8.1_Introduction.ipynb",
    r"knowledgeBase\Week8-Reinforcement-learning\Notebooks\8.2_Prerequisites.ipynb",
    r"knowledgeBase\Week8-Reinforcement-learning\Notebooks\8.3-EXE_Policy_Gradient.ipynb",
    r"knowledgeBase\Week8-Reinforcement-learning\Notebooks\8.4_Q-Network.ipynb",
    r"knowledgeBase\Week8-Reinforcement-learning\Notebooks\8.5_Deep_Q-network.ipynb"
]

In [9]:
# Load every listed notebook as a single Document (cell outputs excluded) and
# report its size as measured by utils.get_tokens_count.
total_notebook_docs = []
for notebook_path in map(Path, notebook_paths):
    loader = NotebookLoader(
        str(notebook_path),
        include_outputs=False,
        max_output_length=20,
        remove_newline=True,
    )
    # Only the first Document returned by the loader is kept for each notebook.
    doc_notebook = loader.load()[0]
    total_notebook_docs.append(doc_notebook)
    token_count = utils.get_tokens_count(doc_notebook.page_content)
    print(f"Notebok: {notebook_path} has {token_count} tokens")

Notebok: knowledgeBase\Week3-Feed-forward-NeuralNetworks-in-PyTorch\Notebooks\3.1-what-is-pytorch.ipynb has 1451 tokens
Notebok: knowledgeBase\Week3-Feed-forward-NeuralNetworks-in-PyTorch\Notebooks\3.2-automatic-differentiation.ipynb has 1154 tokens
Notebok: knowledgeBase\Week3-Feed-forward-NeuralNetworks-in-PyTorch\Notebooks\3.3-FFN-Half-Moon.ipynb has 4221 tokens
Notebok: knowledgeBase\Week3-Feed-forward-NeuralNetworks-in-PyTorch\Notebooks\3.4-EXE-FFN-MNIST.ipynb has 3678 tokens
Notebok: knowledgeBase\Week4-Convolutional-Neural-Networks\Notebooks\4.1-CNN-Introduction.ipynb has 3552 tokens
Notebok: knowledgeBase\Week4-Convolutional-Neural-Networks\Notebooks\4.2-EXE-CNN-CIFAR-10.ipynb has 3298 tokens
Notebok: knowledgeBase\Week4-Convolutional-Neural-Networks\Notebooks\4.3-CNN-transfer.ipynb has 3939 tokens
Notebok: knowledgeBase\Week5-Transformers-and-recurrent-neural-networks\Notebooks\5_1_EXE_deep_learning_with_transformers.ipynb has 19766 tokens
Notebok: knowledgeBase\Week5-Transfor

  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_data = filtered_data.applymap(remove_newlines)
  filtered_dat

In [10]:
def parse_notebook_content(content):
    """Split a flattened notebook string into a list of typed cells.

    The text is cut at every "'markdown' cell: " / "'code' cell: " marker
    (presumably the format produced by the notebook loader used in this
    file - confirm). Anything before the first marker is discarded.

    Args:
        content (str): Flattened notebook text.

    Returns:
        list of dict: One ``{'type': <'markdown'|'code'>, 'content': [str, ...]}``
        entry per marker, in order of appearance. An input without any marker
        yields an empty list.
    """
    # With a capture group, re.split interleaves the captured cell type with the
    # text that follows it; dropping element 0 leaves (type, body) pairs.
    pieces = re.split(r"\'(markdown|code)\' cell: ", content)[1:]

    parsed_content = []
    for cell_type, raw_cell in zip(pieces[0::2], pieces[1::2]):
        # Everything between the first three and the last twenty characters is
        # kept verbatim; only inside the last twenty characters is "']'"
        # collapsed to "'".
        head = raw_cell[3:-20]
        tail = raw_cell[-20:].replace("']'", "'")
        # Drop literal backslash-n sequences, then split on the "', '" separator.
        cell_items_list = (head + tail).replace("\\n", "").split("', '")

        parsed_content.append({'type': cell_type, 'content': cell_items_list})
    return parsed_content


def get_parsed_notebook_text(parsed_content):
    """Render parsed notebook cells as plain text.

    Each cell contributes a "<type>:" header line followed by one line per
    content item; every line ends with a newline character.

    Args:
        parsed_content (list of dict): Cells with 'type' (str) and
            'content' (list of str) keys.

    Returns:
        str: The concatenated text; an empty string for an empty list.
    """
    lines = []
    for cell in parsed_content:
        lines.append(cell['type'] + ":\n")
        lines.extend(text + "\n" for text in cell['content'])
    return "".join(lines)

In [12]:
# for notebook_doc in total_notebook_docs:
#     notebook_doc.page_content = get_parsed_notebook_text(parse_notebook_content(notebook_doc.page_content))

In [13]:
# Split each notebook document on its own and collect all chunks.
total_notebook_chunks = []
for doc in total_notebook_docs:
    chunks = text_splitter.split_documents([doc])
    # 'id' restarts at 0 for every notebook: it is the chunk's position within
    # that notebook, not a global identifier.
    for idx, chunk in enumerate(chunks):
        chunk.metadata.update(id=idx)
    total_notebook_chunks += chunks
    print(f"Notebook {doc.metadata['source']} has {len(chunks)} chunks")

Notebook knowledgeBase\Week3-Feed-forward-NeuralNetworks-in-PyTorch\Notebooks\3.1-what-is-pytorch.ipynb has 6 chunks
Notebook knowledgeBase\Week3-Feed-forward-NeuralNetworks-in-PyTorch\Notebooks\3.2-automatic-differentiation.ipynb has 5 chunks
Notebook knowledgeBase\Week3-Feed-forward-NeuralNetworks-in-PyTorch\Notebooks\3.3-FFN-Half-Moon.ipynb has 18 chunks
Notebook knowledgeBase\Week3-Feed-forward-NeuralNetworks-in-PyTorch\Notebooks\3.4-EXE-FFN-MNIST.ipynb has 17 chunks
Notebook knowledgeBase\Week4-Convolutional-Neural-Networks\Notebooks\4.1-CNN-Introduction.ipynb has 18 chunks
Notebook knowledgeBase\Week4-Convolutional-Neural-Networks\Notebooks\4.2-EXE-CNN-CIFAR-10.ipynb has 15 chunks
Notebook knowledgeBase\Week4-Convolutional-Neural-Networks\Notebooks\4.3-CNN-transfer.ipynb has 20 chunks
Notebook knowledgeBase\Week5-Transformers-and-recurrent-neural-networks\Notebooks\5_1_EXE_deep_learning_with_transformers.ipynb has 84 chunks
Notebook knowledgeBase\Week5-Transformers-and-recurrent-

# Create vector store

## OpenAI embeddings

In [14]:
# from loadingDocuments import Loading_files
# text_splitter = CharacterTextSplitter(
#     separator="\n\n",
#     chunk_size=1000,
#     chunk_overlap=100,
#     length_function=len,
#     add_start_index=True
# )
# folder_paths = [
#     r"knowledgeBase\GeneralInformation",
#     r"knowledgeBase\Week3-Feed-forward-NeuralNetworks-in-PyTorch\Notebooks",
#     r"knowledgeBase\Week4-Convolutional-Neural-Networks\Notebooks",
#     r"knowledgeBase\Week5-Transformers-and-recurrent-neural-networks\Notebooks"
#     ]
# all_documents = []

# for folder_path in folder_paths:
#     filePaths = Loading_files(folder_path)
#     loaders = [read_data(file_path) for file_path in filePaths]

#     for loader in loaders:
#         print("Loading raw document..." + loader.file_path)
#         raw_documents = loader.load()

#         print("Splitting text...")
#         text_splitter = CharacterTextSplitter(
#             separator="\n\n",
#             chunk_size=1000,
#             chunk_overlap=100,
#             length_function=len,
#             add_start_index=True
#         )
#         documents = text_splitter.split_documents(raw_documents)
#         for idx, chunk in enumerate(documents):
#             chunk.metadata['id'] = idx
#         all_documents.extend(documents)


# embeddings = OpenAIEmbeddings()
# vectorstore = FAISS.from_documents(all_documents,embeddings)


In [15]:
# NOTE(review): despite the "hf_" prefix, this name is bound to an OpenAIEmbeddings
# instance in this cell; the name is kept because the vector-store cell refers to it.
hf_embedder = OpenAIEmbeddings()
# Short label for the embedding backend; it becomes part of the vector store name.
model_name="openai"

## Set up transformer

In [16]:
# print(torch.cuda.is_available())  # Should return True if CUDA is available

# if torch.cuda.is_available():
#     torch.set_default_tensor_type(torch.cuda.FloatTensor)
#     print("Using CUDA")

In [17]:
# #model_name = "sentence-transformers/all-mpnet-base-v2"
# model_name = "BAAI/bge-small-en-v1.5"
# hf_embedder = HuggingFaceEmbeddings(model_name=model_name) # "sentence-transformers/all-MiniLM-L6-v2"
# #if torch.cuda.is_available():
# #    hf_embedder = hf_embedder.to('cuda')
# model_name = model_name.split("/")[-1]

## Set up vector store

In [18]:
# Build the FAISS index over general-information chunks followed by notebook chunks.
#
# The combined list is a new object rather than an in-place
# `total_info_chunks.extend(...)`: extending in place made this cell
# non-idempotent, because every re-run appended the notebook chunks again and
# indexed (and paid to embed) duplicates.
all_chunks = total_info_chunks + total_notebook_chunks
vectorstore = FAISS.from_documents(all_chunks, hf_embedder)

In [19]:
# The store name encodes the chunk size, chunk overlap and embedding label so
# that differently configured indexes land in different folders.
vectorstore_name = f"faiss_{chunk_tokens}_{overlap_tokens}_{model_name}_not-cleaned-notebook-contents-v1"
vector_store_path = "/".join(["vector_stores", vectorstore_name])
vectorstore.save_local(vector_store_path)

In [20]:
# Spot-check: show the 5 chunks retrieved for a sample question, each paired with its score.
vectorstore.similarity_search_with_score("What is PyTorch?", k=5)

[(Document(page_content="we will show you:', '* How to represent sequences of categorical variables', '* How to build and train an RNN in NumPy', '* How to build and train an LSTM network in NumPy', '* How to build and train an LSTM network in PyTorch']'", metadata={'source': 'knowledgeBase\\Week5-Transformers-and-recurrent-neural-networks\\Notebooks\\5_3-Recurrent-Neural-Networks-Numpy.ipynb', 'start_index': 1345, 'id': 1}),
  0.4372698),
 (Document(page_content='The purpose of this course is to give the student a detailed understanding of the deep artificial neural network models, their training, computational frameworks for deployment on fast graphical processing units, their limitations and how to formulate learning in a diverse range of settings. These settings include classification, regression, sequences and other types of structured input and outputs and for reasoning in complex environments.\n\nThe course outline is:\n1. Introduction to statistical machine learning, feed-forwa

# Retrieve relevant notebooks per evaluation question

In [21]:
# Number of chunks retrieved per evaluation question; also the largest k for which
# the recall@k columns are computed further down.
retrieval_k = 10

In [22]:
# Load the evaluation questions into the frame that the retrieval results are written to.
qa_dataset_path = Path('eval_data/evaluation_dataset.csv')
evaluation_df = read_data(qa_dataset_path)
# Displayed as the cell output to show which columns are available.
evaluation_df.columns

Index(['Question', 'Answer', 'Source'], dtype='object')

In [23]:
# For every evaluation question, fetch the top `retrieval_k` chunks and store
# their source, per-document chunk id, text and score in columns numbered by
# rank (1 = first result returned).
progress = tqdm(evaluation_df.iterrows(), total=evaluation_df.shape[0])
for index, row in progress:
    query = row['Question']
    retrieved_docs = vectorstore.similarity_search_with_score(query, k=retrieval_k)
    for rank, (document, score) in enumerate(retrieved_docs, start=1):
        column_prefix = f'Retrived_doc_{rank}'
        evaluation_df.at[index, f'{column_prefix}_source'] = document.metadata['source']
        evaluation_df.at[index, f'{column_prefix}_id'] = int(document.metadata['id'])
        evaluation_df.at[index, f'{column_prefix}_content'] = document.page_content
        evaluation_df.at[index, f'{column_prefix}_score'] = score


100%|██████████| 20/20 [00:04<00:00,  4.24it/s]


# Evaluating results

In [24]:
def convert_string_to_notebook_id(strings):
    """
    Convert file paths to simplified notebook identifiers.

    Notebook paths (anything ending in 'ipynb') become 'notebook <major>_<minor>',
    where the two numbers are taken from the start of the file name, e.g.
    '3.1-what-is-pytorch.ipynb' -> 'notebook 3_1' and
    '8.2_Prerequisites.ipynb' -> 'notebook 8_2'. Every other path is reduced
    to its bare file name.

    Both '\\' and '/' are accepted as path separators.

    Args:
    strings (iterable of str): File paths as strings.

    Returns:
    list of str: List of simplified notebook identifiers, in input order.
    """
    file_ids = []
    for string in strings:
        # Normalise separators first so that the last path component is found for
        # Windows-style and POSIX-style paths alike.
        file_name = string.replace('/', '\\').split('\\')[-1]
        if string.endswith('ipynb'):
            # Take the leading "<digits><.|_><digits>" directly. Splitting on '-'
            # and '_' alone mis-handled names such as '8.2_Prerequisites.ipynb',
            # where the extension and title words leaked into the identifier.
            numbered = re.match(r'(\d+)[._](\d+)', file_name)
            if numbered:
                notebook_number = f'{numbered.group(1)}_{numbered.group(2)}'
            else:
                # Fallback for names without a numeric prefix: keep the text
                # before the first '-', at most two '_'-separated pieces.
                notebook_number = file_name.split('-')[0]
                pieces = notebook_number.split('_')
                if len(pieces) > 1:
                    notebook_number = pieces[0] + "_" + pieces[1]
                notebook_number = notebook_number.replace('.', '_')
            file_id = f'notebook {notebook_number}'
        else:
            file_id = file_name
        file_ids.append(file_id)

    return file_ids

In [25]:
# Replace the raw file path in every retrieved-source column with its simplified
# identifier, so it can be compared with the dataset's 'Source' column.
for idx in range(1, retrieval_k + 1):
    source_column = f'Retrived_doc_{idx}_source'
    evaluation_df[source_column] = convert_string_to_notebook_id(evaluation_df[source_column])

In [26]:
# Per question and per rank k, store in 'recall@k' how many of the first k
# retrieved chunks come from one of the expected sources. The stored value is a
# running hit count, not yet a ratio.
total_rows = evaluation_df.shape[0]
for index, row in tqdm(evaluation_df.iterrows(), total=total_rows):
    # 'Source' may list several expected sources separated by commas.
    sources = list(map(str.strip, row['Source'].split(',')))
    hits_so_far = 0
    for rank in range(1, retrieval_k + 1):
        hits_so_far += int(row[f'Retrived_doc_{rank}_source'] in sources)
        evaluation_df.at[index, f'recall@{rank}'] = hits_so_far

100%|██████████| 20/20 [00:00<00:00, 2857.25it/s]


In [27]:
# Store evaluation_df, with its retrieval columns, next to the evaluation data.
#
# The path is built with pathlib instead of a backslash-separated string, which
# only names a nested location on Windows. The target directory is created
# first because to_csv fails when it does not exist.
evaluation_df_store_path = Path("eval_data") / "retrieval_data" / "{}_evaluation_dataset.csv".format(vectorstore_name)
evaluation_df_store_path.parent.mkdir(parents=True, exist_ok=True)
evaluation_df.to_csv(evaluation_df_store_path, index=False)

In [28]:
# Recall@k as a percentage: the share of evaluation questions for which at least
# one of the first k retrieved chunks comes from an expected source.
#
# Zero-hit questions are counted with a boolean mask. The previous
# `value_counts()[0]` lookup raised KeyError whenever no question had zero
# hits at a given k, which is exactly the best-case outcome.
recall_k = {}
total_queries = evaluation_df.shape[0]
for idx in range(1,retrieval_k+1):
    hit_counts = evaluation_df['recall@{}'.format(idx)]
    queries_with_no_relevant_docs = int((hit_counts == 0).sum())
    queries_with_relevant_docs = int(total_queries - queries_with_no_relevant_docs)
    # Guard against an empty evaluation set instead of dividing by zero.
    recall_k[idx] = round(queries_with_relevant_docs/total_queries * 100,2) if total_queries else 0.0

In [29]:
# Persist the recall@k percentages as JSON, keyed 'recall@1' ... 'recall@<retrieval_k>'.
result_json = {
    'recall@{}'.format(idx): recall_k[idx] for idx in range(1,retrieval_k+1)
}
# The path is built with pathlib instead of a backslash-separated string, which
# only names a nested location on Windows. The directory is created first
# because open() fails when it does not exist.
recall_k_json_name = Path("eval_data") / "retrieval_recall_results" / "recall_k_{}.json".format(vectorstore_name)
recall_k_json_name.parent.mkdir(parents=True, exist_ok=True)
with open(recall_k_json_name, 'w') as f:
    json.dump(result_json, f)
