# Installations
This cell installs the libraries required to run the analysis.

In [1]:
!pip install langchain chromadb openai tiktoken lark faiss-cpu



# Imports
Now we import all the required libraries into our notebook. They provide the functions and classes used throughout the code to run the different parts of the analysis.

In [2]:
# from google.colab import files
import getpass
import re, os
from operator import itemgetter

import openai, lark
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import XLMTokenizer, XLMWithLMHeadModel, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI, Ollama
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


# File Upload
Running this cell will allow you to upload a file from your local system to Colab for further analysis

Click on `Choose Files` and find your file locally

If you have multiple files, hold `ctrl`/`cmd` while selecting the second and any further files

In [None]:
# Colab-only upload helper. The import is done here, guarded, because
# `google.colab` exists only inside Colab: without the guard this cell raises
# NameError/ImportError and breaks "Restart & Run All" everywhere else.
try:
    from google.colab import files
except ImportError:
    files = None

if files is None:
    # Outside Colab there is nothing to upload.
    print("google.colab is not available; skipping file upload.")
    uploaded = {}
else:
    # Opens the browser file picker; maps each uploaded file name to its bytes.
    uploaded = files.upload()

for fn, content in uploaded.items():
    print(f'User uploaded file "{fn}" with length {len(content)} bytes')

Saving UK_08.txt to UK_08.txt
User uploaded file "UK_08.txt" with length 35668 bytes


# OpenAI API Key
Enter your private OpenAI API key. Be sure not to make it public.

In [4]:
# Never store the key in the notebook itself: a saved or shared .ipynb would leak it.
# Use the OPENAI_API_KEY environment variable when it is already set (and non-empty);
# otherwise prompt for it without echoing the input.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

# Expose the key both to the `openai` module and, through the environment,
# to any code that reads OPENAI_API_KEY from there.
openai.api_key = OPENAI_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Set Up
This is where we specify the chunk size, i.e. how much of the document's text goes into each chunk. The chunks most similar to your query are what get passed to _ChatGPT_ as context.

## Define our text splitter

In [3]:
# # Update this for multiple files (Only works on Colab)
# for fn in uploaded.keys():
#     loader = TextLoader(fn)

# loader = TextLoader("./../../data/interim/01-clean/UK_01.txt")
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (e.g. cp1252 on Windows).
# NOTE(review): assumes the .mmd file is UTF-8 encoded - confirm.
loader = TextLoader(
    "../../data/interim/00-pdf2text/nougat/0.1.0-small/UK_01.mmd",
    encoding="utf-8",
)

documents = loader.load()

# Splitter settings: maximum chunk length and the overlap shared by consecutive chunks.
chunk_size = 2000
chunk_overlap = 50
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
all_splits = text_splitter.split_documents(documents)

# Fail here, with a clear message, rather than later in the embedding/indexing cells.
assert all_splits, "No chunks were produced - is the source document empty?"

# Simplest Implementation

## Multi-lingual Embeddings
From HuggingFace (Instead of OpenAI's)

In [5]:
#### Use Hf for embeddings instead of hkunlp instructor large

# Plain text of every chunk, in the same order as `all_splits`.
input_texts = [split.page_content for split in all_splits]

### intfloat/multilingual-e5
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool hidden states along dim 1, ignoring masked-out positions.

    Args:
        last_hidden_states: per-position hidden states; assumed to be
            (batch, sequence, hidden) - TODO confirm against the model output.
        attention_mask: mask aligned with the leading dims of
            ``last_hidden_states``; non-zero marks positions to keep.

    Returns:
        The sum over dim 1 of the kept hidden states divided by the number of
        kept positions, i.e. one averaged vector per row.
    """
    # Broadcastable boolean mask with a trailing singleton dimension.
    keep = attention_mask.bool().unsqueeze(-1)
    # Zero the masked positions (out of place) so they add nothing to the sum.
    summed = last_hidden_states.masked_fill(~keep, 0.0).sum(dim=1)
    # Number of kept positions per row, shaped to broadcast over the hidden dim.
    kept_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return summed / kept_counts

# ## large
# tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
# model = AutoModel.from_pretrained('intfloat/multilingual-e5-large')

# ## base
# tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-base')
# model = AutoModel.from_pretrained('intfloat/multilingual-e5-base')

## small
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-small')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-small')

# ### sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
# model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# embeddings = model.encode(input_texts)

# The multilingual-e5 model card requires every input to start with "query: " or
# "passage: "; the document chunks are the passages being indexed.
e5_inputs = [f"passage: {text}" for text in input_texts]

# Tokenise all chunks as one padded batch; anything beyond 512 tokens is truncated.
batch_dict = tokenizer(e5_inputs, max_length=512, padding=True, truncation=True, return_tensors='pt')

# Inference only: without no_grad() the forward pass records an autograd graph
# for the whole batch, which needlessly multiplies memory use.
with torch.no_grad():
    outputs = model(**batch_dict)

# One vector per chunk: masked mean over tokens, then L2-normalised.
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
embeddings = F.normalize(embeddings, p=2, dim=1)

Downloading model.safetensors: 100%|██████████| 471M/471M [00:53<00:00, 8.83MB/s]


: 

## Hkunlp Embeddings
Instructor-large (Instead of OpenAI's)

In [None]:
# Build the hkunlp/instructor-large embedding wrapper.
# This rebinds `embeddings`, replacing the tensor computed in the multilingual-e5 cell.
# NOTE(review): in the cells visible here the vector store is built with
# OpenAIEmbeddings(), so this object does not appear to be consumed - confirm intent.
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")

## Rest of the pipeline

In [None]:
# Embed every chunk with OpenAI embeddings and index the result in a FAISS store.
openai_embedder = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(all_splits, embedding=openai_embedder)

# Retriever over that store, configured with k=10 results per query.
retriever_search_kwargs = {"k": 10}
retriever = vectorstore.as_retriever(search_kwargs=retriever_search_kwargs)

# Chat model used to answer from the retrieved chunks.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [None]:
# Question that is run through the retrieval QA chain.
query = "Create a list of all policy initiatives or programs mentioned in the document"

In [None]:
# Assemble a RetrievalQA chain from the chat model and the retriever, run the
# query through it, and display the answer as the cell's output.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
answer = qa_chain.run(query)
answer

"1. Action to improve basic skills\n2. Creation of a new system of technical education\n3. Addressing STEM shortages\n4. Identifying and addressing sector-specific skills shortages\n5. High quality careers information, advice, and guidance\n6. Testing new approaches to lifelong learning\n7. Institute for Apprenticeships and Technical Education\n8. Development of an Industrial Strategy\n9. British Council's support for skills development\n10. UK Skills Partnership\n11. Funding mechanisms for TVET\n12. '15 Routes' for technical and vocational education"