In [3]:
# Standard library imports
import os
import re
import math
import json
from collections import Counter

# Third-party library imports
import numpy as np
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# LangChain imports
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore, Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# English stop words from the NLTK corpus, held in a set so the per-token
# membership test in clean() is a constant-time lookup
stop_words = set(stopwords.words('english'))


In [4]:
# Process the .env file so its entries are available to the environment
# lookups that follow (the cell output reports whether loading succeeded)
load_dotenv()

True

In [5]:
# Read the OpenAI key, the Pinecone key and the Pinecone index name from the
# environment in one pass; any variable that is not set comes back as None
openai_api_key, pinecone_api_key, pc_index = (
    os.getenv(variable_name)
    for variable_name in ('OPENAI_KEY', 'PINECONE_API_KEY', 'PINECONE_CONTEXT')
)

In [6]:
# function for tokenization and special character and stopword removal
def clean(data):
    """Strip punctuation and English stop words from a piece of text.

    Args:
        data: The text to clean. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing text.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` when the input
        is missing or no token survives the filtering.
    """
    # Missing values would otherwise be stringified into the literal tokens
    # "None" / "nan" and leak into the cleaned passage.
    if data is None or (isinstance(data, float) and math.isnan(data)):
        return ""

    # Drop every character that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', str(data))

    # Split the punctuation-free text into word tokens
    tokens = word_tokenize(text)

    # Stop-word comparison is case-insensitive; kept tokens retain their case
    kept_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Return the tokens as one space-separated string
    return " ".join(kept_tokens)

In [7]:
# general function for preprocessing data
def preprocess(data):
    """Build a single-column frame of cleaned passages from a raw dataset.

    Args:
        data: DataFrame that contains a 'context' column.

    Returns:
        A new DataFrame with one column, 'Passage', holding ``clean`` applied
        to every 'context' value, with a fresh 0..n-1 index. The input frame
        is left unmodified.

    Raises:
        KeyError: If ``data`` has no 'context' column.
    """
    # Derive the result from the cleaned Series instead of assigning into a
    # column subset of the caller's frame, which triggers pandas'
    # SettingWithCopyWarning.
    cleaned_passages = data['context'].apply(clean)

    # Discard the original row labels and expose the values as 'Passage'
    return cleaned_passages.reset_index(drop=True).to_frame(name='Passage')

In [8]:
# Embedding model shared by the semantic chunker and the Pinecone upload.
# NOTE(review): no arguments are passed, so the `openai_api_key` read from
# OPENAI_KEY earlier is not used here — presumably the client picks up its
# credentials from the environment on its own; confirm which variable it reads.
embeddings = OpenAIEmbeddings()

In [28]:
# Accumulator for the Document objects built further down
documents = []
# Fetch the 'train' split of the RAG fine-tuning dataset and convert it to a
# DataFrame for preprocessing
rag_dataset = load_dataset("rachid16/rag_finetuning_data", split='train')
unpreprocessed_dataset = pd.DataFrame(rag_dataset)

# Preview the first rows of the raw data
unpreprocessed_dataset.head()

Unnamed: 0,question,context,answer
0,Sort these into breakfast or dinner foods: Waf...,,"The breakfast foods are waffles, pancakes and ..."
1,Do salicylates dilate blood vessels through in...,Compared with other non-steroid anti-inflammat...,Salicylates dilate blood vessels through inhib...
2,Do perioperative factors determine outcome aft...,There is evidence that postponing surgery in c...,In this cohort of critically ill patients oper...
3,Is rs219780 SNP of Claudin 14 Gene Related to ...,The CLDN14 gene encodes a protein involved in ...,rs219780 SNP of CLDN14 does not appear to be a...
4,Which NFL team has the most Super Bowl champio...,,The Pittsburgh Steelers and the New England Pa...


In [29]:
# Clean the raw contexts into a single 'Passage' column
cleaned_passages = preprocess(unpreprocessed_dataset)
# Shuffle all rows with a fixed seed so the order is reproducible
shuffled_passages = cleaned_passages.sample(frac=1, random_state=42)
# Keep only the first occurrence of each repeated passage
complete_dataset = shuffled_passages.drop_duplicates()
complete_dataset

Unnamed: 0,Passage
32741,
16062,investigate relationship tissue elasticity ant...
24781,Recent studies revealed microRNAs miRNAs invol...
22461,Purdue University football team traces origin ...
19831,Injuries runners common However many potential...
...,...
2433,elucidate retinal dysfunction molecular basis ...
16023,ability Yersinia pestis form biofilm important...
21962,investigate efficacy safety acarbose addon the...
11284,Ernest George Ernie Wilson 18 October 1900 7 J...


In [38]:
# Semantic chunker built on the shared embedding model. It is not used by the
# document construction below, which keeps each passage as a single document.
text_splitter = SemanticChunker(embeddings)

# Rebuild the list from scratch on every run: appending to a list created in
# another cell duplicates every passage whenever this cell is executed again.
# NOTE(review): passages that are empty after cleaning still become documents —
# consider filtering them out before uploading.
documents = [
    Document(page_content=str(passage))
    for passage in complete_dataset['Passage']
]

In [67]:
# Sanity check: how many documents are about to be sent to the vector store
len(documents)

18097

In [68]:
# Insert the documents into the Pinecone vector database as embeddings, using
# the index named by `pc_index`. Each document is a whole cleaned passage (no
# chunking is applied before this step).
# NOTE(review): the returned vector store is not assigned to a variable, and
# re-running this cell presumably uploads the same documents again — confirm
# before executing it more than once.
PineconeVectorStore.from_documents(documents, embeddings, index_name=pc_index)

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x2cd339ffdf0>

In [69]:
# Persist the cleaned, de-duplicated passages next to the other context data.
# The path is relative to the notebook's working directory, and the (shuffled)
# row index is written as the first CSV column because `index` is left at its
# default.
complete_dataset.to_csv('../../data/context/rachid16.csv')