In [53]:
# Standard library imports
import os
import re
import math
import json
from collections import Counter

# Third-party library imports
import numpy as np
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# LangChain imports
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore, Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings

# English stop words kept in a set; clean() tests each token for membership in it
stop_words = {*stopwords.words('english')}


In [54]:
# Process the .env file so the environment-variable lookups in the next cell
# can find the keys defined there. The call's return value is shown as the
# cell output.
load_dotenv()

True

In [55]:
# Read the API credentials and the Pinecone index name from environment
# variables; each name is bound to None when its variable is not set.
openai_api_key = os.environ.get('OPENAI_KEY')
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pc_index = os.environ.get('PINECONE_CONTEXT')

In [56]:
# strips punctuation/special characters, tokenizes, and drops stop words
def clean(data):
    """Return `data` as a space-joined string of its non-stop-word tokens.

    `data` is converted with str() first, so non-string values are accepted.
    Every character that is neither a word character nor whitespace is
    deleted outright (not replaced by a space), which means hyphenated or
    apostrophised words are fused together before tokenization. Tokens are
    compared against `stop_words` in lower case but are returned with their
    original casing.
    """
    stripped_text = re.sub(r'[^\w\s]', '', str(data))

    kept_words = []
    for token in word_tokenize(stripped_text):
        # skip stop words regardless of how they are capitalised
        if token.lower() in stop_words:
            continue
        kept_words.append(token)

    # hand back one string rather than a token list
    return " ".join(kept_words)

In [57]:
# general function for preprocessing data
def preprocess(data, cleaner=None):
    """Return a new single-column DataFrame holding the cleaned passages.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'passage' column; any other columns are ignored.
    cleaner : callable, optional
        Function applied to every passage value. Defaults to the module-level
        `clean` function.

    Returns
    -------
    pandas.DataFrame
        One column named 'Passage' with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If `data` has no 'passage' column.

    The result is built from the cleaned Series instead of assigning through
    `.loc` on a frame sliced out of `data`. That slice-then-assign pattern is
    what triggers pandas' SettingWithCopyWarning; building a new frame avoids
    it and leaves the caller's DataFrame untouched.
    """
    if cleaner is None:
        # looked up at call time so the function can be defined before `clean`
        cleaner = clean

    cleaned_passages = data['passage'].apply(cleaner)
    return cleaned_passages.reset_index(drop=True).to_frame(name='Passage')

In [58]:
# Embedding model later handed to the Pinecone vector store.
# NOTE(review): constructed with no arguments, so the `openai_api_key` value
# read from OPENAI_KEY above is not passed in; the client presumably finds its
# key in the environment under its own variable name — confirm.
embeddings = OpenAIEmbeddings()

In [59]:
# Container for the Document objects that a later cell fills in
documents = list()

# Fetch the 'passages' split of the text-corpus config and wrap it in a DataFrame
rag_dataset = load_dataset(
    path="rag-datasets/rag-mini-wikipedia",
    name="text-corpus",
    split='passages',
)
unpreprocessed_dataset = pd.DataFrame(data=rag_dataset)

# Peek at the first raw rows
unpreprocessed_dataset.head()

Unnamed: 0,passage,id
0,"Uruguay (official full name in ; pron. , Eas...",0
1,"It is bordered by Brazil to the north, by Arge...",1
2,Montevideo was founded by the Spanish in the e...,2
3,The economy is largely based in agriculture (m...,3
4,"According to Transparency International, Urugu...",4


In [60]:
# Clean every raw passage; the result keeps only a single 'Passage' column
complete_dataset = preprocess(unpreprocessed_dataset)
complete_dataset

Unnamed: 0,Passage
0,Uruguay official full name pron Eastern Republ...
1,bordered Brazil north Argentina across bank Ur...
2,Montevideo founded Spanish early 18th century ...
3,economy largely based agriculture making 10 GD...
4,According Transparency International Uruguay s...
...,...
3195,2007 duck Tallahassee Florida survived gunshot...
3196,rare genetic mutation sees ducks born four leg...
3197,Moche people ancient Peru worshipped nature Be...
3198,Angel Wing disease common ducks


In [61]:
# Build the whole list in one expression so that re-running this cell replaces
# `documents` instead of appending a second copy of every passage (which would
# then be embedded and inserted again). Each passage is prefixed with its
# 1-based position taken from the DataFrame index.
documents = [
    Document(page_content=f"{index + 1}. Passage: {passage}")
    for index, passage in complete_dataset['Passage'].items()
]

In [62]:
# Preview only the first few Documents; displaying the full list would write
# every passage into the notebook output.
documents[:5]

[Document(metadata={}, page_content='1. Passage: Uruguay official full name pron Eastern Republic Uruguay country located southeastern part South America home 33 million people 17 million live capital Montevideo metropolitan area'),
 Document(metadata={}, page_content='2. Passage: bordered Brazil north Argentina across bank Uruguay River west estuary RÃo de la Plata southwest South Atlantic Ocean southeast second smallest independent country South America larger Suriname French overseas department French Guiana'),
 Document(metadata={}, page_content='3. Passage: Montevideo founded Spanish early 18th century military stronghold Uruguay independence 1828 following threeway struggle Spain Argentina Brazil constitutional democracy president fulfills roles head state head government'),
 Document(metadata={}, page_content='4. Passage: economy largely based agriculture making 10 GDP substantial export statesector relies heavily world trade Consequently badly affected downturn global prices Ho

In [63]:
# Number of Document objects collected in `documents`
len(documents)

3200

In [64]:
# Embed every Document with `embeddings` and insert the vectors into the
# Pinecone index named by `pc_index`. The documents are whole cleaned
# passages; no text splitting happens in the cells shown here.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts the same passages a second time — confirm before re-executing.
PineconeVectorStore.from_documents(documents, embeddings, index_name=pc_index)

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1f449186d10>

In [65]:
# Save the cleaned passages to a CSV file under ../data.
# NOTE(review): `index` is left at its default, so the row index is presumably
# written as an extra unnamed leading column — confirm downstream readers
# expect it.
complete_dataset.to_csv('../data/context_data.csv')