In [1]:
# Standard library imports
import os
import re
import math
import json
from collections import Counter

# Third-party library imports
import numpy as np
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# LangChain imports
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore, Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings

# Setup stop words for NLP
stop_words = set(stopwords.words('english'))


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# process .env file
load_dotenv()

True

In [35]:
# Access the environment variables
openai_api_key = os.getenv('OPENAI_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc_index = os.getenv('PINECONE_QA')

In [26]:
# function for tokenization and special character and stopword removal
def clean(data):
    # regex removes punctuation and special characters
    no_punctuation_and_specials = re.sub(r'[^\w\s]', '', str(data))

    # tokenizing step
    tokens = word_tokenize(str(no_punctuation_and_specials))

    # stopword removal
    filtered_tokens = [str(word) for word in tokens if word.lower() not in stop_words]

    # returns tokenized text in sentence format
    return " ".join(filtered_tokens)

In [27]:
# general function for preprocessing data
def preprocess(data):
    # includes only needed columns
    data = data[['question', 'answer']]
    data.loc[:,'question'] = data['question'].apply(clean) # applies preprocessing function
    data.loc[:,'answer'] = data['answer'].apply(clean) # applies preprocessing function
    data = data.reset_index(drop=True)
    data.columns = ['Question', 'Answer'] # renames column names
    return data

In [28]:
embeddings = OpenAIEmbeddings()

In [29]:
documents = []
rag_dataset = load_dataset("rag-datasets/rag-mini-wikipedia", "question-answer", split='test')
unpreprocessed_dataset = pd.DataFrame(rag_dataset)

unpreprocessed_dataset.head()

Unnamed: 0,question,answer,id
0,Was Abraham Lincoln the sixteenth President of...,yes,0
1,Did Lincoln sign the National Banking Act of 1...,yes,2
2,Did his mother die of pneumonia?,no,4
3,How many long was Lincoln's formal education?,18 months,6
4,When did Lincoln begin his political career?,1832,8


In [30]:
complete_dataset = preprocess(unpreprocessed_dataset)
complete_dataset

Unnamed: 0,Question,Answer
0,Abraham Lincoln sixteenth President United States,yes
1,Lincoln sign National Banking Act 1863,yes
2,mother die pneumonia,
3,many long Lincolns formal education,18 months
4,Lincoln begin political career,1832
...,...,...
913,Wilson president American Political Science As...,Yes
914,cast ballot John Palmer presidential candidate...,Yes
915,Wilson spend 1914 beginning 1917 trying keep A...,Yes
916,Wilson staunch opponent antisemitism sympathet...,Yes


In [31]:
for index, row in complete_dataset.iterrows():
    documents.append(Document(
        page_content=f"{index + 1}. Question: {row['Question']} - Answer: {row['Answer']}"
    ))

In [32]:
documents

[Document(metadata={}, page_content='1. Question: Abraham Lincoln sixteenth President United States - Answer: yes'),
 Document(metadata={}, page_content='2. Question: Lincoln sign National Banking Act 1863 - Answer: yes'),
 Document(metadata={}, page_content='3. Question: mother die pneumonia - Answer: '),
 Document(metadata={}, page_content='4. Question: many long Lincolns formal education - Answer: 18 months'),
 Document(metadata={}, page_content='5. Question: Lincoln begin political career - Answer: 1832'),
 Document(metadata={}, page_content='6. Question: Legal Tender Act 1862 establish - Answer: United States Note first paper currency United States history'),
 Document(metadata={}, page_content='7. Question: suggested Lincoln grow beard - Answer: 11yearold Grace Bedell'),
 Document(metadata={}, page_content='8. Question: Gettysburg address argue America born - Answer: 1776'),
 Document(metadata={}, page_content='9. Question: Lincoln beat John C Breckinridge 1860 election - Answer:

In [33]:
len(documents)

918

In [36]:
# insert splits into Pinecone vector database as embeddings
PineconeVectorStore.from_documents(documents, embeddings, index_name=pc_index)

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x2846167c160>

In [None]:
complete_dataset.to_csv('../data/evaluation_data.csv')