In [1]:
# Standard library imports
import os
import re
import math
import json
from collections import Counter

# Third-party library imports
import numpy as np
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# LangChain imports
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore, Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings

# Setup stop words for NLP
stop_words = set(stopwords.words('english'))


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# process .env file
load_dotenv()

True

In [3]:
# Access the environment variables
openai_api_key = os.getenv('OPENAI_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc_index = os.getenv('PINECONE_QA')

In [4]:
# function for tokenization and special character and stopword removal
def clean(data):
    # regex removes punctuation and special characters
    no_punctuation_and_specials = re.sub(r'[^\w\s]', '', str(data))

    # tokenizing step
    tokens = word_tokenize(str(no_punctuation_and_specials))

    # stopword removal
    filtered_tokens = [str(word) for word in tokens if word.lower() not in stop_words]

    # returns tokenized text in sentence format
    return " ".join(filtered_tokens)

In [5]:
# general function for preprocessing data
def preprocess(data):
    # includes only needed columns
    data = data[['question', 'answer']]
    data.loc[:,'question'] = data['question'].apply(clean) # applies preprocessing function
    data.loc[:,'answer'] = data['answer'].apply(clean) # applies preprocessing function
    data = data.reset_index(drop=True)
    data.columns = ['Question', 'Answer'] # renames column names
    return data

In [6]:
embeddings = OpenAIEmbeddings()

In [7]:
documents = []
rag_dataset_train = load_dataset("chloedh0228/rag-dataset-12000", split='train')
rag_dataset_test = load_dataset("chloedh0228/rag-dataset-12000", split='test')
unpreprocessed_dataset = pd.concat([pd.DataFrame(rag_dataset_train), pd.DataFrame(rag_dataset_test)])

unpreprocessed_dataset.head()

Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


Unnamed: 0,context,question,answer,extracted_sentences,logical_relationship
0,Caption: Tasmanian berry grower Nic Hansen sho...,What is the Berry Export Summary 2028 and what...,The Berry Export Summary 2028 is a dedicated e...,['THE RISE and rise of the Australian strawber...,Direct Matching Logic Chain
1,RWSN Collaborations\nSouthern Africa Self-supp...,What are some of the benefits reported from ha...,Benefits reported from having access to Self-s...,['Benefits reported from having access to Self...,Direct Matching Logic Chain
2,All Android applications categories\nDescripti...,What are the unique features of the Coolands f...,The unique features of the Coolands for Twitte...,"['The first unique feature is Real-Time.', 'Th...",Direct Matching Logic Chain
3,"How unequal is India? The question is simple, ...",What is the main difference between the Nation...,The main difference between the NSS and the IH...,"['For some 60 years, the only reliable informa...",Comparative Reasoning Logic Chain
4,Gunnar Nelson took his time on the feet agains...,How did Gunnar Nelson win the fight against Za...,Gunnar Nelson won the fight against Zak Cummin...,['Gunnar Nelson took his time on the feet agai...,Direct Matching Logic Chain


In [8]:
preprocessed_df = preprocess(unpreprocessed_dataset)
preprocessed_df

Unnamed: 0,Question,Answer
0,Berry Export Summary 2028 purpose,Berry Export Summary 2028 dedicated export pla...
1,benefits reported access Selfsupply water sources,Benefits reported access Selfsupply water sour...
2,unique features Coolands Twitter app,unique features Coolands Twitter app include R...
3,main difference National Sample Survey NSS Ind...,main difference NSS IHDS terms measuring India...
4,Gunnar Nelson win fight Zak Cummings UFC Fight...,Gunnar Nelson fight Zak Cummings UFC Fight Nig...
...,...,...
11995,achievements Fuzzy Zoeller field golf,Fuzzy Zoeller known golfing success winning te...
11996,Malin Nilsson marry 2 June 2018,Malin Nilsson got married partner German Steff...
11997,Fellowship Law Librarianship offered Cracchiol...,Fellowship Law Librarianship program lawyers s...
11998,second physical eMAG store opened,second physical eMAG store opened Mammut Shopp...


In [9]:
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  12000 non-null  object
 1   Answer    12000 non-null  object
dtypes: object(2)
memory usage: 187.6+ KB


In [10]:
training_dataset = preprocessed_df.sample(frac=1, random_state=42).iloc[:10000]
testing_dataset = preprocessed_df.sample(frac=1, random_state=42).iloc[10000:11000]

In [11]:
training_dataset

Unnamed: 0,Question,Answer
1935,Annual Meeting Ceramic Society Japan held,Annual Meeting Ceramic Society Japan held Ohku...
6494,title Satish Kumars new book mentioned context,title Satish Kumars new book mentioned context...
1720,three goalkeepers Vicente Del Bosque decided r...,three goalkeepers Vicente Del Bosque decided r...
9120,Gateway Heritage Championship Elks Lodge Grani...,Gary Jay Gateway Heritage Championship
360,cohosted private dinner La Dolce Vita Rich Frank,Al Uzielli Dave Simmer cohosted event Rich Frank
...,...,...
7848,benefits Australian agribusinesses participati...,benefits include gaining comprehensive underst...
5704,director movie Thikka,movie Thikka directed Sunil Reddy
11567,primary targets mysterious social media campai...,Kamala Harris Elizabeth Warren Bernie Sanders ...
8133,tips mentioned context help feeling overwhelmed,tips mentioned context include breathing exerc...


In [12]:
for index, row in training_dataset.iterrows():
    documents.append(Document(
        page_content=f"Question: {row['Question']} - Answer: {row['Answer']}"
    ))

In [13]:
testing_dataset

Unnamed: 0,Question,Answer
5105,projected growth rate global economy 2014 acco...,projected growth rate global economy 2014 acco...
10666,terminated relationships Paula Deen scandalous...,Food Network Smithfield terminated relationshi...
3291,changes Macaire King propose education system ...,Macaire King help local Sen Greg Steube propos...
8125,authors perspective aging time,author views aging time constant inevitable pr...
9519,educational qualifications Father Michael Ramos,Father Michael Ramos holds doctorate education...
...,...,...
7489,award player receive sportsmanship 201314,player received 2014 ITACissie Leary Award Spo...
5,features Fabiana Filippis shirts blouses,Fabiana Filippis shirts blouses easily matched...
5342,qualifications required Director Care Center p...,qualifications required Director Care Center p...
9111,third rider joining Pedercini team Phillip Island,third rider joining Pedercini team Phillip Isl...


In [14]:
for index, row in testing_dataset.iterrows():
    documents.append(Document(
        page_content=f"Question: {row['Question']} - Answer: {row['Answer']}"
    ))

In [15]:
documents

[Document(metadata={}, page_content='Question: Annual Meeting Ceramic Society Japan held - Answer: Annual Meeting Ceramic Society Japan held Ohkubo campus Waseda University Tokyo Japan March 2123 2001'),
 Document(metadata={}, page_content='Question: title Satish Kumars new book mentioned context - Answer: title Satish Kumars new book mentioned context Earth Pilgrim'),
 Document(metadata={}, page_content='Question: three goalkeepers Vicente Del Bosque decided retain Spanish national team - Answer: three goalkeepers Vicente Del Bosque decided retain Spanish national team Pepe Reina Victor Valdes Iker Casillas'),
 Document(metadata={}, page_content='Question: Gateway Heritage Championship Elks Lodge Granite City Ill - Answer: Gary Jay Gateway Heritage Championship'),
 Document(metadata={}, page_content='Question: cohosted private dinner La Dolce Vita Rich Frank - Answer: Al Uzielli Dave Simmer cohosted event Rich Frank'),
 Document(metadata={}, page_content='Question: preferred qualifica

In [16]:
# insert splits into Pinecone vector database as embeddings
PineconeVectorStore.from_documents(documents, embeddings, index_name=pc_index)

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x20bdb29eb60>

In [14]:
training_dataset.to_csv('../../data/training/chloedh0228_training_data.csv')
testing_dataset.to_csv('../../data/validation/chloedh0228_evaluation_data.csv')