In [1]:
# Standard library imports
import os
import re
import math
import json
from collections import Counter

# Third-party library imports
import numpy as np
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# LangChain imports
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore, Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings

# English stop-word lookup consulted by clean(); held in a set for fast membership tests
stop_words = {stop_word for stop_word in stopwords.words('english')}


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the key/value pairs from a .env file into the process environment so
# the os.getenv() lookups in the next cell can read them
load_dotenv()

True

In [3]:
# Read the OpenAI key, the Pinecone key and the Pinecone index name from the
# environment; each value is None when its variable is not set
openai_api_key, pinecone_api_key, pc_index = [
    os.environ.get(variable_name)
    for variable_name in ('OPENAI_KEY', 'PINECONE_API_KEY', 'PINECONE_CONTEXT')
]

In [4]:
# strips punctuation/special characters and stop words from a piece of text
def clean(data):
    """Return `data` as a space-joined string of its non-stop-word tokens.

    The input is coerced with str(), every character that is neither a word
    character nor whitespace is deleted (not replaced by a space), the rest is
    tokenized with word_tokenize, and tokens whose lowercase form appears in
    `stop_words` are discarded. Surviving tokens keep their original casing.
    """
    # delete punctuation and special characters
    text_only = re.sub(r'[^\w\s]', '', str(data))

    kept_tokens = []
    for token in word_tokenize(text_only):
        # stop-word comparison is case-insensitive
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)

    # hand the tokens back in sentence form
    return " ".join(kept_tokens)

In [5]:
# general function for preprocessing data
def preprocess(data):
    """Build the cleaned, single-column passage table from a raw dataset frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'context' column; all other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame with one column, 'Passage', holding clean() applied to
        every 'context' value, on a fresh 0..n-1 index. The input frame is
        not modified.
    """
    # .assign() produces a new frame instead of writing through `.loc` into the
    # result of `data[['context']]`, which is the chained-assignment pattern
    # pandas flags with SettingWithCopyWarning.
    return (
        data[['context']]                                            # keep only the needed column
        .assign(context=lambda frame: frame['context'].apply(clean)) # clean each passage
        .reset_index(drop=True)                                      # discard the incoming row labels
        .rename(columns={'context': 'Passage'})                      # final column name
    )

In [6]:
# Pass the key read from OPENAI_KEY to the embedding client so that setting is
# actually honoured. When OPENAI_KEY is not set, construct the client with no
# arguments, leaving it to its own default credential lookup
# (NOTE(review): per the langchain-openai docs that is OPENAI_API_KEY — confirm).
embeddings = (
    OpenAIEmbeddings(openai_api_key=openai_api_key)
    if openai_api_key
    else OpenAIEmbeddings()
)

In [7]:
# container for the LangChain Document objects built in a later cell
documents = []

# fetch both published splits of the dataset (train first, then test)
rag_dataset_train, rag_dataset_test = [
    load_dataset("chloedh0228/rag-dataset-12000", split=split_name)
    for split_name in ('train', 'test')
]

# stack the two splits into one frame; row labels from each split are kept as-is
unpreprocessed_dataset = pd.concat(
    [pd.DataFrame(split) for split in (rag_dataset_train, rag_dataset_test)]
)

unpreprocessed_dataset.head()

Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


Unnamed: 0,context,question,answer,extracted_sentences,logical_relationship
0,Caption: Tasmanian berry grower Nic Hansen sho...,What is the Berry Export Summary 2028 and what...,The Berry Export Summary 2028 is a dedicated e...,['THE RISE and rise of the Australian strawber...,Direct Matching Logic Chain
1,RWSN Collaborations\nSouthern Africa Self-supp...,What are some of the benefits reported from ha...,Benefits reported from having access to Self-s...,['Benefits reported from having access to Self...,Direct Matching Logic Chain
2,All Android applications categories\nDescripti...,What are the unique features of the Coolands f...,The unique features of the Coolands for Twitte...,"['The first unique feature is Real-Time.', 'Th...",Direct Matching Logic Chain
3,"How unequal is India? The question is simple, ...",What is the main difference between the Nation...,The main difference between the NSS and the IH...,"['For some 60 years, the only reliable informa...",Comparative Reasoning Logic Chain
4,Gunnar Nelson took his time on the feet agains...,How did Gunnar Nelson win the fight against Za...,Gunnar Nelson won the fight against Zak Cummin...,['Gunnar Nelson took his time on the feet agai...,Direct Matching Logic Chain


In [8]:
# Clean every passage, shuffle the rows with a fixed seed, then drop repeated passages
complete_dataset = (
    preprocess(unpreprocessed_dataset)
    .sample(frac=1, random_state=42)  # frac=1 keeps every row, in shuffled order
    .drop_duplicates()
)
complete_dataset

Unnamed: 0,Passage
1935,MAGAZINE NEWS NEW PRODUCTS TOPICS COLUMNS RESO...
6494,Sacred Drift Earth Pilgrim London looking beau...
1720,David De Gea reassured Spain role Vicente Del ...
9120,Elks Lodge Granite City Ill Gateway Heritage C...
360,youre paying good money wine able taste grapes...
...,...
11964,Diyarbakır Metropolitan Municipality Water Sew...
5191,Global Mapping SAC operating company globalmap...
5390,Latest Mathematical analysis Stories Turbogene...
860,November 3 2010 111 Wow closet looks amazing W...


In [9]:
# Build the list from scratch on every run: appending to the module-level list
# meant that re-executing this cell added a second copy of every passage.
# Iterating the column directly also avoids the per-row overhead of iterrows().
documents = [
    Document(page_content=str(passage))
    for passage in complete_dataset['Passage']
]

In [10]:
# Preview a few documents only; echoing the whole list writes every passage
# into the saved notebook output
documents[:3]

[Document(metadata={}, page_content='MAGAZINE NEWS NEW PRODUCTS TOPICS COLUMNS RESOURCES MULTIMEDIA EVENTS DIRECTORIES Ceramic Society Japan Announces Annual MeetingThe Annual Meeting Ceramic Society Japan CSJ held Ohkubo campus Waseda University Tokyo Japan March 2123 2001 scientific program cover structural ceramics electroceramics glass raw materials processing bioceramics materials analysis cements porcelain enamel whitewares education annual banquet Ceramographic Award Exhibition also part meeting information contact CSJ ask_wwwceramicorjp visit Nasdaq Opens Branch IndiaThe USbased Nasdaq stock market specializes technology stocks opened first branch India new Nasdaq office southern city Bangalore countrys information technology hub vice chairman Nasdaq Alfred Berkley said new branch would open window Indian companies access global funds fourth Nasdaq office outside US London Tokyo Brazil Source BBC World Service newsroom Tosoh Renames Nippon Silica Glass USAJapanbased Tosoh Corp 

In [11]:
# number of Document objects that the next cell sends to the vector store
len(documents)

12000

In [12]:
# Embed each Document with `embeddings` and insert the result into the Pinecone
# index named by `pc_index`. The documents are whole cleaned passages: no text
# splitting is performed in the cells above.
# NOTE(review): re-running this cell presumably inserts the passages again — confirm before re-executing.
PineconeVectorStore.from_documents(documents, embeddings, index_name=pc_index)

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1a23b8c6650>

In [13]:
# Save the cleaned, shuffled, de-duplicated passages to CSV.
# NOTE(review): no `index=` argument is passed, so the shuffled row labels are
# presumably written as an unnamed first column — confirm readers of this file expect that.
complete_dataset.to_csv('../../data/context/chloedh0228.csv')