In [1]:
# Standard library imports
import os
import re
import math
import json
from collections import Counter

# Third-party library imports
import numpy as np
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# LangChain imports
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore, Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings

# Build the English stop-word set once; clean() tests each token against it
# NOTE(review): presumably needs the NLTK 'stopwords' corpus available locally — confirm
stop_words = set(stopwords.words('english'))


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the .env file so the environment lookups further down can see its values
load_dotenv()

True

In [3]:
# Credentials and the target index name come from the environment;
# each value is None when its variable is not set
openai_api_key = os.environ.get('OPENAI_KEY')
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pc_index = os.environ.get('PINECONE_CONTEXT')

In [4]:
# strips punctuation, tokenizes, and drops stop words from a single value
def clean(data):
    """Return `data` as a space-joined string of its non-stop-word tokens.

    The value is converted with str(), every character that is neither a
    word character nor whitespace is deleted, the remainder is tokenized
    with word_tokenize, and tokens whose lowercase form is in `stop_words`
    are discarded. Original casing of the surviving tokens is kept.

    NOTE(review): punctuation is deleted rather than replaced by a space,
    so hyphen- or slash-joined words are fused ("self-titled" becomes
    "selftitled").
    """
    # delete punctuation / special characters before tokenizing
    stripped = re.sub(r'[^\w\s]', '', str(data))

    kept = []
    for token in word_tokenize(stripped):
        # stop-word comparison is case-insensitive
        if token.lower() in stop_words:
            continue
        kept.append(str(token))

    # back to a single sentence-like string
    return " ".join(kept)

In [5]:
# general function for preprocessing data
def preprocess(data):
    """Return a one-column DataFrame of cleaned passages.

    Parameters
    ----------
    data : DataFrame with a 'context' column.

    Returns
    -------
    A new DataFrame with a fresh 0..n-1 index and a single 'Passage' column
    holding clean() applied to each 'context' value. `data` itself is not
    modified.
    """
    # Work on an explicit copy: assigning into the result of data[['context']]
    # directly is the pattern that raises pandas' SettingWithCopyWarning.
    passages = data[['context']].copy()
    passages['context'] = passages['context'].apply(clean)  # applies preprocessing function
    passages = passages.reset_index(drop=True)
    passages.columns = ['Passage']  # renames the column
    return passages

In [6]:
# Hand the key read from OPENAI_KEY to the client explicitly: a bare
# OpenAIEmbeddings() only looks for OPENAI_API_KEY, so the value loaded
# earlier would otherwise go unused. When OPENAI_KEY is not set, fall back
# to the library's own environment lookup.
embeddings = (
    OpenAIEmbeddings(openai_api_key=openai_api_key)
    if openai_api_key
    else OpenAIEmbeddings()
)

In [7]:
# Empty container; it is populated with Document objects in a later cell
documents = list()

# Load the train split of the RAG dataset and hold it as a DataFrame
rag_dataset = load_dataset(
    "lingjoor/databricks-dolly-15k-context-3k-rag",
    split='train',
)
unpreprocessed_dataset = pd.DataFrame(rag_dataset)

# Preview the first rows
unpreprocessed_dataset.head()

Unnamed: 0,instruction,context,response,category
0,When did Virgin Australia start operating?,"Virgin Australia Virgin Australia, the trading...",Virgin Australia commenced services on 31 Augu...,closed_qa
1,Which is a species of fish? Tope or Rope,Elops saurus The ladyfish or tenpounder (Elops...,Tope,classification
2,Why can camels survive for long without water?,Camel Most camels surviving today are domestic...,Camels use the fat in their humps to keep them...,open_qa
3,"Alice's parents have three daughters: Amy, Jes...",Villikins and his Dinah In Alice's Adventures ...,The name of the third daughter is Alice,open_qa
4,When was Tomoaki Komorida born?,Tomoaki Komorida Komorida was born in Kumamoto...,"Tomoaki Komorida was born on July 10,1981.",closed_qa


In [8]:
# Clean the passages, shuffle all rows with a fixed seed, then drop exact duplicates
complete_dataset = (
    preprocess(unpreprocessed_dataset)
    .sample(frac=1, random_state=42)
    .drop_duplicates()
)
complete_dataset

Unnamed: 0,Passage
2957,Jolina Magdangal discography selftitled album ...
11845,Magnesium Elemental magnesium graywhite lightw...
9230,Bad Bananas Bad Bananas Carbon Footprint Every...
930,Bob Sanders Demond Bob Sanders born February 2...
7671,Pauline Wayne Pauline Wayne Holstein cow belon...
...,...
5191,Presidency Joe Biden Biden altered cabinet str...
13418,Pickleball Pickleball indoor outdoor racketpad...
5390,Henri Barki Small Group Brainstorming Idea Qua...
860,American Alpine Journal American Alpine Journa...


In [9]:
# Rebuild the list from scratch on every run, so re-executing this cell does
# not append a second copy of every passage to an already-populated list.
# Iterating the column directly also avoids the per-row overhead of iterrows().
documents = [
    Document(page_content=f"{passage}")
    for passage in complete_dataset['Passage']
]

In [10]:
# Show only the first few documents; displaying the full list writes every
# passage into the notebook output
documents[:3]

[Document(metadata={}, page_content='Jolina Magdangal discography selftitled album Jolina allrevival album Memory Lane listed among best selling albums time Philippines Al Coury Coury released best selling albums time soundtracks Saturday Night Fever Flashdance albums Pink Floyds Dark Side Moon Guns N Roses Appetite Destruction earned title Vince Lombardi record business Bintang di Surga Bintang di Surga generally considered one best selling albums time Indonesia 3 million copies sold Al Coury 1973 instrumental release Pink Floyds Dark Side Moon became one best selling albums time one persuaded Pink Floyd take song Money single Money became bands first hit United States Definitely Maybe 2008 poll conducted Q HMV greatest British albums time Definitely Maybe placed 1 Rolling Stone ranked album 217 2020 list 500 Greatest Albums Time 78 2011 list 100 Best Albums Nineties well 42 2013 list 100 Best Debut Albums Time German edition Rolling Stone ranked album 156 list 500 Greatest Albums Tim

In [11]:
# Sanity check: how many Document objects were built from the passages
len(documents)

14919

In [12]:
# Embed each document (whole passages, no chunking is done in this notebook)
# and store the vectors in the Pinecone index named by pc_index
PineconeVectorStore.from_documents(
    documents=documents,
    embedding=embeddings,
    index_name=pc_index,
)

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x2bfc01a4880>

In [13]:
# Save the shuffled, de-duplicated passages to CSV; index=False is not passed,
# so the DataFrame index is written as the first column
complete_dataset.to_csv('../../data/context/lingjoor.csv')