In [1]:
# Standard library imports
import os
import re
import math
import json
from collections import Counter

# Third-party library imports
import numpy as np
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# LangChain imports
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore, Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings

# Setup stop words for NLP
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The NLTK stopwords corpus is not installed on this machine: fetch it once
    # and retry so the notebook still runs on a fresh environment.
    import nltk
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# process .env file (python-dotenv's load_dotenv); as the last expression of the
# cell, the call's return value is what the cell displays
load_dotenv()

True

In [3]:
# Read the service credentials and the Pinecone index name from the environment.
# Each lookup yields None when the variable is not set.
# NOTE(review): only `pc_index` is referenced by the later cells visible here —
# confirm whether the two key variables are still needed.
openai_api_key = os.environ.get('OPENAI_KEY')
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pc_index = os.environ.get('PINECONE_QA')

In [4]:
# function for tokenization and special character and stopword removal
def clean(data):
    """Strip punctuation and English stop words from a piece of text.

    Args:
        data: Value to clean; it is converted with ``str()`` first, so
            non-string inputs are accepted.

    Returns:
        str: The remaining tokens joined by single spaces, original casing kept.

    Note:
        Punctuation is deleted rather than replaced by whitespace, so the
        characters on either side of it are fused ("10,1981" -> "101981").
    """
    # drop every character that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', str(data))

    kept = []
    for token in word_tokenize(text):
        # stop-word matching is case-insensitive; the token itself is kept as written
        if token.lower() in stop_words:
            continue
        kept.append(str(token))

    # hand the tokens back in sentence form
    return " ".join(kept)

In [5]:
# general function for preprocessing data
def preprocess(data):
    """Build the cleaned Question/Answer frame from the raw dataset frame.

    Args:
        data: DataFrame holding at least the 'instruction' and 'response'
            columns; any other columns are dropped. The input is not modified.

    Returns:
        pandas.DataFrame: Two columns, 'Question' and 'Answer', each value
        passed through ``clean``, with a fresh 0..n-1 index.

    Raises:
        KeyError: If 'instruction' or 'response' is missing from ``data``.
    """
    # Work on an explicit copy: assigning into a bare column subset triggers
    # SettingWithCopyWarning and leaves it unclear which object is written to.
    subset = data[['instruction', 'response']].copy()
    subset['instruction'] = subset['instruction'].apply(clean)  # applies preprocessing function
    subset['response'] = subset['response'].apply(clean)  # applies preprocessing function
    subset = subset.reset_index(drop=True)
    subset.columns = ['Question', 'Answer']  # renames column names
    return subset

In [6]:
# Embedding client used when the documents are inserted into Pinecone below.
# NOTE(review): no key is passed explicitly and `openai_api_key` read earlier is
# not used here — presumably credentials are picked up from the environment; confirm.
embeddings = OpenAIEmbeddings()

In [7]:
# Accumulator for the Document objects that later cells append to
documents = []
# Load the 'train' split of the dataset and wrap it in a DataFrame
rag_dataset= load_dataset("lingjoor/databricks-dolly-15k-context-3k-rag", split='train')
unpreprocessed_dataset = pd.DataFrame(rag_dataset)

# Preview the first rows of the raw data
unpreprocessed_dataset.head()

Unnamed: 0,instruction,context,response,category
0,When did Virgin Australia start operating?,"Virgin Australia Virgin Australia, the trading...",Virgin Australia commenced services on 31 Augu...,closed_qa
1,Which is a species of fish? Tope or Rope,Elops saurus The ladyfish or tenpounder (Elops...,Tope,classification
2,Why can camels survive for long without water?,Camel Most camels surviving today are domestic...,Camels use the fat in their humps to keep them...,open_qa
3,"Alice's parents have three daughters: Amy, Jes...",Villikins and his Dinah In Alice's Adventures ...,The name of the third daughter is Alice,open_qa
4,When was Tomoaki Komorida born?,Tomoaki Komorida Komorida was born in Kumamoto...,"Tomoaki Komorida was born on July 10,1981.",closed_qa


In [8]:
# Reduce the raw frame to cleaned Question/Answer columns, then display the result
preprocessed_df = preprocess(unpreprocessed_dataset)
preprocessed_df

Unnamed: 0,Question,Answer
0,Virgin Australia start operating,Virgin Australia commenced services 31 August ...
1,species fish Tope Rope,Tope
2,camels survive long without water,Camels use fat humps keep filled energy hydrat...
3,Alices parents three daughters Amy Jessy whats...,name third daughter Alice
4,Tomoaki Komorida born,Tomoaki Komorida born July 101981
...,...,...
15006,accept change,Embrace change see difference
15007,laser created,laser device emits light electromagnetic radia...
15008,difference road bike mountain bike,Road bikes built ridden asphalt cement surface...
15009,GIS help real estate investment industry,Real estate investors depend precise accurate ...


In [9]:
# Summarize the cleaned frame: row count, per-column non-null counts and dtypes
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15011 entries, 0 to 15010
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  15011 non-null  object
 1   Answer    15011 non-null  object
dtypes: object(2)
memory usage: 234.7+ KB


In [10]:
# Shuffle the full frame once with a fixed seed, then cut both splits out of the
# same permutation: the first 10,000 rows for training and the next 1,000 for
# evaluation. Slicing one shuffle makes it explicit that the splits cannot overlap.
shuffled_df = preprocessed_df.sample(frac=1, random_state=42)
training_dataset = shuffled_df.iloc[:10000]
testing_dataset = shuffled_df.iloc[10000:11000]

In [11]:
# Display the training split (rows keep their index labels from before the shuffle)
training_dataset

Unnamed: 0,Question,Answer
2957,best selling albums time Give list name artist...,Best selling albums time Michael Jackson Thril...
11845,Given paragraph magnesium melting point magnes...,melting point magnesium 650 C
9230,pick best bananas,depends want eat want eat immediately look one...
930,Extract teams Bob Sanders played throughout ca...,Iowa Hawkeyes Indianapolis Colts San Diego Cha...
7671,27th president United States America,William Howard Taft 27th president United Stat...
...,...,...
1505,many miles per year average person drive,13500
10365,Given paragraph Subaru Outback year Wilderness...,2022
670,Tell story sentences food short words make som...,Driving past Farmers Market Café Al Suffolk En...
9471,Tell eight planets solar system moons Classify...,Moon Mercury Venus One Moon Earth Mars Multipl...


In [12]:
# Turn every training row into a Document whose text carries both the question
# and its answer, and add them to the shared `documents` list.
documents.extend(
    Document(page_content=f"Question: {question} - Answer: {answer}")
    for question, answer in zip(training_dataset['Question'], training_dataset['Answer'])
)

In [13]:
# Display the evaluation split (rows keep their index labels from before the shuffle)
testing_dataset

Unnamed: 0,Question,Answer
897,Write response veteran interested renting house,Hello Thank inquiry nice meet Wed love keep ho...
3306,evergreen garden,order evergreen garden crucial buy right plant...
8026,select tennis racket Im completely new sport,youre beginner tennis player pick racket help ...
6239,country highest life expectancy,Life expectancy humans doubled last century th...
2890,could Sunday,people work Sundays free pursue leisure activi...
...,...,...
1137,Tom Billeter,Tom Billeter born February 12 1961 American co...
2767,pros cons charging Tesla home,convenient way charge EV charge home overnight...
8182,reasons people like visit Brazil,Brazil South American country diverse landscap...
14617,species fish Tetra Quart,Tetra


In [14]:
# Turn every evaluation row into a Document with the same "Question - Answer"
# text layout and add them to the shared `documents` list.
documents.extend(
    Document(page_content=f"Question: {question} - Answer: {answer}")
    for question, answer in zip(testing_dataset['Question'], testing_dataset['Answer'])
)

In [15]:
# Preview only the first few documents; echoing the whole list floods the
# notebook output and bloats the saved file.
documents[:5]

[Document(metadata={}, page_content='Question: best selling albums time Give list name artist name album - Answer: Best selling albums time Michael Jackson Thriller Eagles Greatest Hits 19711975 Pink Floyd Dark Side Moon Whitney Houston various artists Bodyguard Bee Gees Various artists Saturday Night Fever Fleetwood Mac Rumours Eagles Hotel California Shania Twain Come Meat Loaf Bat Hell ACDC Back Black'),
 Document(metadata={}, page_content='Question: Given paragraph magnesium melting point magnesium celsius - Answer: melting point magnesium 650 C'),
 Document(metadata={}, page_content='Question: pick best bananas - Answer: depends want eat want eat immediately look ones almost entirely yellow waiting days 1 inch less green top expecting last week select ones half green'),
 Document(metadata={}, page_content='Question: Extract teams Bob Sanders played throughout career put commaseparated list - Answer: Iowa Hawkeyes Indianapolis Colts San Diego Chargers'),
 Document(metadata={}, page

In [16]:
# insert every entry of `documents` into the Pinecone index named by `pc_index` as embeddings
# NOTE(review): at this point `documents` holds both the training and the testing
# rows — confirm that indexing the evaluation rows is intended.
# NOTE(review): presumably re-running this cell inserts the same documents again —
# verify before re-executing.
PineconeVectorStore.from_documents(documents, embeddings, index_name=pc_index)

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x246dff78e80>

In [9]:
# Persist both splits as CSV; the row index is written as the first, unnamed column.
# The target folder is created first so the export does not fail on a checkout
# where the data directories do not exist yet.
for split_df, csv_path in (
    (training_dataset, '../../data/training/lingjoor_training_data.csv'),
    (testing_dataset, '../../data/validation/lingjoor_evaluation_data.csv'),
):
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    split_df.to_csv(csv_path)