In [1]:
# Standard library imports
import os
import re
import math
import json
from collections import Counter

# Third-party library imports
import numpy as np
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# LangChain imports
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore, Pinecone
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import OpenAIEmbeddings

# English stopwords held as a set; clean() tests each lowercased token against it
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Process the .env file so the os.getenv() lookups in the next cell can see its
# variables. The `True` shown as cell output is load_dotenv()'s return value.
load_dotenv()

True

In [3]:
# Read the service credentials and the Pinecone index name from the environment.
# Each name is bound to None when its variable is not set.
openai_api_key = os.environ.get('OPENAI_KEY')
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pc_index = os.environ.get('PINECONE_QA')

In [4]:
# Text normaliser: strips punctuation, tokenizes, and drops English stopwords
def clean(data):
    """Return `data` as a space-joined string of its non-stopword tokens.

    Steps, in order:
      1. Coerce the input with str() and delete every character that is
         neither a word character nor whitespace.
      2. Tokenize the remaining text with word_tokenize.
      3. Drop tokens whose lowercase form is in the module-level `stop_words`
         set; surviving tokens keep their original casing.

    Because punctuation is removed before the stopword check, apostrophes and
    hyphens vanish first (e.g. "What's" is compared as "Whats").
    """
    stripped_text = re.sub(r'[^\w\s]', '', str(data))

    kept_tokens = []
    for token in word_tokenize(stripped_text):
        # Stopword comparison is case-insensitive; the token itself is untouched
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)

    return " ".join(kept_tokens)

In [5]:
# General preprocessing: keep the Q&A columns, clean their text, rename them
def preprocess(data, cleaner=None):
    """Return a new frame with cleaned 'Question' and 'Answer' columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'question' and 'answer' columns; any other columns are
        dropped. The input frame is not modified.
    cleaner : callable, optional
        Function applied to every cell of both columns. Defaults to the
        notebook's `clean` function, looked up when `preprocess` is called.

    Returns
    -------
    pandas.DataFrame
        Columns ['Question', 'Answer'] with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If 'question' or 'answer' is missing from `data`.
    """
    if cleaner is None:
        cleaner = clean

    # assign() builds new columns on a new frame instead of writing through
    # .loc into a column subset, which is the chained-assignment pattern that
    # raises SettingWithCopyWarning.
    return (
        data[['question', 'answer']]
        .assign(
            question=lambda frame: frame['question'].apply(cleaner),
            answer=lambda frame: frame['answer'].apply(cleaner),
        )
        .reset_index(drop=True)
        .rename(columns={'question': 'Question', 'answer': 'Answer'})
    )

In [6]:
# Use the key that was read from OPENAI_KEY in the environment cell, so the
# embedding client does not silently depend on a differently named variable.
# When OPENAI_KEY is unset (None/empty), keep the original behaviour and let
# OpenAIEmbeddings resolve its credentials on its own.
embeddings = (
    OpenAIEmbeddings(openai_api_key=openai_api_key)
    if openai_api_key
    else OpenAIEmbeddings()
)

In [7]:
# Accumulator for the LangChain Document objects appended by later cells
documents = []
# Load the 'train' split of the rachid16/rag_finetuning_data dataset
rag_dataset = load_dataset("rachid16/rag_finetuning_data", split='train')
# Convert to a DataFrame so the pandas-based preprocess() can be applied
unpreprocessed_dataset = pd.DataFrame(rag_dataset)

# Preview the raw rows (columns: question, context, answer)
unpreprocessed_dataset.head()

Unnamed: 0,question,context,answer
0,Sort these into breakfast or dinner foods: Waf...,,"The breakfast foods are waffles, pancakes and ..."
1,Do salicylates dilate blood vessels through in...,Compared with other non-steroid anti-inflammat...,Salicylates dilate blood vessels through inhib...
2,Do perioperative factors determine outcome aft...,There is evidence that postponing surgery in c...,In this cohort of critically ill patients oper...
3,Is rs219780 SNP of Claudin 14 Gene Related to ...,The CLDN14 gene encodes a protein involved in ...,rs219780 SNP of CLDN14 does not appear to be a...
4,Which NFL team has the most Super Bowl champio...,,The Pittsburgh Steelers and the New England Pa...


In [8]:
# Clean the question/answer text; preprocess() keeps only those two columns,
# so the raw 'context' column is not carried forward.
preprocessed_df = preprocess(unpreprocessed_dataset)
preprocessed_df

Unnamed: 0,Question,Answer
0,Sort breakfast dinner foods Waffles pancakes t...,breakfast foods waffles pancakes bacon eggs di...
1,salicylates dilate blood vessels inhibiting PY...,Salicylates dilate blood vessels inhibiting PY...
2,perioperative factors determine outcome surger...,cohort critically ill patients operated SAP tr...
3,rs219780 SNP Claudin 14 Gene Related Clinical ...,rs219780 SNP CLDN14 appear risk factor develop...
4,NFL team Super Bowl championship wins,Pittsburgh Steelers New England Patriots tied ...
...,...,...
43517,linoleateenriched cheese product reduce lowden...,linoleateenriched cheese product absence chang...
43518,prohibitin overexpressed Huh7HCV Huh75HCV cell...,expression prohibitin relatively high Huh7HCV ...
43519,Name members band Phish,band Phish started Vermont 1980s current membe...
43520,exploration dome Esophageal ultrasound ultraso...,Operators comfortable endobronchial ultrasound...


In [9]:
# Check the row count, dtypes, and non-null counts of the cleaned frame
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43522 entries, 0 to 43521
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  43522 non-null  object
 1   Answer    43522 non-null  object
dtypes: object(2)
memory usage: 680.2+ KB


In [10]:
# Shuffle once with a fixed seed and carve both splits from that single
# permutation. Previously the shuffle was computed twice with a duplicated
# seed literal; the slices were only disjoint as long as both literals stayed
# identical. Row selection is unchanged: the first TRAIN_SIZE shuffled rows
# train, the next TEST_SIZE rows test.
SPLIT_SEED = 42
TRAIN_SIZE = 10000
TEST_SIZE = 1000

shuffled_df = preprocessed_df.sample(frac=1, random_state=SPLIT_SEED)
training_dataset = shuffled_df.iloc[:TRAIN_SIZE]
testing_dataset = shuffled_df.iloc[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]

In [11]:
# Preview the training split; the index labels are row positions from preprocessed_df
training_dataset

Unnamed: 0,Question,Answer
32741,insider trading exactly,CEO public company often buy sell stock compan...
8091,Executed Limit Order Imply Spot Price,Think limit orders waiting line first organize...
6224,better rent better buy certain property market,magic answers Housing market conditions local ...
3095,wait days sell ESPP Stock,depends program run company runs program treas...
37241,Please write travel plan Los Angeles,LA second largest city America also one famous...
...,...,...
32445,people like Electronic Dance Music,Electronic Dance Music also known EDM popular ...
32138,corneal stroma endowed significant number resi...,study demonstrates addition known Langerhans c...
26889,periodic rollovers lowperfoming 401k IRA,twocents read plan document Summary Plan Descr...
40713,Write review Brandon Sandersons book Warbreaker,Cant believe consistent hitter Sanderson chara...


In [12]:
# Build the document list from the training split, one Document per Q&A row.
# A fresh list is assigned (instead of appending to the existing one) so that
# re-running this cell does not duplicate documents. Run the cells top to
# bottom: the testing-split cell below appends to this list afterwards.
# Iterating the two columns with zip avoids creating a Series per row, as
# iterrows() does.
documents = [
    Document(page_content=f"Question: {question} - Answer: {answer}")
    for question, answer in zip(training_dataset['Question'], training_dataset['Answer'])
]

In [13]:
# Preview the testing split; the index labels are row positions from preprocessed_df
testing_dataset

Unnamed: 0,Question,Answer
29144,Classify following beverages typically served ...,Carbonated soda beer Noncarbonated milk orange...
6666,prove savings without giving account number,Giving bank account number generally security ...
4262,3 apples Jack 4 pears Jill 5 books ate 5 Jacks...,Jack 4 pears beginning less 5 cant really ate ...
8969,Extract list names characters player control L...,Last Us player control Joel Ellie escorted Joe...
35475,like live Miami someone moving California,First youll notice heat Miami hot humid year r...
...,...,...
2834,sky blue,sky appears blue way sunlight scatters atmosph...
11192,preventive health checkup claimed separate exp...,Deduction Health Checkup allowed Section 80D a...
35204,plasma sitosterol elevations associated increa...,Elevations sitosterol concentrations sitostero...
31542,Whats easiest way make friends traveling alone,Traveling alone anywhere world intimidating ca...


In [14]:
# Append the testing split to the shared document list, one Document per Q&A row
documents.extend(
    Document(page_content=f"Question: {question} - Answer: {answer}")
    for question, answer in zip(testing_dataset['Question'], testing_dataset['Answer'])
)

In [15]:
# Show the total and a short sample only; echoing the whole list would write
# the text of every document into the notebook output.
print(f"{len(documents)} documents")
documents[:3]

 Document(metadata={}, page_content='Question: Executed Limit Order Imply Spot Price - Answer: Think limit orders waiting line first organized price time order placed earlier orders closer front line order buy order trade must limit orders 1001 higher sellers order would matched instead order filled price 1000 even millisecond trade 1000 even though price might go right back trade'),
 Document(metadata={}, page_content='Question: better rent better buy certain property market - Answer: magic answers Housing market conditions local market vary think impact cash flow best way evaluate housing prices general consider cheap home cost 20 less income affordable 2030 affordable 30 start comparing rent vs buy factors need think Renting easy transaction Youre comparing prices market usually pretty stable risk liability low cost low risk virtually prospects recouping value cash laying home Buying complex Youre buying house building equity probably making money due appreciation need vigilant expe

In [16]:
# Insert the documents into the Pinecone index named by pc_index, using
# `embeddings` as the embedding model.
# NOTE(review): each document is a whole "Question: ... - Answer: ..." string;
# no text splitter is applied in this notebook, so these are not "splits".
# The returned PineconeVectorStore is only echoed as cell output, not kept.
PineconeVectorStore.from_documents(documents, embeddings, index_name=pc_index)

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1d5c4b842e0>

In [18]:
# Persist both splits as CSV. to_csv() raises when the target directory is
# missing, so each parent directory is created first (no-op when it exists).
# The DataFrame index is written as the first CSV column, as before.
training_csv_path = '../../data/training/rachid16_training_data.csv'
evaluation_csv_path = '../../data/validation/rachid16_evaluation_data.csv'

for split_frame, csv_path in (
    (training_dataset, training_csv_path),
    (testing_dataset, evaluation_csv_path),
):
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    split_frame.to_csv(csv_path)