#### This code reduces the token count by removing, for each row, the columns that have null values. The LangChain CSV loader still prints every column even when it is empty, which also affects the similarity search results.

In [1]:
import pandas as pd
# Load the source CSV into a DataFrame; the path is relative to the notebook's working directory.
data=pd.read_csv('decoded_sacom.csv')

In [2]:
# Summarise column names, non-null counts and dtypes to see which columns are sparsely populated.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14378 entries, 0 to 14377
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Org ID               14378 non-null  int64  
 1   Org Name             14378 non-null  object 
 2   AKA                  4861 non-null   object 
 3   Acronym              1375 non-null   object 
 4   Former Name          1486 non-null   object 
 5   S Street Addr 1      12931 non-null  object 
 6   S Street Addr 2      3218 non-null   object 
 7   S Suburb             14303 non-null  object 
 8   S State              14346 non-null  object 
 9   S Postcode           14287 non-null  object 
 10  Phone                10401 non-null  object 
 11  Mobile               5782 non-null   object 
 12  Email                12615 non-null  object 
 13  Website              12137 non-null  object 
 14  Open Hours           6285 non-null   object 
 15  Wheelchair Access    5118 non-null  

#### Extract suburb and council to use as metadata for improved search results

In [3]:
# Keep the suburb and council columns aside so they can be attached as metadata later,
# then remove every address-related column from the frame used to build the text.
# Note: the drop happens in place, so re-running this cell without reloading the CSV
# raises KeyError because the columns are already gone.
location, council = data["S Suburb"], data["Council"]
data.drop(
    columns=[
        "S Street Addr 1",
        "S Street Addr 2",
        "S Suburb",
        "S State",
        "S Postcode",
        "Council",
    ],
    inplace=True,
)

#### Function to create a string for each CSV row, omitting columns with null values, and collect the strings in a list

In [4]:
def create_row_strings(df):
    """Render every row of ``df`` as a block of "column: value" lines.

    Cells for which ``pd.notna`` is false are skipped, so each string lists
    only the columns that are populated for that row. A row with no populated
    columns becomes an empty string.

    Args:
        df: DataFrame whose rows are to be rendered.

    Returns:
        A list with one string per row, in row order; the "column: value"
        entries within a string are separated by newlines.
    """
    return [
        "\n".join(
            f"{column_name}: {value}"
            for column_name, value in row.items()
            if pd.notna(value)
        )
        # iterrows() yields (index label, row Series); the label is not needed here
        for _, row in df.iterrows()
    ]

# Build one formatted string per row of `data`; the length shown below is the number of strings produced.
strings_list = create_row_strings(data)
len(strings_list)

14378

In [5]:
# Inspect the first five strings to confirm that null-valued columns were omitted from each row
strings_list[0:5]

['Org ID: 193932\nOrg Name: RSL Ardrossan Sub Branch\nAKA: Ardrossan RSL; Returned & Services League Ardrossan\nPhone: 08 8837 3596\nEmail: ardrossan@rslsa.org.au\nWebsite: http://rslsa.org.au/stores/ardrossan\nServices: Welfare and pensions support for ex-servicemen and their families\r\\\nSocial and recreational activities\r\\\nCommemoration activities - ANZAC Day, Remembrance Day and other significant events\nOrg Type: Business\nLocal Community dir: Service Clubs\nSubjects: Ex-Defence Service Groups; Halls For Hire; Social & Activity Groups; Support & Resource Groups; Veterans\nPrimary Category: Recreation',
 'Org ID: 193933\nOrg Name: RSL Balaklava Sub Branch\nAKA: Balaklava RSL; Returned & Services League Balaklava\nPhone: 08 8100 7300 Main Office\nMobile: 0433 799 950 President\nEmail: balaklava@rslsa.org.au\nWebsite: http://www.rslsa.org.au\nServices: Welfare and pensions support for ex-servicemen and their families\r\\\nSocial and recreational activities\r\\\nCommemoration acti

#### Create a LangChain Document for each string and collect the documents in a list for vector database preparation

In [6]:
from langchain_core.documents import Document

# Build one Document per row string, attaching the row's suburb and council as metadata.
# The three sequences are paired by position with zip() instead of being looked up as
# location[i] / council[i]: an integer key on a Series is a label lookup, which only
# matches row order while the index is the default 0..n-1 range.
# NOTE(review): data.info() shows "S Suburb" is null for some rows, so 'location' can be
# NaN in the metadata — confirm that is acceptable for metadata filtering.
assert len(strings_list) == len(location) == len(council), (
    "row strings and metadata columns must have the same length"
)

doc = [
    Document(page_content=text, metadata={'location': suburb, 'council': area})
    for text, suburb, area in zip(strings_list, location, council)
]


In [7]:
# Inspect the first Document to check its page content and metadata
doc[0]

Document(page_content='Org ID: 193932\nOrg Name: RSL Ardrossan Sub Branch\nAKA: Ardrossan RSL; Returned & Services League Ardrossan\nPhone: 08 8837 3596\nEmail: ardrossan@rslsa.org.au\nWebsite: http://rslsa.org.au/stores/ardrossan\nServices: Welfare and pensions support for ex-servicemen and their families\r\\\nSocial and recreational activities\r\\\nCommemoration activities - ANZAC Day, Remembrance Day and other significant events\nOrg Type: Business\nLocal Community dir: Service Clubs\nSubjects: Ex-Defence Service Groups; Halls For Hire; Social & Activity Groups; Support & Resource Groups; Veterans\nPrimary Category: Recreation', metadata={'location': 'Ardrossan', 'council': 'Yorke Peninsula Council'})

#### Create Vector Database and Save to Local Directory (chroma_db)

In [8]:
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma

# Credentials must never be hard-coded in a notebook: anyone who can read the file
# (or its version history) can use the key. Read it from the environment and fall
# back to an interactive prompt. Any key that was previously committed in this cell
# should be revoked and replaced.
import getpass

if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Google API key: ")

# Embed every Document and persist the resulting collection under ./chroma_db.
# NOTE(review): from_documents is called without explicit ids; re-running this cell
# against an existing ./chroma_db presumably adds the documents again — confirm, and
# either clear the directory first or pass stable ids.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vectorstore = Chroma.from_documents(doc, embeddings, persist_directory="./chroma_db")

In [9]:
# Count the records stored in the underlying Chroma collection (reads the private
# `_collection` attribute); compare with the number of Documents built above.
vectorstore._collection.count()

14378