#### This notebook reduces the token count by dropping, for each row, the columns whose value is null. The LangChain CSV loader still emits every column even when it is empty, which also degrades the similarity search results

In [10]:
import pandas as pd
# Load the raw organisation directory export into a DataFrame.
data = pd.read_csv("decoded_sacom.csv")

In [11]:
# Inspect column names, dtypes and non-null counts to see which columns are sparse.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14378 entries, 0 to 14377
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Org ID               14378 non-null  int64  
 1   Org Name             14378 non-null  object 
 2   AKA                  4861 non-null   object 
 3   Acronym              1375 non-null   object 
 4   Former Name          1486 non-null   object 
 5   S Street Addr 1      12931 non-null  object 
 6   S Street Addr 2      3218 non-null   object 
 7   S Suburb             14303 non-null  object 
 8   S State              14346 non-null  object 
 9   S Postcode           14287 non-null  object 
 10  Phone                10401 non-null  object 
 11  Mobile               5782 non-null   object 
 12  Email                12615 non-null  object 
 13  Website              12137 non-null  object 
 14  Open Hours           6285 non-null   object 
 15  Wheelchair Access    5118 non-null  

#### Split the services and subjects into separate rows for improved search results
###### When the content is too long, the similarity score is lower even though the content contains the service

In [12]:
# Address, contact and accessibility columns are dropped from both views.
_dropped_in_both = [
    "S Street Addr 1", "S Street Addr 2", "S State", "S Postcode",
    "Phone", "Mobile", "Email", "Website", "Open Hours",
    "Wheelchair Access", "Toilets Access", "Disabled Parking",
]

# `service` keeps the Services column, `subject` keeps the Subjects column.
service = data.drop(columns=_dropped_in_both + ["Subjects"])
subject = data.drop(columns=_dropped_in_both + ["Services"])

In [13]:
# Services are newline-separated within a cell: split them and give each
# service its own row (shorter content improves the similarity score).
service = service.assign(
    Services=service["Services"].str.split("\n")
).explode("Services")
len(service)

48424

In [14]:
# Subjects are ';'-separated within a cell: split them and give each
# subject its own row (shorter content improves the similarity score).
subject = subject.assign(
    Subjects=subject["Subjects"].str.split(";")
).explode("Subjects")
len(subject)

50732

In [15]:
# Stack the per-service rows on top of the per-subject rows (row-wise concat).
combined_df = pd.concat([service, subject], axis=0)

In [16]:
# Sanity check: should equal len(service) + len(subject).
len(combined_df)

99156

In [17]:
# Preview the combined frame; index labels repeat here because explode keeps
# the original row label for every exploded row.
combined_df.head()

Unnamed: 0,Org ID,Org Name,AKA,Acronym,Former Name,S Suburb,Services,Org Type,Local Community dir,Adelaide Hills dir,Onkaparinga dir,Primary Category,Council,Subjects
0,193932,RSL Ardrossan Sub Branch,Ardrossan RSL; Returned & Services League Ardr...,,,Ardrossan,Welfare and pensions support for ex-servicemen...,Business,Service Clubs,,,Recreation,Yorke Peninsula Council,
0,193932,RSL Ardrossan Sub Branch,Ardrossan RSL; Returned & Services League Ardr...,,,Ardrossan,Social and recreational activities,Business,Service Clubs,,,Recreation,Yorke Peninsula Council,
0,193932,RSL Ardrossan Sub Branch,Ardrossan RSL; Returned & Services League Ardr...,,,Ardrossan,"Commemoration activities - ANZAC Day, Remembra...",Business,Service Clubs,,,Recreation,Yorke Peninsula Council,
1,193933,RSL Balaklava Sub Branch,Balaklava RSL; Returned & Services League Bala...,,,Balaklava,Welfare and pensions support for ex-servicemen...,Community,Support Groups,,,Personal & Family Support,Wakefield Regional Council,
1,193933,RSL Balaklava Sub Branch,Balaklava RSL; Returned & Services League Bala...,,,Balaklava,Social and recreational activities,Community,Support Groups,,,Personal & Family Support,Wakefield Regional Council,


In [18]:
# Replace the index with a fresh 0..n-1 range: after exploding and
# concatenating, the index labels are duplicated and out of order.
combined_df = combined_df.reset_index(drop=True)
combined_df.head()

Unnamed: 0,Org ID,Org Name,AKA,Acronym,Former Name,S Suburb,Services,Org Type,Local Community dir,Adelaide Hills dir,Onkaparinga dir,Primary Category,Council,Subjects
0,193932,RSL Ardrossan Sub Branch,Ardrossan RSL; Returned & Services League Ardr...,,,Ardrossan,Welfare and pensions support for ex-servicemen...,Business,Service Clubs,,,Recreation,Yorke Peninsula Council,
1,193932,RSL Ardrossan Sub Branch,Ardrossan RSL; Returned & Services League Ardr...,,,Ardrossan,Social and recreational activities,Business,Service Clubs,,,Recreation,Yorke Peninsula Council,
2,193932,RSL Ardrossan Sub Branch,Ardrossan RSL; Returned & Services League Ardr...,,,Ardrossan,"Commemoration activities - ANZAC Day, Remembra...",Business,Service Clubs,,,Recreation,Yorke Peninsula Council,
3,193933,RSL Balaklava Sub Branch,Balaklava RSL; Returned & Services League Bala...,,,Balaklava,Welfare and pensions support for ex-servicemen...,Community,Support Groups,,,Personal & Family Support,Wakefield Regional Council,
4,193933,RSL Balaklava Sub Branch,Balaklava RSL; Returned & Services League Bala...,,,Balaklava,Social and recreational activities,Community,Support Groups,,,Personal & Family Support,Wakefield Regional Council,


#### Extract the location and council to use as metadata when creating the vector database, so that search can be improved with metadata filtering

In [19]:
# Keep lower-cased suburb and council aside as per-row metadata, then remove
# both columns so they do not end up in the text that gets embedded.
location = combined_df["S Suburb"].str.lower()
council = combined_df["Council"].str.lower()
combined_df = combined_df.drop(columns=["S Suburb", "Council"])
combined_df.head()

Unnamed: 0,Org ID,Org Name,AKA,Acronym,Former Name,Services,Org Type,Local Community dir,Adelaide Hills dir,Onkaparinga dir,Primary Category,Subjects
0,193932,RSL Ardrossan Sub Branch,Ardrossan RSL; Returned & Services League Ardr...,,,Welfare and pensions support for ex-servicemen...,Business,Service Clubs,,,Recreation,
1,193932,RSL Ardrossan Sub Branch,Ardrossan RSL; Returned & Services League Ardr...,,,Social and recreational activities,Business,Service Clubs,,,Recreation,
2,193932,RSL Ardrossan Sub Branch,Ardrossan RSL; Returned & Services League Ardr...,,,"Commemoration activities - ANZAC Day, Remembra...",Business,Service Clubs,,,Recreation,
3,193933,RSL Balaklava Sub Branch,Balaklava RSL; Returned & Services League Bala...,,,Welfare and pensions support for ex-servicemen...,Community,Support Groups,,,Personal & Family Support,
4,193933,RSL Balaklava Sub Branch,Balaklava RSL; Returned & Services League Bala...,,,Social and recreational activities,Community,Support Groups,,,Personal & Family Support,


#### Function to create a string for each CSV row, omitting the columns that are null, and collect the strings in a list

In [20]:
def create_row_strings(df):
    """Render every row of ``df`` as a compact "column: value" text block.

    Columns whose value is null (per ``pd.notna``) are left out of that row's
    text. The remaining ``"name: value"`` pairs are joined with ``" \\n"``
    (a space followed by a newline).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows should be rendered.

    Returns
    -------
    list of str
        One string per row, in row order. A row with no non-null values
        yields an empty string.
    """
    return [
        " \n".join(
            f"{label}: {cell}"
            for label, cell in record.items()
            if pd.notna(cell)
        )
        for _, record in df.iterrows()
    ]

# Render every row of combined_df as text; the length should match the
# number of rows in combined_df.
strings_list = create_row_strings(combined_df)
len(strings_list)

99156

In [21]:
# Spot-check five rendered rows (positions 100-104): columns that are empty
# for a given row are absent from its text.
strings_list[100:105]

['Org ID: 193966 \nOrg Name: RSL Meningie Sub Branch \nAKA: Meningie RSL; Returned & Services League Meningie \nServices: Commemoration activities - ANZAC Day, Remembrance Day and other significant events \nOrg Type: Community \nLocal Community dir: Service Clubs \nPrimary Category: Community Organisation & Development',
 'Org ID: 193966 \nOrg Name: RSL Meningie Sub Branch \nAKA: Meningie RSL; Returned & Services League Meningie \nServices: Hall for hire \nOrg Type: Community \nLocal Community dir: Service Clubs \nPrimary Category: Community Organisation & Development',
 'Org ID: 193968 \nOrg Name: RSL Moonta Sub Branch \nAKA: Moonta RSL; Returned & Services League Moonta \nServices: Welfare and pensions support for ex-servicemen and their families \nOrg Type: Community \nLocal Community dir: Ex Services groups \nPrimary Category: Recreation',
 'Org ID: 193968 \nOrg Name: RSL Moonta Sub Branch \nAKA: Moonta RSL; Returned & Services League Moonta \nServices: Social and recreational acti

#### Create LangChain Documents from the list of strings in preparation for the vector database

In [22]:
from langchain_core.documents import Document

# Build one LangChain Document per rendered row. Suburb and council are
# attached as metadata so the vector store can filter on them at query time.
#
# Values are paired by position (zip) rather than looked up with
# `location[i]`: integer keys on a Series are label lookups, which are only
# correct while the index is exactly 0..n-1 and would return a whole Series
# for duplicated labels.
assert len(strings_list) == len(location) == len(council), (
    "row strings and metadata are misaligned"
)

# NOTE(review): rows with a missing suburb/council carry NaN in their
# metadata, as before — confirm the vector store accepts that.
doc = [
    Document(
        page_content=text,
        metadata={"location": suburb, "council": council_name},
    )
    for text, suburb, council_name in zip(strings_list, location, council)
]


In [16]:
# Spot-check one arbitrary document, including its metadata.
doc[12086]

Document(page_content='Org ID: 199864 \nOrg Name: Scouts SA - Stradbroke \nServices: Recreation \nOrg Type: Community \nPrimary Category: Recreation', metadata={'location': 'rostrevor', 'council': 'adelaide hills council'})

In [17]:
# print() renders the embedded newlines, showing the text as it will be embedded.
print(doc[11].page_content)

Org ID: 193936 
Org Name: RSL Blanchetown Sub Branch 
AKA: Blanchetown RSL; Returned & Services League Blanchetown 
Services: Social and recreational activities 
Org Type: Community 
Local Community dir: Service Clubs 
Primary Category: Community Organisation & Development


#### Create Vector Database and Save to Local Directory (chroma_db) (old approach, kept for reference)

In [15]:
# Legacy approach using Google Generative AI embeddings; disabled and kept
# for reference only.
# NOTE(review): if this is revived, set GOOGLE_API_KEY in the environment
# outside the notebook instead of pasting the key into a cell.
# import os
# from langchain_google_genai import GoogleGenerativeAIEmbeddings
# from langchain_chroma import Chroma

# os.environ["GOOGLE_API_KEY"] = "insert your api key here"
# embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# vectorstore = Chroma.from_documents(doc, embeddings, persist_directory="./chroma_db")
# vectorstore._collection.count()

#### Create Chroma Vector DB with HuggingFace Bert Embeddings

In [6]:
from langchain.embeddings import SentenceTransformerEmbeddings

In [8]:
# Sentence-transformers model used to embed both documents and queries.
EMBEDDING_MODEL_NAME = "WhereIsAI/UAE-Large-V1"
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/65.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [25]:
# Smoke test: embed a sample query and peek at the first three components.
sample_query = "quit smoking"
query_result = embeddings.embed_query(sample_query)
query_result[:3]

[-0.10548216849565506, -0.06547529995441437, -0.2753311097621918]

In [None]:
from langchain_chroma import Chroma

# Embed every document and persist the resulting Chroma collection to disk.
bert = Chroma.from_documents(
    documents=doc,
    embedding=embeddings,
    persist_directory="./bert_db",
)