In [2]:
import pandas as pd
import faiss
import requests
import numpy as np
import os

Load Index

In [3]:
if os.path.exists('index'):
  index = faiss.read_index('index')
  print("Index Loaded") 
else:
  dim = 3072
  index = faiss.IndexFlatL2(dim)
  print("Index Created")

Index Loaded


Create text representation

In [17]:
def text_rep(row):
  text_rep = f"""Title: {row['Title']}
Publication Date: {row['Publication Date']}
Keywords: {row['Keywords']}
Abstract: {row['Abstract']}
Subject Areas: {row['Subject Areas']}
"""
  return text_rep

In [18]:
datasets = ['./Database-files/2018.csv', './Database-files/2019.csv', './Database-files/2020.csv', './Database-files/2021.csv', './Database-files/2022.csv', './Database-files/2023.csv', './Database-files/final5_arxiv_articles.csv']
df = pd.DataFrame()
for dataset in datasets:
  temp_df = pd.read_csv(dataset)
  df = pd.concat([df, temp_df], ignore_index=True)

df['text_representation'] = df.apply(text_rep, axis=1)

In [19]:
df.head()['text_representation']

0    Title: Recent developments in bifunctional air...
1    Title: The benefit of punishment sensitivity o...
2    Title: Are endocrine disrupting compounds envi...
3    Title: Probabilistic regular grammar inference...
4    Title: Optimization of cinnamon oil nanoemulsi...
Name: text_representation, dtype: object

In [34]:
print(len(df))

17438


Create and Store Embeddings

In [14]:
X = np.empty((len(df['text_representation']), dim), dtype = 'float32')

KeyError: 'text_representation'

In [12]:
def embed(text_rep):
  res = requests.post('http://localhost:11434/api/embeddings', 
                      json = {
                        'model': 'llama3.2',
                        'prompt': text_rep
                      })
  return np.array([res.json()['embedding']], dtype='float32')

In [None]:
import requests

def query_ollama(prompt):
    response = requests.post(
        "http://localhost:11434/api/completions",
        json={"model": "llama3.2", "prompt": prompt}
    )
    response.raise_for_status()  # Ensure no HTTP errors
    return response.json()

print(query_ollama("Hello, how are you?"))

In [57]:
for i, text_rep in enumerate(df['text_representation']):
  if i%100 == 0:
    print(i)
  X[i] = embed(text_rep)
  
index.add(X)

0


KeyError: 'embedding'

In [53]:
faiss.write_index(index, 'index')

Testing

In [39]:
print(df[df.Title.str.contains('bifunctional air electrodes ')]['text_representation'].to_string())

0    Title: Recent developments in bifunctional air...


In [40]:
favorite_paper = df.iloc[0]

In [10]:
test = """Title: Advances in bifunctional electrocatalysts for efficient unitized regenerative proton exchange membrane fuel cells
Publication Date: 2021-08-10
Keywords: Electrocatalysts, Bifunctional electrodes, Oxygen reduction reaction, Oxygen evolution reaction, PEM fuel cells, Hydrogen production, Energy conversion, Electrochemical performance, Catalyst design, Regenerative fuel cells
Abstract: Unitized regenerative proton exchange membrane fuel cells (UR-PEMFCs) have shown promise as a sustainable energy solution for both energy storage and conversion. However, the performance of these systems is heavily dependent on the efficiency of the air electrodes, specifically the bifunctional electrocatalysts that mediate both the oxygen reduction reaction (ORR) and the oxygen evolution reaction (OER). This paper reviews recent developments in bifunctional catalysts for UR-PEMFCs, with a focus on the design and optimization of electrocatalyst materials that enhance both reactions' efficiency. The work highlights the use of non-precious metal catalysts, the role of hybrid materials, and the advances in electrode architectures aimed at improving long-term stability and reducing costs. Recent studies demonstrate that transition metal-based catalysts offer significant potential in improving the round-trip efficiency of UR-PEMFCs, with efficiencies exceeding 60% at moderate current densities.
Subject Areas: Renewable Energy, Energy Conversion and Storage, Fuel Cell Technology, Materials Science, Electrochemistry, Environmental Sustainability
"""
test_authors = ['Jienkulsawad P.', 'Saebea D.', 'Patcharavorachot Y.', 'Kheawhom S.', 'Arpornwichanop A.']

In [5]:
def faiss_search(index, query):
  distances, indices = index.search(embed(query), 10)
  return distances[0], indices[0] #We only have 1 query

In [6]:
def count_authors_matches(list1,list2):
  return sum([author in list1 for author in list2])

In [7]:
def weighted_scores(indices, distances, query_authors, df):
    results = []
    for index, distance in zip(indices, distances):
        candidate_paper = df.iloc[index]
        authors = candidate_paper['Authors']
        matches_amount = count_authors_matches(authors, query_authors)
        similarity_score = (1 - distance/1000)
        weighted_scores = similarity_score * (1.1 ** matches_amount)
        
        results.append((index, candidate_paper['Title'], weighted_scores))
    
    sorted_results = sorted(results, key=lambda x: x[2], reverse=True)
    return sorted_results


In [8]:
def recommend(index, query, query_authors, df):
  distances, indices = faiss_search(index, query)
  best_matches = weighted_scores(indices, distances, query_authors, df)
  best_indices = [x[0] for x in best_matches]
  best_matches_df = df.iloc[best_indices]
  for _,match in best_matches_df.iterrows():
     print(f"{match['text_representation']}Authors: {match['Authors']}\n")

In [20]:
best_matches = recommend(index, test, test_authors, df)

Title: A short proof of Seymour's 6-flow theorem
Publication Date: 7/6/2023
Keywords: nan
Abstract: We give a compact variation of Seymour's proof that every $2$-edge-connected
graph has a nowhere-zero $\mathbb{Z}_2 \times \mathbb{Z}_3$-flow.
Subject Areas: Combinatorics, Mathematics
Authors: Matt DeVos, Kathryn Nurse

Title: A short proof of Seymour's 6-flow theorem
Publication Date: 7/6/2023
Keywords: nan
Abstract: We give a compact variation of Seymour's proof that every $2$-edge-connected
graph has a nowhere-zero $\mathbb{Z}_2 \times \mathbb{Z}_3$-flow.
Subject Areas: Combinatorics, Mathematics
Authors: Matt DeVos, Kathryn Nurse

Title: A short proof of Seymour's 6-flow theorem
Publication Date: 7/6/2023
Keywords: nan
Abstract: We give a compact variation of Seymour's proof that every $2$-edge-connected
graph has a nowhere-zero $\mathbb{Z}_2 \times \mathbb{Z}_3$-flow.
Subject Areas: Combinatorics, Mathematics
Authors: Matt DeVos, Kathryn Nurse

Title: A short proof of Seymour's 6-f