### AI-Tools Matcher with Langchain

Load env variables

In [6]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

Load Softwaregini-CSV into Pinecone Vector Store

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
import pandas as pd
from openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Pinecone as PineconeVectorStore
from pinecone import Pinecone

# Lade Umgebungsvariablen aus der .env-Datei
load_dotenv(find_dotenv())

# CSV-Datei einlesen
df = pd.read_csv('tools_clean.csv', delimiter=';')[['name', 'description']]
-
# Pinecone-Instanz initialisieren
pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY")
)

# Index abrufen mit explizitem Host
index_name = "tools-matcher"
host = "https://tools-matcher-uow16a3.svc.aped-4627-874a.pinecone.io"

index = pc.Index(
    name=index_name,
    host=host
)

# Embeddings erstellen mit direkter Übergabe des API-Keys
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

# Dokumente vorbereiten
texts = df['description'].tolist()
metadatas = df.apply(lambda x: {
    'name': x['name']
}, axis=1).tolist()

index.delete(delete_all=True)

# Vectorstore erstellen
vectorstore = PineconeVectorStore.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas,
    index_name=index_name
)

print("Daten erfolgreich in Pinecone gespeichert.")


MaxRetryError: HTTPSConnectionPool(host='tools-matcher-uow16a3.svc.aped-4627-874a.pinecone.io', port=443): Max retries exceeded with url: /vectors/delete (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000021FB336B250>: Failed to resolve 'tools-matcher-uow16a3.svc.aped-4627-874a.pinecone.io' ([Errno 11001] getaddrinfo failed)"))

LLM interaction with batching


In [3]:
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
import pandas as pd
import os
from tqdm import tqdm # progress bar

# 1. Lade die Human-Datei
df_human = pd.read_csv('../softwaregini_files/organization_tools.csv', delimiter=';')

# 2. Erstelle einen effizienten Prompt
prompt_template = PromptTemplate(
    input_variables=["human_tool", "human_desc", "matched_tool", "matched_desc"],
    template="""
    Vergleiche diese zwei Software-Tools und gib einen Similarity Score von 0-100 zurück:

    Tool 1:
    {human_desc}

    Tool 2:
    {matched_desc}

    Antworte nur mit einer Zahl von 0-100, die die Ähnlichkeit repräsentiert.
    """
)

# 3. LLM initialisieren
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo", api_key=os.getenv("OPENAI_API_KEY"))

# Preise pro 1K Tokens (Stand März 2024):
# GPT-4: $0.03
# GPT-3.5-turbo: $0.0010
 
# Cutoff
MIN_SCORE = 80

# Number of prompts per API-Call
BATCH_SIZE = 50

# matching funktion mit batching
def process_batch(batch_rows, vectorstore, min_score, batch_size):  # Parameter werden übergeben
    batch_results = []
    prompts = []
    
    for _, row in batch_rows.iterrows():
        description = str(row['description']) if pd.notna(row['description']) else ""
        name = str(row['name']) if pd.notna(row['name']) else ""
        human_text = f"Name: {name}\nBeschreibung: {description}"
        
        similar_docs = vectorstore.similarity_search(
            human_text,
            k=2
        )
        
        for doc in similar_docs:
            formatted_prompt = prompt_template.format(
                human_tool=name,
                human_desc=human_text,
                matched_tool=doc.metadata['name'],
                matched_desc=doc.page_content
            )
            prompts.append({
                'prompt': formatted_prompt,
                'metadata': {
                    'human_name': name,
                    'matched_tool': doc.metadata['name']
                }
            })
    
    # Batch Processing
    batch_texts = [p['prompt'] for p in prompts]
    all_responses = [llm.invoke(text).content for text in batch_texts]
    
    # Verarbeite Responses
    for prompt_data, score in zip(prompts, all_responses):
        try:
            score_value = float(score)
            if score_value >= min_score:  # Verwendet übergebenen min_score
                batch_results.append({
                    'human_name': prompt_data['metadata']['human_name'],
                    'tool_name': prompt_data['metadata']['matched_tool'],
                    'similarity_score': score_value
                })
        except ValueError:
            continue
    
    return batch_results

# Hauptverarbeitung
results = []

for i in tqdm(range(0, len(df_human), BATCH_SIZE)):
    batch = df_human.iloc[i:i+BATCH_SIZE]
    batch_results = process_batch(
        batch_rows=batch, 
        vectorstore=vectorstore, 
        min_score=MIN_SCORE,  # Übergabe der Konstante
        batch_size=BATCH_SIZE
    )
    results.extend(batch_results)


# Ergebnisse in DataFrame konvertieren
df_results = pd.DataFrame(results)

# Nach Similarity Score sortieren
df_results = df_results.sort_values('similarity_score', ascending=False)

# Als CSV speichern
df_results.to_csv('matching_results.csv', index=False, sep=';')

100%|██████████| 7/7 [10:00<00:00, 85.75s/it]


LLM interaction with batching and async threading

In [2]:
from langchain.prompts import PromptTemplate
import pandas as pd
import os
from tqdm import tqdm # progress bar
from concurrent.futures import ThreadPoolExecutor
from langchain_openai import ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone

# Pinecone Client initialisieren
pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY")
)

# Existierenden Index laden
index_name = "tools-matcher"
index = pc.Index(index_name)

# Vectorstore aus existierendem Index initialisieren
embeddings = OpenAIEmbeddings()
vectorstore = PineconeVectorStore(
    index=index,
    embedding=embeddings,
    text_key="text"  # der Name des Textfelds in Ihrem Index
)

# 1. Lade die Human-Datei
df_human = pd.read_csv('../softwaregini_files/organization_tools.csv', delimiter=';')
# df_human = pd.read_csv('../softwaregini_files/organization_tools_2.csv', delimiter=';')

# prompt_template = PromptTemplate(
#     input_variables=["human_tool", "human_desc", "matched_tool", "matched_desc"],
#     template="""Compare these software tools and determine if they are the same product.

#     Tool 1:
#     Name: {human_tool}
#     Description: {human_desc}

#     Tool 2:
#     Name: {matched_tool}
#     Description: {matched_desc}

#     Scoring rules:
#     100: Exact same product (e.g., 'GitHub Desktop' = 'GitHub')
#     95: Same product, different editions (e.g., 'Teams' = 'Microsoft Teams')
#     90: Product variants (e.g., 'SendInBlue' = 'Brevo')
#     0: Different products, even if similar purpose

#     DO NOT match:
#     - Different messaging apps (e.g., 'WhatsApp' ≠ 'Signal')
#     - Different IDEs/Languages (e.g., 'Python' ≠ 'PyCharm')
#     - Different PDF tools (e.g., 'PDFFactory' ≠ 'PDFCreator')
#     - Different meeting tools (e.g., 'GoToMeeting' ≠ 'GoToAssist')
#     - Different visualization tools (e.g., 'Inkscape' ≠ 'Camtasia')

#     Respond ONLY with a number between 0-100."""
# )

prompt_template = PromptTemplate(
    input_variables=["human_tool", "human_desc", "matched_tool", "matched_desc"],
    template="""Compare these software tools and determine if they are the same product.

    Tool 1:
    Name: {human_tool}
    Description: {human_desc}

    Tool 2:
    Name: {matched_tool}
    Description: {matched_desc}

    Scoring rules:
    100: Exact same product (identical names or official variants)
    95: Same product with minor name differences (e.g., full name vs. short name)
    90: Same product in different editions/versions
    0: Different products, even if similar purpose or from same company

    DO NOT match:
    - Different products from same company
    - Free vs. Pro versions as different products
    - Similar tools with different core purposes
    - Platform vs. specific service variants

    Respond ONLY with a number between 0-100."""
)

# LLM init
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo", api_key=os.getenv("OPENAI_API_KEY"))

# Cutoff
MIN_SCORE = 90

# API-Calls pro Batch = BATCH_SIZE * k
# Gesamtzahl API-Calls = len(df_human) / BATCH_SIZE * k
BATCH_SIZE = 50

# Optimierte Batch-Verarbeitung
def process_batch(batch_rows, vectorstore, min_score, batch_size):
    batch_results = []
    prompts = []
    
    # Prompt-Vorbereitung
    for _, row in batch_rows.iterrows():
        description = str(row['description']) if pd.notna(row['description']) else ""
        name = str(row['name']) if pd.notna(row['name']) else ""
        human_text = f"Name: {name}\nBeschreibung: {description}"
        
        similar_docs = vectorstore.similarity_search(
            human_text,
            k=1 # Anzahl der Tools die maximal gematched werden sollen
        )
        
        for doc in similar_docs:
            formatted_prompt = prompt_template.format(

                human_tool=name,                    # Name des Tools aus der CSV
                human_desc=description,             # Beschreibung aus der CSV
                matched_tool=doc.metadata['name'],  # Name des gefundenen ähnlichen Tools
                matched_desc=doc.page_content       # Beschreibung des gefundenen Tools
            )
            prompts.append({
                'prompt': formatted_prompt,
                'metadata': {
                    'human_name': name,
                    'matched_tool': doc.metadata['name']
                }
            })

    # Batch Processing mit ThreadPoolExecutor
    def process_prompt(prompt):
        response = llm.invoke(prompt)
        return response.content

    with ThreadPoolExecutor() as executor:
        batch_texts = [p['prompt'] for p in prompts]
        all_responses = list(executor.map(process_prompt, batch_texts))
    
    # Verarbeite Responses
    for prompt_data, score in zip(prompts, all_responses):
        try:
            score_value = float(score)
            if score_value >= min_score:
                batch_results.append({
                    'human_name': prompt_data['metadata']['human_name'],
                    'tool_name': prompt_data['metadata']['matched_tool'],
                    'similarity_score': score_value
                })
        except ValueError:
            continue
    
    return batch_results

# Hauptverarbeitung
results = []

for i in tqdm(range(0, len(df_human), BATCH_SIZE)):
    batch = df_human.iloc[i:i+BATCH_SIZE]
    batch_results = process_batch(
        batch_rows=batch,
        vectorstore=vectorstore,
        min_score=MIN_SCORE,
        batch_size=BATCH_SIZE
    )
    results.extend(batch_results)

# Ergebnisse in DataFrame konvertieren
df_results = pd.DataFrame(results)

# Nach Similarity Score sortieren
df_results = df_results.sort_values('similarity_score', ascending=False)

# Als CSV speichern
df_results.to_csv('matching_results.csv', index=False, sep=';')
# df_results.to_csv('matching_results_2.csv', index=False, sep=';')

  from tqdm.autonotebook import tqdm
100%|██████████| 7/7 [03:04<00:00, 26.33s/it]


Performance-Evaluation

In [3]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score

def evaluate_matching(final_matches_file, ground_truth_file):
    # Dateien einlesen
    final_matches = pd.read_csv(final_matches_file, delimiter=';')
    ground_truth = pd.read_csv(ground_truth_file, delimiter=';')
    
    # Lowercasing und Trimmen von Strings
    final_matches['human_name'] = final_matches['human_name'].str.strip().str.lower()
    final_matches['tool_name'] = final_matches['tool_name'].str.strip().str.lower()
    ground_truth['Human Label'] = ground_truth['Human Label'].str.strip().str.lower()
    ground_truth['Tool Label'] = ground_truth['Tool Label'].str.strip().str.lower()

    # Matches in ein einheitliches Format bringen
    final_set = set(tuple(x) for x in final_matches[['human_name', 'tool_name']].values)
    truth_set = set(tuple(x) for x in ground_truth[['Human Label', 'Tool Label']].values)
    
    # True/False Positives/Negatives berechnen
    true_positives = len(final_set.intersection(truth_set))
    false_positives = len(final_set - truth_set)
    false_negatives = len(truth_set - final_set)
    
    # Listen der False Positives und False Negatives
    false_positives_list = list(final_set - truth_set)
    false_negatives_list = list(truth_set - final_set)

    # Metriken berechnen
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1_score,
        'True Positives': true_positives,
        'False Positives': false_positives,
        'False Negatives': false_negatives,
        'False Positives List': false_positives_list,
        'False Negatives List': false_negatives_list
    }

# Evaluation durchführen
results = evaluate_matching('matching_results.csv', '../data/ground_truth.csv')
#results = evaluate_matching('matching_results_2.csv', '../data/ground_truth_2.csv')

# Ergebnisse ausgeben
for metric, value in results.items():
    if isinstance(value, list):
        print(f"{metric}:")
        for item in value:
            print(item)
    else:
        print(f"{metric}: {value:.3f}" if isinstance(value, float) else f"{metric}: {value}")


Precision: 0.915
Recall: 0.744
F1-Score: 0.821
True Positives: 119
False Positives: 11
False Negatives: 41
False Positives List:
('python', 'pycharm')
('libreoffice', 'libreoffice')
('microsoft power bi report builder', 'microsoft® report builder')
('textpad', 'textpad')
('inkscape', 'camtasia')
('crystal report runtime sp6', 'sap crystal')
('microsoft visual c++', 'visual studio')
('adobe photoshop', 'photoshop elements')
('articulate storyline', 'articulate 360')
('crystal reports activex viewer', 'report analyzer')
('gotoassist expert', 'goto meeting')
False Negatives List:
('microsoft power bi report builder', 'microsoft power bi desktop')
('adobe spark video', 'adobe creative cloud express')
('globalmeet', 'globalmeet webcast')
('jet excel add-in', 'jet reports')
('skype meetings', 'skype')
('adobe photoshop', 'adobe photoshop')
('adobe spark page', 'adobe creative cloud express')
('smartanalyzer app sdk', 'caseware idea')
('sophos ssl vpn client', 'sophos firewall')
('github desk