<a href="https://colab.research.google.com/github/ciaraadkins/llmatch/blob/main/LLMatch_POC_2_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LLMatch Algorithm 2.0 Methodology**
High-level documentation can be found [here](https://docs.google.com/document/d/1CyBpXPGnkEDooIdKeGmA0kIvwJo_UZsEwLSMK9PZJkE/edit?usp=sharing).

More details on the methodology from a mathematical standpoint can be found [here](https://drive.google.com/file/d/1BLftZwvLb5t5QRNRGx6K0savZfLKZ_oL/view?usp=sharing).

Source data [here](https://docs.google.com/spreadsheets/d/1kL9vEnx4HXmjHeNy4dbFva-7MorQMhThfayFegDT914/edit?usp=sharing).

# **Init**

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [None]:
import numpy as np
from sklearn.metrics import pairwise_distances

In [None]:
%load_ext google.colab.data_table

In [None]:
# First pass mapping functions

def map_security(extra_secure):
    """Return the user's target value for the ``type`` feature.

    A truthy ``extra_secure`` yields -1, which pulls the match toward the
    open-source end of the scale; anything else yields the neutral 0.5 that
    favours neither side.
    """
    return -1 if extra_secure else 0.5

def map_budget(budget):
    """Return the user's target value for the ``inf_price`` feature.

    Only the exact label "$ (low budget)" yields -1 (favour the cheapest
    models); every other value yields the neutral 0.5.
    """
    if budget == "$ (low budget)":
        return -1
    return 0.5

# Mapping functions for speed, affecting both throughput and latency
def map_throughput(speed):
    """Return the user's target value for the ``throughput`` feature.

    The faster the requested speed label, the higher the target. A label
    that is not in the table falls back to the median value 0.5.
    """
    throughput_targets = {
        "Speed doesn't matter to me.": 0.5,
        "Fast": 0.66,
        "Faster": 0.83,
        "Fastest": 1.0,
    }
    try:
        return throughput_targets[speed]
    except KeyError:
        # Unrecognised label: use the median target.
        return 0.5

def map_latency(speed):
    """Return the user's target value for the ``latency`` feature.

    The scale is inverted relative to throughput: the faster the requested
    speed label, the lower the target latency. A label that is not in the
    table falls back to the median value 0.5.
    """
    latency_targets = dict([
        ("Speed doesn't matter to me.", 0.5),
        ("Fast", 0.34),
        ("Faster", 0.17),
        ("Fastest", 0.0),
    ])
    median_target = 0.5
    return latency_targets.get(speed, median_target)

def map_freshness(freshness):
    """Return the user's target value for the ``freshness`` feature.

    Returns -1 (favour the lowest freshness value, i.e. the most recent
    training data according to the notebook's own comments) when the user
    prefers a recent knowledge cutoff, and the neutral 0.5 otherwise.

    The label offered by the notebook's input form is
    "I prefer models with a more recent knowledge cutoff."; the older
    spelling without the period and with "cutoffs" is still accepted so
    existing callers keep working.
    """
    prefers_recent_labels = (
        "I prefer models with a more recent knowledge cutoff.",
        "I prefer models with a more recent knowledge cutoffs",
    )
    return -1 if freshness in prefers_recent_labels else 0.5

def map_context_window(context_in, context_out):
    """Return the user's target value for the ``context_window`` feature.

    Each of the expected input and output sizes is looked up in a fixed
    table and the two values are averaged.

    Args:
        context_in: label describing the largest expected model input.
        context_out: label describing the largest expected model output.

    Returns:
        The mean of the two table values.

    Raises:
        KeyError: if either label is not one of the known options.
    """
    contexts = {"a sentence": -1, "a few sentences": 0.2, "a paragraph": 0.5,
                "a few paragraphs": 0.8, "more than a few paragraphs": 1, "Unsure": 0.5}

    # The notebook's input form spells this option "paraagraphs"; accept that
    # spelling too so choosing it does not raise KeyError.
    contexts["more than a few paraagraphs"] = contexts["more than a few paragraphs"]

    return (contexts[context_in] + contexts[context_out]) / 2

def map_energy_usage(usage):
    """Return the user's target value for the ``parameters`` feature.

    Fewer parameters are used as the proxy for lower energy usage, so the
    exact label "Low" yields -1; every other value yields the neutral 0.5.
    """
    if usage == "Low":
        return -1
    return 0.5

def map_coding(coding):
    """Return the user's target value for the ``humaneval`` feature.

    A truthy ``coding`` yields 1 (favour the highest scores); a falsy one
    yields -1.
    """
    if coding:
        return 1
    return -1

# Function to calculate weighted Euclidean distance
def weighted_euclidean_distance(v, u, w):
    """Return the weighted Euclidean distance between ``v`` and ``u``.

    Computes ``sqrt(sum(w * (v - u) ** 2))``.

    Args:
        v: first vector (NumPy array or any sequence of numbers).
        u: second vector, same length as ``v``.
        w: per-component weights, same length as ``v``.

    Returns:
        The distance as a NumPy float.
    """
    # Coerce everything to arrays so plain lists/tuples work for any argument;
    # subtracting two Python lists would otherwise raise TypeError.
    v = np.asarray(v, dtype=float)
    u = np.asarray(u, dtype=float)
    w = np.asarray(w, dtype=float)
    return np.sqrt(np.sum(w * (v - u) ** 2))

# **Pre-Processing**

In [None]:
# Read the sample-set CSV from the Colab runtime's /content directory.
data_path = "/content/LLMatch Data 2.0 Workspace - final sample set.csv"
data = pd.read_csv(data_path)

# Two independent copies of the raw frame:
#   data_clean - receives the ranking columns further down
#   data_dnu   - spare copy, marked "do not use"
data_clean, data_dnu = data.copy(), data.copy()

In [None]:
# Convert the count columns ('context_window', 'parameters', 'param_b') to
# floats, stripping thousands separators such as '8,000'.
# Going through astype(str) first makes this work whether pandas read the
# column as text (values contain commas) or already as numbers, where the
# .str accessor alone would raise AttributeError. regex=False makes the comma
# a literal match regardless of the pandas version's default.
for count_col in ['context_window', 'parameters', 'param_b']:
    data[count_col] = (
        data[count_col].astype(str).str.replace(',', '', regex=False).astype(float)
    )

# Map 'type' from categorical to numeric (0 for 'open-source', 1 for 'commercial')
data['type'] = data['type'].map({'open-source': 0, 'commercial': 1})

# Columns to be included in the vector (this order defines the vector layout
# that the weights and the user vector must follow)
columns = ['type', 'parameters', 'freshness', 'context_window', 'humaneval', 'inf_price', 'throughput', 'latency']

# Normalize each column to the 0-1 range
scaler = MinMaxScaler()
data[columns] = scaler.fit_transform(data[columns])

# Replace missing humaneval values with 0; this runs after scaling, so 0 is
# the bottom of the normalized range
data['humaneval'] = data['humaneval'].fillna(0)

# Fill any remaining NaN values with the (normalized) column mean
for col in columns:
    data[col] = data[col].fillna(data[col].mean())

# Create a vector for each row
normalized_vectors = data[columns].values


# **Your Preferences**

In [None]:
### *****START User inputs*****
# Colab form cell: each `# @param` comment declares the dropdown options for
# the assignment on the same line. The option strings must match, character
# for character, the labels the mapping functions look up.

# @title ## Select what's most important to you when choosing an LLM:
### Maps to: inf_price
budget = "$ (low budget)" # @param ["$ (low budget)","Cost doesn't matter to me."]

### Maps to: throughput and latency
speed = "Faster"  # @param ["Fast","Faster","Fastest","Speed doesn't matter to me."]

### Maps to: cutoff_date or freshness
freshness = "Freshness doesn't matter to me."  # @param ["I prefer models with a more recent knowledge cutoff.","Freshness doesn't matter to me."]

### Maps to: context_window
# @markdown The largest **input** I'm expecting the model to process at a time is...
model_input = "a paragraph"  # @param ["a sentence","a few sentences","a paragraph","a few paragraphs","more than a few paragraphs","Unsure"]

# @markdown The largest **output** I'm expecting the model to return at a time is...
model_output = "a paragraph"  # @param ["a sentence","a few sentences","a paragraph","a few paragraphs","more than a few paragraphs","Unsure"]

# @markdown Keeping your energy usage low is also good for the environment ♻️
### Maps to: parameters
energy_usage = "Low"  # @param ["Low","Energy usage doesn't matter to me."]

### Maps to: type
extra_secure = False # @param {type:"boolean"}

### Maps to: humaneval
good_at_coding = True # @param {type:"boolean"}

### *****END User inputs*****

### *****START Vector Distance Ranking Algorithm*****

# On/off switches per preference: 0 when the user said the aspect does not
# matter (or left the flag unticked), 1 otherwise.
budget_weights = int(budget != "Cost doesn't matter to me.")
speed_weights = int(speed != "Speed doesn't matter to me.")
freshness_weights = int(freshness != "Freshness doesn't matter to me.")
energy_usage_weights = int(energy_usage != "Energy usage doesn't matter to me.")
secure_weights = int(bool(extra_secure))
coding_weights = int(bool(good_at_coding))

# Per-feature weights, in the same order as `columns`. Security and budget
# count double and coding counts four times when switched on.
weights = np.array([
    2 * secure_weights,        # type
    energy_usage_weights,      # parameters
    freshness_weights,         # freshness
    1,                         # context_window (always matters)
    4 * coding_weights,        # humaneval
    2 * budget_weights,        # inf_price
    speed_weights,             # throughput
    speed_weights,             # latency
])


# The user's ideal model, one target value per feature, in the same order as
# `columns`.
user_vector = [
    # type: a security-conscious user leans toward open-source
    map_security(extra_secure),
    # parameters: low energy usage favours models with fewer parameters
    map_energy_usage(energy_usage),
    # freshness: counts days since the training data, so a user who cares
    # about recent training data wants a lower number
    map_freshness(freshness),
    # context_window: larger expected inputs/outputs need a larger context length
    map_context_window(model_input, model_output),
    # humaneval: needing a good coding model very strongly favours high scores
    map_coding(good_at_coding),
    # inf_price: a low budget favours a lower inference price
    map_budget(budget),
    # throughput: wanting more speed means wanting higher throughput
    map_throughput(speed),
    # latency: wanting more speed also means wanting lower latency
    map_latency(speed),
]

# Weighted distance from every model's normalized vector to the user's ideal
distances = np.array([
    weighted_euclidean_distance(model_vector, user_vector, weights)
    for model_vector in normalized_vectors
])

# Attach the distances to the un-normalized copy of the table
data_clean['distance'] = distances

### ***** END Vector Distance Ranking Algorithm*****


### *****START Final Ranking w/ MMLU*****

# Fill MMLU NaN values with the column mean
data_clean['mmlu'] = data_clean['mmlu'].fillna(data_clean['mmlu'].mean())

# MMLU is assumed to be reported on a 0-100 scale; bring it to 0-1
data_clean['mmlu_norm'] = data_clean['mmlu'] / 100

# Min-max normalize the distances to 0-1. If every model is equally far away
# the range is zero and the plain formula would produce 0/0 = NaN (which then
# breaks the integer score conversion), so treat all models as equally close.
distance_min = data_clean['distance'].min()
distance_range = data_clean['distance'].max() - distance_min
if distance_range > 0:
    data_clean['distance_norm'] = (data_clean['distance'] - distance_min) / distance_range
else:
    data_clean['distance_norm'] = 0.0

def calculate_score(norm_distance, norm_mmlu):
    """Combine a normalized distance and a normalized MMLU into one score.

    The MMLU value is scaled by closeness (one minus the distance), so a
    score is high only when the model is both close to the user's
    preferences and strong on MMLU.
    """
    closeness = 1 - norm_distance
    return norm_mmlu * closeness

# Score every model from its normalized distance and normalized MMLU
data_clean['score'] = [
    calculate_score(dist, mmlu)
    for dist, mmlu in zip(data_clean['distance_norm'], data_clean['mmlu_norm'])
]

# Best score first, renumbered from 0
sorted_data = data_clean.sort_values('score', ascending=False).reset_index(drop=True)

### ***** END Final Ranking w/ MMLU*****


### ***** START Results *****

# Print the names of the top-ranked models: three, or fewer if the table has
# fewer rows (indexing rows 0..2 directly would raise IndexError then)
print("Top 3 Models for You:")

for rank, model_name in enumerate(sorted_data['name'].head(3), start=1):
  print(f"Rank {rank}: Model Name - {model_name}")

print("-"*30)

# Drop the intermediate ranking columns; only the final score is shown
sorted_data = sorted_data.drop(['distance', 'mmlu_norm', 'distance_norm'], axis=1)

# Express the score as a whole number out of 100 (truncated, not rounded)
sorted_data['score'] = (sorted_data['score'] * 100).astype(int)

# Display the sorted table (optional here, might be used in the actual application)
print("Top 10 Models with Details")
sorted_data.head(10)

### ***** END Results *****



Top 3 Models for You:
Rank 1: Model Name - Llama 3 (70B)
Rank 2: Model Name - Claude 3 Haiku
Rank 3: Model Name - Claude 3 Sonnet
------------------------------
Top 10 Models with Details


Unnamed: 0,name,type,parameters,param_b,param_inferred,cutoff_date,freshness,creator,context_window,mmlu,humaneval,inf_price,price_inferred,throughput,latency,score
0,Llama 3 (70B),open-source,70000000000,70.0,False,3/31/2023,403.0,OpenAI,8000,82.0,81.7,0.9,False,47.3,0.35,77
1,Claude 3 Haiku,commercial,20000000000,20.0,False,8/1/2023,280.0,Anthropic,200000,76.7,75.9,0.5,False,105.8,0.42,76
2,Claude 3 Sonnet,commercial,70000000000,70.0,False,8/1/2023,280.0,Anthropic,200000,81.5,73.0,6.0,False,59.6,0.51,74
3,GPT-4 Turbo,commercial,1402000000000,1402.0,True,12/1/2023,158.0,OpenAI,128000,86.4,85.4,15.0,False,18.6,0.56,71
4,Claude 2.1,commercial,1068000000000,1068.0,True,,,Anthropic,200000,78.5,71.2,12.0,False,40.8,0.5,65
5,Claude 2.0,commercial,1068000000000,1068.0,True,,,Anthropic,100000,78.5,71.2,12.0,False,38.0,0.51,64
6,GPT-3.5 Turbo,commercial,175000000000,175.0,False,9/30/2021,950.0,OpenAI,16000,70.0,73.2,0.75,False,48.7,0.35,64
7,Gemini 1.5 Pro,commercial,907000000000,907.0,True,3/1/2023,433.0,Google,1000000,81.9,71.9,10.5,False,43.9,1.26,63
8,Llama 2 Chat (70B),open-source,70000000000,70.0,False,7/1/2023,311.0,Meta,4000,68.9,62.2,1.0,False,64.3,0.32,59
9,Phi-3-mini-128k-instruct,open-source,3800000000,3.8,False,10/31/2023,189.0,Microsoft,128000,68.8,58.5,0.12,True,,,58
