In [148]:
# main libraries
import os
import json
import math
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from groq import Groq

# typing and classes
import instructor
from pydantic import BaseModel, Field
from typing import Optional, Literal, List
from enum import Enum 

# constants
load_dotenv("../.env")
GROQ_KEY = os.getenv("GROQ_KEY")
MODEL = "llama-3.3-70b-versatile"

# initialisation of model
client = Groq(api_key=GROQ_KEY)
client = instructor.from_groq(client, mode=instructor.Mode.JSON)
print("Groq client initialised.")

Groq client initialised.


In [106]:
cpu_df = pd.read_csv("../scraper/clean_data/cpu.csv")
cooler_df = pd.read_csv("../scraper/clean_data/cooler.csv")
storage_df = pd.read_csv("../scraper/clean_data/storage.csv")
memory_df = pd.read_csv("../scraper/clean_data/memory.csv")
motherboard_df = pd.read_csv("../scraper/clean_data/motherboard.csv")

In [107]:
cpu_requirements = json.load(open('./requirements/cpu_requirements.json', 'r'))
cooler_requirements = json.load(open('./requirements/cooler_requirements.json', 'r'))
storage_requirements = json.load(open('./requirements/storage_requirements.json', 'r'))
memory_requirements = json.load(open('./requirements/memory_requirements.json', 'r'))
motherboard_requirements = json.load(open('./requirements/motherboard_requirements.json', 'r'))

## Filter CSVs

In [108]:
def normalize_string(s):
    if isinstance(s, str):
        return s.lower().replace(" ", "_").replace("+", "plus").replace("-", "_")
    return s

In [123]:
def filter_cpu(
    df,
    min_cores=0,
    min_core_clock_ghz=0,
    min_boost_clock_ghz=0,
    microarchitecture=None,
    max_tdp_watts=math.inf,
    max_price=math.inf,
):
    filters = [
        df["core_count"] >= min_cores,
        df["performance_core_clock"] >= min_core_clock_ghz,
        df["performance_core_boost_clock"] >= min_boost_clock_ghz,
        df["tdp"] <= max_tdp_watts,
        df["price"].astype(float) <= max_price,
    ]

    if microarchitecture:
        filters.append(df['microarchitecture'].apply(normalize_string).isin(microarchitecture))

    return df.loc[pd.concat(filters, axis=1).all(axis=1)].reset_index(drop=True)


#############################################################################


def filter_cooler(
    df,
    min_fan_rpm=0,
    max_noise_level_db=math.inf,
    max_radiator_size_mm=math.inf,
    max_price=math.inf,
):
    filters = [
        df["average_fan_rpm"] >= min_fan_rpm,
        df["average_noise_level"] <= max_noise_level_db,
        df["radiator_size"] <= max_radiator_size_mm,
        df["price"].astype(float) <= max_price,
    ]

    return df.loc[pd.concat(filters, axis=1).all(axis=1)].reset_index(drop=True)


#############################################################################


def filter_storage(
    df,
    min_capacity_gb=0,
    preferred_type=None, 
    min_cache_gb=0,
    preferred_form_factor=None,
    preferred_interface=None,
    max_price_per_gb=math.inf
):

    filters = [
        df["capacity_gb"].astype(float) >= min_capacity_gb,
        df["cache_gb"].astype(float) >= min_cache_gb,
        df["price_per_gb"].astype(float) <= max_price_per_gb
    ]

    if preferred_type:
        filters.append(df["type"].apply(normalize_string).isin(preferred_type))

    if preferred_form_factor:
        filters.append(df["form_factor"].apply(normalize_string).isin(preferred_form_factor))

    if preferred_interface:
        filters.append(df["interface"].apply(normalize_string).isin(preferred_interface))

    return df.loc[pd.concat(filters, axis=1).all(axis=1)].reset_index(drop=True)


#############################################################################


def filter_memory(
    df,
    min_capacity_gb=0,
    min_speed_mhz=None, # TODO
    preferred_module_count=0,
    max_cas_latency=math.inf,
    max_price=math.inf
):
    filters = [
        df["total_ram"] >= min_capacity_gb,
        df["module_count"] >= preferred_module_count,
        df["cas_latency"] <= max_cas_latency,
        df["price"].astype(float) <= max_price,
    ]

    return df.loc[pd.concat(filters, axis=1).all(axis=1)].reset_index(drop=True)


#############################################################################


def filter_motherboard(
    df,
    preferred_socket=None,
    preferred_form_factor=None, 
    min_max_memory_gb=0,
    min_memory_slots=0,
    max_price=math.inf
):

    filters = [
        df["max_memory_gb"].astype(float) >= min_max_memory_gb,
        df["memory_slots"] >= min_memory_slots,
        df["price"] <= max_price,
    ]

    if preferred_socket:
        filters.append(df["cpu_socket"].apply(normalize_string).isin(preferred_socket))

    if preferred_form_factor:
        filters.append(df["form_factor"].apply(normalize_string).isin(preferred_form_factor))


    return df.loc[pd.concat(filters, axis=1).all(axis=1)].reset_index(drop=True)


# filter_memory(memory_df.copy(), **memory_requirements)



In [124]:
cpu_filtered = filter_cpu(cpu_df.copy(), **cpu_requirements)
cooler_filtered = filter_cooler(cooler_df.copy(), **cooler_requirements)
storage_filtered = filter_storage(storage_df.copy(), **storage_requirements)
memory_filtered = filter_memory(memory_df.copy(), **memory_requirements)
motherboard_filtered = filter_motherboard(motherboard_df.copy(), **motherboard_requirements)

In [126]:
cpu_filtered

Unnamed: 0,title,core_count,performance_core_clock,performance_core_boost_clock,microarchitecture,tdp,integrated_graphics,rating,price
0,AMD Ryzen 7 7800X3D,8,4.2,5.0,Zen 4,120,Radeon,4.5,391.12
1,AMD Ryzen 7 7700X,8,4.5,5.4,Zen 4,105,Radeon,4.5,268.6
2,AMD Ryzen 7 7700,8,3.6,5.3,Zen 4,65,Radeon,4.5,279.97
3,AMD Ryzen 9 7950X3D,16,4.2,5.7,Zen 4,120,Radeon,4.5,699.99
4,Intel Core i5-13600KF,14,3.5,5.1,Raptor Lake,125,,4.5,189.99
5,AMD Ryzen 7 8700G,8,4.2,5.1,Zen 4,65,Radeon 780M,5.0,303.0
6,AMD Ryzen 7 8700F,8,4.1,5.0,Zen 4,65,,5.0,259.95
7,AMD Ryzen 9 7900,12,3.6,5.4,Zen 4,65,Radeon,4.5,361.71
8,Intel Core i5-13600K,14,3.5,5.1,Raptor Lake,125,Intel UHD Graphics 770,4.5,230.0
9,AMD Ryzen 7 7800X3D,8,4.2,5.0,Zen 4,120,Radeon,,492.79


In [125]:
cooler_filtered

Unnamed: 0,title,rating,price,fan_rpm,noise_level,color,radiator_size,average_noise_level,average_fan_rpm
0,Thermalright Aqua Elite V3,4.5,54.90,1500 RPM,25.6 dB,Black,360.0,25.60,1500.0
1,Cooler Master MasterLiquid 360L Core ARGB,4.5,99.99,350 - 1750 RPM,27.2 dB,Black,360.0,27.20,1050.0
2,NZXT Kraken 360,4.5,179.99,500 - 1800 RPM,17.9 - 30.6 dB,Black,360.0,24.25,1150.0
3,NZXT Kraken 240,4.0,139.99,500 - 1800 RPM,17.9 - 30.6 dB,Black,240.0,24.25,1150.0
4,Thermalright Aqua Elite V3,5.0,44.90,1500 RPM,25.6 dB,Black,240.0,25.60,1500.0
...,...,...,...,...,...,...,...,...,...
197,In Win SR36 PRO,,164.07,500 - 2500 RPM,23 dB,Black,360.0,23.00,1500.0
198,In Win Nebula NR36,,149.00,400 - 1800 RPM,25 dB,Black / White,360.0,25.00,1100.0
199,In Win SR24,,112.00,500 - 2500 RPM,23 dB,Black,240.0,23.00,1500.0
200,In Win SR36,,130.00,500 - 2500 RPM,23 dB,Black,360.0,23.00,1500.0


In [113]:
storage_filtered

Unnamed: 0,title,rating,price,capacity,price_per_gb,type,cache,form_factor,interface,cache_gb,capacity_gb
0,Samsung 990 Pro,4.5,169.99,2 TB,0.085,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
3,Samsung 990 Pro,4.5,302.0,4 TB,0.075,SSD,4096 MB,M.2-2280,M.2 PCIe 4.0 X4,4.096,4000.0
28,Samsung 980 Pro,4.5,203.0,2 TB,0.102,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
53,Acer Predator GM7000,4.5,125.99,2 TB,0.063,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
68,Samsung 990 Pro w/Heatsink,5.0,312.99,4 TB,0.078,SSD,4096 MB,M.2-2280,M.2 PCIe 4.0 X4,4.096,4000.0
79,Samsung 990 Pro w/Heatsink,4.5,179.99,2 TB,0.09,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
160,Crucial P5 Plus,4.5,219.0,2 TB,0.109,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
178,MSI SPATIUM M480 PRO,5.0,154.99,2 TB,0.077,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
190,Gigabyte AORUS Gen4 7300,5.0,155.98,2 TB,0.078,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
277,MSI SPATIUM M480 PRO,5.0,279.99,4 TB,0.07,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,4000.0


In [114]:
memory_filtered

Unnamed: 0,title,rating,price,speed,modules,price_per_gb,color,first_word_latency,cas_latency,module_count,gb_per_module,total_ram
101,G.Skill Trident Z Neo 64 GB,4.5,169.99,DDR4-3600,4 x 16GB,2.656,Black / Silver,8.889 ns,16.0,4,16,64
159,Corsair Vengeance LPX 64 GB,4.0,129.99,DDR4-3200,4 x 16GB,2.031,Black / Yellow,10 ns,16.0,4,16,64
259,Corsair Vengeance LPX 64 GB,,137.99,DDR4-3600,4 x 16GB,2.156,Black,10 ns,18.0,4,16,64
377,Corsair Vengeance LPX 64 GB,5.0,125.99,DDR4-2666,4 x 16GB,1.969,Black / Yellow,12.003 ns,16.0,4,16,64
397,Corsair Vengeance RGB Pro SL 64 GB,5.0,167.99,DDR4-3600,4 x 16GB,2.625,Black,10 ns,18.0,4,16,64
419,Kingston FURY Beast 64 GB,5.0,136.06,DDR4-3200,4 x 16GB,2.126,Black,10 ns,16.0,4,16,64
843,Kingston ValueRAM 64 GB,,118.99,DDR4-2133,4 x 16GB,1.859,Black / Green,14.065 ns,15.0,4,16,64
1061,G.Skill FORTIS 64 GB,,150.9,DDR4-2400,4 x 16GB,2.358,Black / Red,12.5 ns,15.0,4,16,64


## Recommendation engine

In [158]:
system_prompt = """
You are a world-class hardware expert tasked with recommending a compatible and high-performance PC setup.

You are given three JSON arrays, consisting of details of CPUs, storage hard drives, and memory modules. From the list, choose only ONE component from each array, ensuring compatibility across all components that it meets the user's expectation and preference based on their input.

For each component, output the name, as well as the index number of its row.

You must only select from the given options. Do not invent anything new.
"""

class Component(BaseModel):
    index: int
    name: str 
    price: int

class ComponentChoices(BaseModel):
    cpu: Component 
    storage: Component 
    memory: Component 

message = "I want a decent PC rig that does things fast and has a lot of RAM!"

user_prompt = f"""The user inputted: {message}

Here are the component options:
CPUs: {json.dumps(cpu_filtered.reset_index().to_dict(orient="records"))}
Storage: {json.dumps(storage_filtered.reset_index().to_dict(orient="records"))}
Memory: {json.dumps(memory_filtered.reset_index().to_dict(orient="records"))}
"""


In [159]:
pc_build = client.chat.completions.create(
    model=MODEL,
    response_model=ComponentChoices,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
).model_dump()

pc_build


{'cpu': {'index': 3, 'name': 'AMD Ryzen 9 7950X3D', 'price': 700},
 'storage': {'index': 4, 'name': 'Samsung 990 Pro w/Heatsink', 'price': 313},
 'memory': {'index': 0, 'name': 'G.Skill Trident Z Neo 64 GB', 'price': 170}}

In [157]:
storage_filtered.reset_index()

Unnamed: 0,index,title,rating,price,capacity,price_per_gb,type,cache,form_factor,interface,cache_gb,capacity_gb
0,0,Samsung 990 Pro,4.5,169.99,2 TB,0.085,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
1,1,Samsung 990 Pro,4.5,302.0,4 TB,0.075,SSD,4096 MB,M.2-2280,M.2 PCIe 4.0 X4,4.096,4000.0
2,2,Samsung 980 Pro,4.5,203.0,2 TB,0.102,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
3,3,Acer Predator GM7000,4.5,125.99,2 TB,0.063,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
4,4,Samsung 990 Pro w/Heatsink,5.0,312.99,4 TB,0.078,SSD,4096 MB,M.2-2280,M.2 PCIe 4.0 X4,4.096,4000.0
5,5,Samsung 990 Pro w/Heatsink,4.5,179.99,2 TB,0.09,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
6,6,Crucial P5 Plus,4.5,219.0,2 TB,0.109,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
7,7,MSI SPATIUM M480 PRO,5.0,154.99,2 TB,0.077,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
8,8,Gigabyte AORUS Gen4 7300,5.0,155.98,2 TB,0.078,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,2000.0
9,9,MSI SPATIUM M480 PRO,5.0,279.99,4 TB,0.07,SSD,2048 MB,M.2-2280,M.2 PCIe 4.0 X4,2.048,4000.0
