In [6]:
import os
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, load_index_from_storage, Document, Settings
from llama_index.llms.openai import OpenAI
from llama_index.core import ServiceContext
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel
from llama_index.core.prompts.prompts import SimpleInputPrompt
import torch
import nest_asyncio
nest_asyncio.apply()

In [7]:
# os.environ['OPENAI_API_KEY'] = ''
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_path = '../Model/tinylmma-1b'
model_name_litteral = "tinylmma-1b"

In [8]:
docs = SimpleDirectoryReader('../Summarizer result/').load_data()

In [9]:
llm = OpenAI(temperature=1, model="gpt-3.5-turbo-16k")
# service_context = ServiceContext.from_defaults(llm=llm)

In [10]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(
    model_name="google-bert/bert-base-uncased", 
    device="cuda",
    max_length=2048
)

No sentence-transformers model found with name google-bert/bert-base-uncased. Creating a new one with MEAN pooling.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
Settings.embed_model = embed_model # Load using embended model that has been defined
storage_context = StorageContext.from_defaults(persist_dir="../VectorizedData/bart-base/" ) #isi dengan directory tempat dataset
index = load_index_from_storage(storage_context)

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt

system_prompt = """# Flan Model
- The Flan Model is a multifunctional language model engineered for a range of applications, including gaming-related tasks.
- This model excels at managing a variety of queries and producing precise, contextually appropriate responses in various settings, including gaming platforms such as Steam.
- Flan Model utilizes an advanced architecture customized for optimal performance in natural language processing, guaranteeing swift text processing and rapid response times in gaming interactions.
"""
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

query_wrapper_prompt = SimpleInputPrompt("{query_str}")

# Initialize the HuggingFaceLLM
localLLM = HuggingFaceLLM(
    context_window=450, 
    max_new_tokens=100,
    system_prompt=system_prompt,
    generate_kwargs={"temperature": 0.2, "do_sample": True},
    model_name=model_name,
    model=model,
    tokenizer=tokenizer,
    tokenizer_kwargs={"max_length": 750, "truncation": True},
    model_kwargs={"torch_dtype": torch.float32, "pad_token_id": tokenizer.pad_token_id, "device": "cuda"},
)

print("mask_token_id:", tokenizer.mask_token_id)
print("sep_token_id:", tokenizer.sep_token_id)
print("pad_token_id:", tokenizer.pad_token_id)
print("eos_token_id:", tokenizer.eos_token_id)
print("cls_token_id:", tokenizer.cls_token_id)
query_engine = index.as_query_engine(similarity_top_k=3, llm=localLLM)

The model `google/flan-t5-large` and tokenizer `StabilityAI/stablelm-tuned-alpha-3b` are different, please ensure that they are compatible.


mask_token_id: None
sep_token_id: None
pad_token_id: 0
eos_token_id: 1
cls_token_id: None


BatchEvalRunner

In [13]:
from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    CorrectnessEvaluator,
)
from llama_index.core.evaluation import DatasetGenerator

In [14]:
faithfulness = FaithfulnessEvaluator(llm=localLLM)
relevancy = RelevancyEvaluator(llm=localLLM)
correctness = CorrectnessEvaluator(llm=localLLM)

In [15]:
dataset_generator = DatasetGenerator.from_documents(docs, llm=localLLM)

qas = dataset_generator.generate_dataset_from_nodes(num=3)

  return cls(
  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [16]:
embeddings = embed_model.get_text_embedding("hi abc")
print(f"Dimension of embeddings: {len(embeddings)}")

Dimension of embeddings: 768


In [17]:
from llama_index.core.evaluation import BatchEvalRunner

runner = BatchEvalRunner(
    {"faithfulness": faithfulness, "relevancy": relevancy},
    workers=8,
)

eval_results = await runner.aevaluate_queries(
    index.as_query_engine(similarity_top_k=3, llm=localLLM), queries=qas.questions
)


In [18]:
def get_eval_results(key, eval_results):
    results = eval_results[key]
    correct = 0
    for result in results:
        if result.passing:
            correct += 1
    score = correct / len(results)
    print(f"{key} Score: {score}")
    return score

Correcteness Test

In [19]:
from llama_index.core.evaluation import CorrectnessEvaluator

In [20]:
llmGPT = OpenAI(temperature=1, model="gpt-3.5-turbo-16k")
evaluator = CorrectnessEvaluator(llm=llmGPT)

In [21]:
queries = [
    (
        "Can you describe the gameplay of Stardew Valley in detail?",
        """
        In Stardew Valley, players inherit a run-down farm and must restore it by planting crops, raising animals, mining, fishing, and crafting. They also interact with the local townspeople, build relationships, and participate in seasonal events. The game combines farming simulation with social elements and offers a variety of activities to engage in throughout the in-game year.
        """
    ),
    (
        "What is the storyline of Doki Doki Literature Club?",
        """
        Doki Doki Literature Club starts as a lighthearted dating sim where you join a high school literature club and interact with four female members. However, the game takes a dark turn, revealing psychological horror elements as it progresses. The story delves into themes of mental illness, manipulation, and the breaking of the fourth wall.
        """
    ),
    (
        "What is the main gameplay of Celeste?",
        """
        Celeste is a platformer game where players control a character named Madeline as she climbs a mountain. The gameplay focuses on precise jumping, dashing, and climbing mechanics, with challenging levels designed to test the player's skills. The game also explores themes of mental health, perseverance, and self-discovery through its narrative.
        """
    ),
    (
        "What is the basic premise of Hollow Knight?",
        """
        Hollow Knight is an action-adventure game set in the mysterious, underground kingdom of Hallownest. Players control a silent, insect-like knight who explores a vast, interconnected world filled with secrets, enemies, and powerful bosses. The game features tight platforming, combat, and an emphasis on exploration and discovery.
        """
    ),
    (
        "What is the gameplay style of Hades?",
        """
        Hades is a rogue-like dungeon crawler where players control Zagreus, the son of Hades, as he attempts to escape the Underworld. The game features fast-paced combat, with a variety of weapons and abilities to choose from. Each escape attempt is procedurally generated, offering unique challenges and rewards. The game also includes a strong narrative element, with characters and storylines that develop over multiple runs.
        """
    ),
    (
        "What is the unique aspect of Undertale's gameplay?",
        """
        Undertale is an RPG where players control a child who has fallen into the Underground, a world filled with monsters. The game is known for its unique combat system, which allows players to choose between fighting or peacefully resolving conflicts with enemies. The choices players make significantly impact the game's story and outcomes, leading to multiple possible endings.
        """
    ),
    (
        "What makes The Legend of Zelda: Breath of the Wild unique?",
        """
        The Legend of Zelda: Breath of the Wild is an open-world action-adventure game set in the kingdom of Hyrule. Players control Link as he explores a vast, open world filled with diverse landscapes, enemies, and puzzles. The game emphasizes freedom and player choice, allowing players to tackle challenges in any order and experiment with different strategies and solutions. The game also features a dynamic weather system and physics-based interactions.
        """
    ),
    (
        "What is the gameplay experience of Dark Souls like?",
        """
        Dark Souls is an action RPG known for its challenging difficulty and deep lore. Players control a customizable character who explores a dark, interconnected world filled with deadly enemies and bosses. Combat is deliberate and requires precise timing and strategy. The game also features a unique multiplayer system where players can leave messages, assist, or invade other players' worlds.
        """
    ),
    (
        "What is the main focus of The Witcher 3: Wild Hunt?",
        """
        The Witcher 3: Wild Hunt is an open-world RPG that follows Geralt of Rivia, a monster hunter, as he searches for his adopted daughter, Ciri. The game features a vast, detailed world filled with quests, monsters, and characters. The story is highly immersive, with branching narratives and meaningful choices that affect the outcome. Combat involves a mix of swordplay, magic, and alchemy.
        """
    ),
    (
        "What is the gameplay of Portal 2?",
        """
        Portal 2 is a first-person puzzle-platform game where players control Chell, a test subject in the Aperture Science facility. The gameplay revolves around using a portal gun to create linked portals on surfaces, solving puzzles, and navigating through the facility. The game features a single-player campaign with a rich narrative and a cooperative multiplayer mode with unique puzzles designed for two players.
        """
    ),
]
dataset = [
    {"query": "What do players do in 'Stardew Valley'?", "ground_truth": "Players farm, raise animals, and make friends with townsfolk."},
    {"query": "What is surprising about 'Doki Doki Literature Club'?", "ground_truth": "It turns from a dating sim into a psychological horror game."},
    {"query": "What can players choose to do instead of fighting in 'Undertale'?", "ground_truth": "Players can choose to befriend enemies."},
    {"query": "What do players love about 'Hades'?", "ground_truth": "Players love the exciting rogue-like gameplay and interesting story."},
    {"query": "How is the world in 'The Witcher 3: Wild Hunt' described?", "ground_truth": "The world is detailed and immersive."},
    {"query": "What is challenging in 'Celeste'?", "ground_truth": "The platforming levels are very challenging."},
    {"query": "What do players often praise in 'Hollow Knight'?", "ground_truth": "Players praise the detailed levels and deep story."},
    {"query": "What tool is central to 'Portal'?", "ground_truth": "The portal gun is central to solving puzzles."},
    {"query": "What do players like about 'Slay the Spire'?", "ground_truth": "Players like the strategy of building and using card decks."},
    {"query": "What is special about 'Disco Elysium'?", "ground_truth": "It has a deep story with lots of choices."}
]

In [25]:
CorrectScore = 0.0
for i in range(10):
    response = query_engine.query(queries[i][0])
    result = evaluator.evaluate(
        query = queries[i][0],
        response=str(response),
        reference=queries[i][1]
    )
    print(result.score)
    CorrectScore+=result.score
CorrectScore = CorrectScore/10.0

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


Score Results

In [None]:
import numpy as np
from collections import Counter

def calculate_exact_match(response, ground_truth):
    return int(response.strip().lower() == ground_truth.strip().lower())

def calculate_f1(response, ground_truth):
    response_tokens = response.lower().split()
    ground_truth_tokens = ground_truth.lower().split()
    common = Counter(response_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())

    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(response_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)

    return f1

def evaluate_query_engine(query_engine, dataset):
    # Inisialisasi list untuk menyimpan skor
    em_scores = []  # Exact Match Scores
    f1_scores = []  # F1 Scores
    recall_scores = []  # Recall Scores
    precision_scores = []  # Precision Scores

    # Iterasi melalui setiap data di dataset
    for data in dataset:
        query = data['query']  # Ambil query dari dataset
        ground_truth = data['ground_truth']  # Ambil ground truth dari dataset

        # Dapatkan respons dari query engine
        response_obj = query_engine.query(query)

        # Ekstrak teks dari objek respons
        # Modifikasi baris ini sesuai dengan cara objek respons menyimpan teks
        response_text = response_obj.text if hasattr(response_obj, 'text') else str(response_obj)

        # Hitung skor Exact Match
        em = calculate_exact_match(response_text, ground_truth)
        # Hitung skor F1
        f1 = calculate_f1(response_text, ground_truth)

        # Hitung skor Recall
        recall = len(set(response_text) & set(ground_truth)) / len(set(ground_truth))
        # Hitung skor Precision
        precision = len(set(response_text) & set(ground_truth)) / len(set(response_text)) if response_text else 0

        # Tambahkan skor ke list masing-masing
        em_scores.append(em)
        f1_scores.append(f1)
        recall_scores.append(recall)
        precision_scores.append(precision)

    # Hitung rata-rata untuk setiap skor
    avg_em = np.mean(em_scores)
    avg_f1 = np.mean(f1_scores)
    avg_recall = np.mean(recall_scores)
    avg_precision = np.mean(precision_scores)

    # Kembalikan rata-rata skor
    return avg_em, avg_f1, avg_recall, avg_precision


avg_em, avg_f1, avg_recall, avg_precision = evaluate_query_engine(query_engine, dataset)
# Sekarang, cetak nilai-nilai tersebut
print(f"Average Exact Match (EM): {avg_em}")
print(f"Average F1 Score: {avg_f1}")
print(f"Average Recall: {avg_recall}")
print(f"Average Precision: {avg_precision}")


Average Exact Match (EM): 0.0
Average F1 Score: 0.10992676191317956
Average Recall: 0.9150356282864024
Average Precision: 0.6487986488473867


In [26]:
score = get_eval_results("faithfulness", eval_results)
score = get_eval_results("relevancy", eval_results)
print("correctness score : ",CorrectScore )

faithfulness Score: 0.0
relevancy Score: 0.0
correctness score :  1.0
