# Project: LlamaCSE
## Group Members:
### Chukwudumebi Ubogu, e-mail: ubogu@student.chalmers.se
### Francisco Alejandro Erazo Piza, e-mail: guserafr@student.gu.se
### Nils Dunlop, e-mail: gusdunlni@student.gu.se
### Yunyi Xu, e-mail: yunyix@student.chalmers.se

In [None]:
    # Import libraries
    import os
    import json
    from sentence_transformers import SentenceTransformer
    import torch
    from typing import List, Dict
    import numpy as np
    import torch
    from sentence_transformers import SentenceTransformer
    from transformers import AutoTokenizer, AutoModelForCausalLM
    import numpy as np
    from typing import List, Dict, Tuple
    from dotenv import load_dotenv

    load_dotenv()

2025-01-03 15:17:21.575413: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-03 15:17:21.599305: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735913841.640959 3817686 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735913841.653666 3817686 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-03 15:17:21.689430: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with t

True

### Create Data Embeddings

In [2]:
class CourseDataProcessor:
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)
        
    def load_json_data(self, file_path: str) -> List[Dict]:
        with open(file_path, 'r') as f:
            return json.load(f)
    
    def create_course_chunks(self, course_data: Dict) -> List[str]:
        """Create meaningful chunks from course data for embedding"""
        chunks = []
        
        # Basic course info chunk
        basic_info = f"Course {course_data['course_code']}: {course_data['course_name']} is a {course_data['credits']} credit course at {course_data['department']}. {course_data['course_content']}"
        chunks.append(basic_info)
        
        # Learning outcomes chunk
        if 'learning_outcomes' in course_data and course_data['learning_outcomes']:
            for outcome in course_data['learning_outcomes']:
                for category, details in outcome.items():
                    chunks.append(f"Learning outcome - {category}: {details}")
        
        # Requirements chunk
        if 'entry_requirements' in course_data:
            chunks.append(f"Entry requirements: {course_data['entry_requirements']}")
        
        # Teaching and assessment chunks
        if 'form_of_teaching' in course_data:
            chunks.append(f"Teaching format: {course_data['form_of_teaching']}")
        if 'assessment' in course_data:
            chunks.append(f"Assessment method: {course_data['assessment']}")
        
        return chunks
    
    def generate_embeddings(self, chunks: List[str]) -> torch.Tensor:
        """Generate embeddings for text chunks"""
        return self.model.encode(chunks, convert_to_tensor=True)
    
    def process_all_courses(self, courses_data: List[Dict]) -> Dict:
        """Process all courses and return embeddings with metadata"""
        all_chunks = []
        chunk_to_course_map = []
        
        for course in courses_data:
            course_chunks = self.create_course_chunks(course)
            all_chunks.extend(course_chunks)
            chunk_to_course_map.extend([course['course_code']] * len(course_chunks))
            
        embeddings = self.generate_embeddings(all_chunks)
        
        return {
            'embeddings': embeddings,
            'chunks': all_chunks,
            'course_map': chunk_to_course_map
        }
    
    def save_embeddings(self, embeddings_data: Dict, save_path: str):
        """Save embeddings and metadata"""
        torch.save({
            'embeddings': embeddings_data['embeddings'],
            'chunks': embeddings_data['chunks'],
            'course_map': embeddings_data['course_map']
        }, save_path)

In [3]:
processor = CourseDataProcessor()
courses = processor.load_json_data('data/json/merged_data.json')
embeddings_data = processor.process_all_courses(courses)
processor.save_embeddings(embeddings_data, 'data/embeddings/course_embeddings.pt')



### Develop RAG System

In [4]:
class CourseRAGSystem:
    def __init__(self, 
                 embedding_model_name: str = 'all-MiniLM-L6-v2',
                 llm_model_name: str = 'meta-llama/Llama-3.1-8B',
                 use_auth_token: str = os.getenv('HUGGINGFACE_TOKEN'),
                 device: str = "cuda" if torch.cuda.is_available() else "cpu"):
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.embedding_model.to(device)
        
        # Initialize tokenizer with auth token
        self.tokenizer = AutoTokenizer.from_pretrained(
            llm_model_name,
            token=use_auth_token,
            use_fast=True
        )
        
        # Initialize model with auth token and proper configuration
        self.model = AutoModelForCausalLM.from_pretrained(
            llm_model_name,
            token=use_auth_token,
            torch_dtype=torch.float16,  # Use half precision for memory efficiency
            device_map="auto",          # Automatically handle model placement
            load_in_8bit=True,         # Use 8-bit quantization to reduce memory usage
            trust_remote_code=True
        )
        
    def load_embeddings(self, path: str) -> Dict:
        """Load saved embeddings and metadata"""
        return torch.load(path)
    
    def find_relevant_chunks(self, 
                           query: str, 
                           embeddings_data: Dict,
                           top_k: int = 3) -> List[Tuple[str, str]]:
        """Find most relevant text chunks for a query"""
        # Generate embedding for the query
        query_embedding = self.embedding_model.encode(query, convert_to_tensor=True)
        
        # Calculate similarities
        similarities = torch.nn.functional.cosine_similarity(
            query_embedding.unsqueeze(0),
            embeddings_data['embeddings']
        )
        
        # Get top-k most similar chunks
        top_k_indices = torch.topk(similarities, k=top_k).indices
        
        relevant_chunks = []
        for idx in top_k_indices:
            chunk = embeddings_data['chunks'][idx]
            course_code = embeddings_data['course_map'][idx]
            relevant_chunks.append((chunk, course_code))
            
        return relevant_chunks
    
    def generate_response(self, query: str, relevant_chunks: List[Tuple[str, str]]) -> str:
        """Generate a response using the LLM"""
        context = "\n".join([f"From course {code}: {chunk}" for chunk, code in relevant_chunks])
        
        prompt = f"""[INST] You are a helpful course information assistant. Using only the following course information:

    {context}

    Answer this question: {query}

    Remember to:
    1. Only use information from the provided course data
    2. If the information isn't in the provided context, say so
    3. Be clear and concise[/INST]"""

        inputs = self.tokenizer(
            prompt, 
            return_tensors="pt", 
            truncation=True, 
            max_length=2048
        ).to(self.model.device)

        outputs = self.model.generate(
            input_ids=inputs['input_ids'],
            max_length=2048,
            num_return_sequences=1,
            temperature=0.1,
            top_p=0.95,
            do_sample=True,
            pad_token_id=self.tokenizer.pad_token_id,
            top_k=50,
            repetition_penalty=1.1
        )

        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response
    
    def query(self, question: str, embeddings_data: Dict) -> str:
        """Complete RAG pipeline"""
        relevant_chunks = self.find_relevant_chunks(question, embeddings_data)
        response = self.generate_response(question, relevant_chunks)
        return response

In [5]:
rag_system = CourseRAGSystem()
embeddings_data = rag_system.load_embeddings('data/embeddings/course_embeddings.pt')

# Test
question = "What are the prerequisites for the Computer Security course?"
response = rag_system.query(question, embeddings_data)
print(response)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  return torch.load(path)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[INST] You are a helpful course information assistant. Using only the following course information:

    From course DIT642: Course DIT642: Computer Security is a 7.5 credit course at Department of Computer Science and Engineering. The course gives basic knowledge in the security area, i.e. how to protect your system against intrusions and attacks. The purpose of intrusions can be to change or delete resources (data, programs, hardware, etc), to get unauthorized access to confidential information or unauthorized use of the system's services. The course covers threats and vulnerabilities in the computer systems and networks, as well as rules, methods and mechanisms for protection. Modeling and assessment of security and dependability as well as metrication methods are covered. A holistic security approach is taken and organizational, business-related, social, human, legal and ethical aspects are treated. The following topics will be covered, among others. Introduction to computer securi