# Project: LlamaCSE
## Group Members:
### Chukwudumebi Ubogu, e-mail: ubogu@student.chalmers.se
### Francisco Alejandro Erazo Piza, e-mail: guserafr@student.gu.se
### Nils Dunlop, e-mail: gusdunlni@student.gu.se
### Yunyi Xu, e-mail: yunyix@student.chalmers.se

In [1]:
# Import libraries
import json
from sentence_transformers import SentenceTransformer
import torch
from typing import List, Dict
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
from typing import List, Dict, Tuple

2025-01-03 14:05:12.412763: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-03 14:05:12.436755: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735909512.478603 3800488 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735909512.491374 3800488 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-03 14:05:12.527353: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with t

### Create Data Embeddings

In [2]:
class CourseDataProcessor:
    """Turn course records into text chunks and sentence embeddings.

    The processor wraps a ``SentenceTransformer`` model and offers the steps
    of the pipeline as separate methods: load course records from JSON,
    split each record into short descriptive text chunks, embed the chunks,
    and persist the embeddings together with their metadata.
    """

    # Optional course fields that each yield one chunk, paired with the label
    # that prefixes the chunk text. Order here is the order of the chunks.
    _OPTIONAL_FIELD_LABELS = (
        ('entry_requirements', 'Entry requirements'),
        ('form_of_teaching', 'Teaching format'),
        ('assessment', 'Assessment method'),
    )

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the sentence-embedding model.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)

    def load_json_data(self, file_path: str) -> List[Dict]:
        """Read and parse a JSON file.

        Args:
            file_path: Path of the JSON file to read.

        Returns:
            The parsed JSON content (annotated as a list of course dicts).
        """
        # JSON is UTF-8 by specification; without an explicit encoding,
        # open() falls back to the locale's default, which can mis-decode or
        # reject non-ASCII course text on some platforms.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def create_course_chunks(self, course_data: Dict) -> List[str]:
        """Create meaningful chunks from course data for embedding.

        Args:
            course_data: One course record. The keys ``course_code``,
                ``course_name``, ``credits``, ``department`` and
                ``course_content`` are required. ``learning_outcomes``,
                ``entry_requirements``, ``form_of_teaching`` and
                ``assessment`` are optional.

        Returns:
            The chunks in a fixed order: the basic course summary, one chunk
            per learning-outcome entry, then one chunk for each optional
            field that is present in ``course_data``.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        # Basic course info chunk
        basic_info = f"Course {course_data['course_code']}: {course_data['course_name']} is a {course_data['credits']} credit course at {course_data['department']}. {course_data['course_content']}"
        chunks = [basic_info]

        # Learning outcome chunks: every item of the list is iterated as a
        # mapping of category -> details. A missing, None or empty value is
        # skipped entirely.
        for outcome in course_data.get('learning_outcomes') or ():
            for category, details in outcome.items():
                chunks.append(f"Learning outcome - {category}: {details}")

        # Requirements, teaching and assessment chunks. A field contributes a
        # chunk whenever its key is present, whatever its value.
        for key, label in self._OPTIONAL_FIELD_LABELS:
            if key in course_data:
                chunks.append(f"{label}: {course_data[key]}")

        return chunks

    def generate_embeddings(self, chunks: List[str]) -> torch.Tensor:
        """Generate embeddings for text chunks.

        Args:
            chunks: Texts to embed.

        Returns:
            The result of ``self.model.encode`` called with
            ``convert_to_tensor=True``.
        """
        return self.model.encode(chunks, convert_to_tensor=True)

    def process_all_courses(self, courses_data: List[Dict]) -> Dict:
        """Process all courses and return embeddings with metadata.

        Args:
            courses_data: Course records accepted by
                ``create_course_chunks``; each must also carry
                ``course_code``.

        Returns:
            A dict with three entries: ``'embeddings'`` (the embeddings of
            all chunks), ``'chunks'`` (the chunk texts) and ``'course_map'``
            (for each chunk, the ``course_code`` of the course it came
            from). ``'chunks'`` and ``'course_map'`` have the same length
            and order.
        """
        all_chunks = []
        chunk_to_course_map = []

        for course in courses_data:
            course_chunks = self.create_course_chunks(course)
            all_chunks.extend(course_chunks)
            # One course code per chunk keeps the map aligned with all_chunks.
            chunk_to_course_map.extend([course['course_code']] * len(course_chunks))

        # All chunks are embedded in a single encode() call.
        embeddings = self.generate_embeddings(all_chunks)

        return {
            'embeddings': embeddings,
            'chunks': all_chunks,
            'course_map': chunk_to_course_map
        }

    def save_embeddings(self, embeddings_data: Dict, save_path: str):
        """Save embeddings and metadata.

        Args:
            embeddings_data: Dict with the keys ``'embeddings'``,
                ``'chunks'`` and ``'course_map'``; other keys are ignored.
            save_path: Destination handed to ``torch.save``. When it is a
                filesystem path, missing parent directories are created
                first.

        Raises:
            KeyError: If one of the three expected keys is missing.
        """
        import os

        # Build the payload first so a malformed input fails before anything
        # is created on disk.
        payload = {
            'embeddings': embeddings_data['embeddings'],
            'chunks': embeddings_data['chunks'],
            'course_map': embeddings_data['course_map']
        }

        # torch.save does not create directories; make sure the target
        # directory exists so computed embeddings are not lost to a missing
        # folder. Non-path targets (e.g. file-like objects) are passed
        # through untouched.
        if isinstance(save_path, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(save_path))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        torch.save(payload, save_path)

In [3]:
# Build the processor with its default embedding model.
processor = CourseDataProcessor()

# Load the merged course records from disk.
courses = processor.load_json_data(file_path='data/json/merged_data.json')

# Chunk every course and embed all chunks.
embeddings_data = processor.process_all_courses(courses_data=courses)

# Persist the embeddings together with their chunk texts and course map.
processor.save_embeddings(
    embeddings_data=embeddings_data,
    save_path='data/embeddings/course_embeddings.pt',
)

