In [1]:
import pandas

In [2]:
import openai

In [3]:
import os
from openai import OpenAI

def call_openai_llm(prompt, model="gpt-4", temperature=0.7):
    """
    Call OpenAI's language model with the given prompt.
    
    Args:
        prompt (str): The input prompt to send to the model
        model (str): The OpenAI model to use (default: gpt-4)
        temperature (float): Controls randomness in the output (0.0-1.0)
        
    Returns:
        str: The model's response text
    """
    try:
        # Load API key from .env file
        from dotenv import load_dotenv
        load_dotenv()
        
        # Initialize the client
        client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error calling OpenAI API: {str(e)}")
        return None


In [4]:
call_openai_llm("Hello, world!")

'Hello! How can I assist you today?'

In [5]:
import os
import hashlib
import json
import pandas as pd
from datetime import datetime

# --- Configuration --- #
class Config:
    RAW_NOTES_DIR = "raw_notes"
    ORGANIZED_NOTES_DIR = "organised_notes"
    MAPPING_CSV = "note_mapping.csv"
    
    CHAPTER_TAGS = {
        "chapter_1": "#lifestyle",
        "chapter_2": "#heart"
    }
    
    SECTION_TAGS = {"chapter_1_section_1": CHAPTER_TAGS['chapter_1'] + "-introduction",
                    "chapter_1_section_2": CHAPTER_TAGS['chapter_1'] + "-analysis",
                    "chapter_1_section_3": CHAPTER_TAGS['chapter_1'] + "-conclusion",
                    "chapter_2_section_1": CHAPTER_TAGS['chapter_2'] + "-introduction",
                    "chapter_2_section_2": CHAPTER_TAGS['chapter_2'] + "-analysis",
                    "chapter_2_section_3": CHAPTER_TAGS['chapter_2'] + "-conclusion"}
    
    ADDITIONAL_TAGS = [
        "#todo", "#thesis", "#research", "#datasets", 
        "#ideas", "#tools", "#conferences"
    ]
    
    @staticmethod
    def get_prompt_template(note_content):
        return f"""
You are a categorization assistant specialized in organizing research and PhD thesis notes.
You have a predefined folder hierarchy:
- todo
- thesis: which contains chapters (e.g., chapter_1_tag, chapter_2_tag, etc.). Inside each chapter the note can be further organized into:
    - datasets
    - research_papers
    - ideas
- general: which contains:
    - datasets
    - research_papers
    - ideas
    - tools
    - conferences

The thesis has the following chapter tags:
{json.dumps(Config.CHAPTER_TAGS, indent=2)}

And section tags:
{json.dumps(Config.SECTION_TAGS, indent=2)}

The additional tags for notes are:
{json.dumps(Config.ADDITIONAL_TAGS, indent=2)}

Analyze the following note's content. The note may contain some tags (e.g., #todo or #research) that provide hints about its content.

From the note, please decide:
1. The overall category for the note (it can be "todo", "thesis", or "general").
2. If it belongs under "thesis", assign a chapter such as "{Config.CHAPTER_TAGS['chapter_1']}" or "{Config.CHAPTER_TAGS['chapter_2']}" and, if applicable, a further subcategory (one of: "datasets", "research papers", "ideas").
3. If it belongs under "general", choose one subcategory from ("datasets", "research papers", "ideas", "tools", "conferences").
4. Optionally, update the tags using only those from the allowed set.
5. Suggest an appropriate title for the note.

Please output a JSON string with exactly these keys:
- "folder_structure": a string defining the folder path relative to the organized notes directory (for example, "thesis/chapter_1/ideas", "todo", or "general/tools").
- "file_title": a string representing the new title for the note (do not include a file extension).
- "tags": a list of strings where each string is a tag (starting with '#') from the allowed set that apply to this note.

Here is the note content:
{note_content}
"""

class FileUtils:
    @staticmethod
    def compute_file_hash(file_path):
        """Compute MD5 hash of the file content to detect changes."""
        hasher = hashlib.md5()
        with open(file_path, "rb") as f:
            buffer = f.read()
            hasher.update(buffer)
        return hasher.hexdigest()
    
    @staticmethod
    def sanitize_filename(filename):
        """Ensure filename is safe (alphanumeric and selected characters)"""
        return "".join(c for c in filename if c.isalnum() or c in (' ', '_', '-')).rstrip()

class LLMHandler:
    @staticmethod
    def get_llm_response(note_content):
        """Get categorization from OpenAI's GPT API"""
        prompt = Config.get_prompt_template(note_content)
        try:
            response = call_openai_llm(prompt, temperature=0.5)
            if response:
                try:
                    return json.loads(response)
                except json.JSONDecodeError:
                    import re
                    json_match = re.search(r'\{.*\}', response, re.DOTALL)
                    if json_match:
                        return json.loads(json_match.group())
                    raise ValueError("Unable to extract JSON from LLM response.")
        except Exception as e:
            print(f"Error in LLM API call: {e}")
            return None

class NoteWriter:
    @staticmethod
    def write_organized_note(note_content, llm_result, raw_file_path):
        """Write note to organized folder structure"""
        folder_structure = llm_result.get("folder_structure", "general")
        file_title = FileUtils.sanitize_filename(llm_result.get("file_title", "untitled"))
        
        dest_dir = os.path.join(Config.ORGANIZED_NOTES_DIR, folder_structure)
        os.makedirs(dest_dir, exist_ok=True)
        dest_file_path = os.path.join(dest_dir, f"{file_title}.md")
        
        raw_note_link = f"[[{os.path.relpath(raw_file_path, start=Config.ORGANIZED_NOTES_DIR)}]]"
        new_tags = llm_result.get("tags", [])
        tags_str = "\n".join(new_tags) if new_tags else ""
        
        organized_content = f"{tags_str}\n\n{raw_note_link}\n\n{note_content}"
        
        with open(dest_file_path, "w", encoding="utf-8") as f:
            f.write(organized_content)
        
        return dest_file_path

class MappingManager:
    @staticmethod
    def load_mapping():
        """Load CSV mapping file"""
        if os.path.exists(Config.MAPPING_CSV):
            return pd.read_csv(Config.MAPPING_CSV)
        return pd.DataFrame(columns=["raw_note_path", "organized_note_path", "file_hash", "processed_time", "llm_response"])
    
    @staticmethod
    def update_mapping(mapping_df):
        """Save mapping DataFrame to CSV"""
        mapping_df.to_csv(Config.MAPPING_CSV, index=False)
    
    @staticmethod
    def create_mapping_entry(raw_file_path, organized_note_path, file_hash, llm_result):
        """Create new mapping DataFrame entry"""
        return pd.DataFrame([{
            "raw_note_path": raw_file_path,
            "organized_note_path": organized_note_path,
            "file_hash": file_hash,
            "processed_time": datetime.now().isoformat(),
            "llm_response": json.dumps(llm_result)
        }])

class NoteProcessor:
    def __init__(self):
        self.mapping_df = MappingManager.load_mapping()
        self.processed_notes = dict(zip(self.mapping_df["raw_note_path"], self.mapping_df["file_hash"]))
    
    def process_single_note(self, raw_file_path):
        """Process a single note file"""
        file_hash = FileUtils.compute_file_hash(raw_file_path)
        
        if raw_file_path in self.processed_notes and self.processed_notes[raw_file_path] == file_hash:
            print(f"Skipping unchanged note: {raw_file_path}")
            return
            
        print(f"Processing note: {raw_file_path}")
        try:
            with open(raw_file_path, "r", encoding="utf-8") as f:
                note_content = f.read()
        except Exception as e:
            print(f"Error reading {raw_file_path}: {e}")
            return
            
        llm_result = LLMHandler.get_llm_response(note_content)
        if llm_result is None:
            print(f"LLM did not return a valid response for note: {raw_file_path}")
            return
            
        organized_note_path = NoteWriter.write_organized_note(note_content, llm_result, raw_file_path)
        print(f"Note organized to: {organized_note_path}")
        
        new_row = MappingManager.create_mapping_entry(raw_file_path, organized_note_path, file_hash, llm_result)
        
        if raw_file_path in self.processed_notes:
            self.mapping_df.loc[self.mapping_df["raw_note_path"] == raw_file_path, :] = new_row.iloc[0]
        else:
            self.mapping_df = pd.concat([self.mapping_df, new_row], ignore_index=True)
        
        MappingManager.update_mapping(self.mapping_df)
    
    def process_all_notes(self):
        """Process all notes in the raw notes directory"""
        for root, _, files in os.walk(Config.RAW_NOTES_DIR):
            for file in files:
                if file.endswith(".md"):
                    raw_file_path = os.path.join(root, file)
                    self.process_single_note(raw_file_path)

if __name__ == "__main__":
    processor = NoteProcessor()
    processor.process_all_notes()

Processing note: raw_notes/coffee thought.md
Note organized to: organised_notes/thesis/chapter_1/ideas/Coffee Consumption in Lifestyle Surveys.md
Processing note: raw_notes/intro paragraph idea.md
Note organized to: organised_notes/thesis/chapter_1_tag/ideas/Introduction Idea - WHO Smoking Statistics.md
Processing note: raw_notes/lit review note smith2023.md
Note organized to: organised_notes/thesis/chapter_2/research_papers/Review_of_Smiths_Nicotine_Dependence_Measurement_Method.md
Processing note: raw_notes/epi conf thing sept.md
Note organized to: organised_notes/general/conferences/Notes from London Epidemiology Conference.md
Processing note: raw_notes/amsterdam workshop notes.md
Note organized to: organised_notes/general/conferences/Causal Inference Workshop Notes.md
Processing note: raw_notes/!!!.md
Note organized to: organised_notes/general/research_papers/Found_Paper_Reference.md
Processing note: raw_notes/citation format.md
Note organized to: organised_notes/todo/Update Zotero