In [None]:
import json, mimetypes
from pathlib import Path
import fitz          # PyMuPDF – imported unconditionally; preview_file uses it to read PDF text
# Folder whose top-level files are listed, previewed and later regrouped into sub-folders.
DOCUMENTS_DIR = Path.home() / "Documents"


# Extensions the organizer is willing to process (the name is kept as-is for
# compatibility). Entries are matched case-insensitively by list_top_level_files.
excepted_extensions = [".pdf", ".txt", ".py", ".md", ".ipynb", ".Rmd", ".R", ".r", ".csv", ".xlsx", ".xls", ".docx", ".pptx", ".zip", ".json"]


def list_top_level_files(directory: Path):
    """
    Return Path objects for files directly inside `directory` (no sub-folders).

    A file is kept when it is a regular file, its name does not start with a
    dot, and its suffix matches one of `excepted_extensions` regardless of
    letter case. Nothing is returned when `directory` itself is a dot-folder.

    Args:
        directory: Folder to scan; a plain string path is accepted too.

    Returns:
        List of matching paths, in the order the file system yields them.
    """
    directory = Path(directory)

    # Lower-case both sides: the list contains mixed-case entries such as
    # ".Rmd", which a lower-cased suffix could otherwise never equal.
    allowed_suffixes = {ext.lower() for ext in excepted_extensions}

    # Every entry's parent is `directory`, so this check is the same for all of them.
    in_hidden_dir = directory.name.startswith('.')

    return [
        entry for entry in directory.iterdir()
        if not in_hidden_dir
        and entry.is_file()
        and not entry.name.startswith('.')
        and entry.suffix.lower() in allowed_suffixes
    ]

# Suffixes that are always read as plain text. `mimetypes` reports some of
# these as non-text (".json" -> application/json) or does not know them at
# all (".R", ".Rmd"), and its answers can vary by platform.
_TEXT_SUFFIXES = frozenset({".txt", ".py", ".md", ".rmd", ".r", ".csv", ".json"})


def preview_file(path: Path, *, max_chars: int = 2000) -> str:
    """
    Return a textual preview for any file.

    • .ipynb → "[Markdown]" / "[Code]" tagged cell sources, joined by blank lines
    • known text suffixes or text/* MIME type → first `max_chars` characters
    • .pdf or application/pdf → text of the first page
    • everything else → "[<mime or 'unknown'> file]" placeholder

    Args:
        path: File to preview.
        max_chars: Maximum number of characters returned (keyword-only).

    Returns:
        The preview text. Any exception raised while reading is reported as
        "[Error reading file: ...]" instead of propagating, so one bad file
        does not abort a whole scan.
    """
    try:
        path = Path(path)
        suffix = path.suffix.lower()

        # Jupyter notebooks: collect markdown and code cells only.
        if suffix == ".ipynb":
            with open(path, "r", encoding="utf-8") as f:
                notebook = json.load(f)

            pieces = []
            for cell in notebook.get("cells", []):
                cell_type = cell.get("cell_type")
                if cell_type in ("markdown", "code"):
                    tag = "[Markdown]" if cell_type == "markdown" else "[Code]"
                    # "source" may be a list of lines or a single string; join handles both.
                    content = "".join(cell.get("source", "")).strip()
                    pieces.append(f"{tag}\n{content}")
            return "\n\n".join(pieces)[:max_chars]

        # MIME type is a secondary signal for everything else.
        mime, _ = mimetypes.guess_type(str(path))

        # Plain text: trust the suffix first, then the MIME guess.
        if suffix in _TEXT_SUFFIXES or (mime and mime.startswith("text")):
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                # read(max_chars) stops early so large files are not loaded whole;
                # the slice keeps the original result for a negative max_chars.
                return f.read(max_chars)[:max_chars]

        # PDF: first page only.
        if suffix == ".pdf" or mime == "application/pdf":
            with fitz.open(path) as doc:
                return doc[0].get_text()[:max_chars]

        # Fallback placeholder for binary / unsupported formats.
        return f"[{mime or 'unknown'} file]"

    except Exception as exc:
        # Deliberate best-effort: report the problem as the preview text.
        return f"[Error reading file: {exc}]"

if __name__ == "__main__":
    # Report each accepted top-level file: name, full path, 500-character preview.
    for f in list_top_level_files(DOCUMENTS_DIR):
        print("\n".join((f"📄 {f.name}", f"   Path: {f}")))
        print(f"   Preview:\n{preview_file(f, max_chars=500)}")
        print("-" * 70)


📄 .RData
   Path: C:\Users\gabel\Documents\.RData
   Preview:
[unknown file]
----------------------------------------------------------------------
📄 .Rhistory
   Path: C:\Users\gabel\Documents\.Rhistory
   Preview:
[unknown file]
----------------------------------------------------------------------
📄 04.12.2023_22.06.45_REC.mp4
   Path: C:\Users\gabel\Documents\04.12.2023_22.06.45_REC.mp4
   Preview:
[video/mp4 file]
----------------------------------------------------------------------
📄 6824.txt
   Path: C:\Users\gabel\Documents\6824.txt
   Preview:
6/8/24

I hate my current situation, my life. I feel as though my purpose is nothing and anything I do I can't seem to get a grasp on, as if I am too stupid to understand anything. I watch YouTube over's lives, whilst my own is stagnant and watching it pass by me, and because I choose to do nothing I feel as though life has become this meaningless blob of existence where I see everyone else exist except for myself, a form of depersonali

In [None]:
"""
Phase 2 – scan top-level files in ~/Documents, preview each, and call the OpenAI
chat model (gpt-4o-mini) to get
  • 2-3-sentence summary
  • 2-3 category tags
  • concise suggested title (no extension)

Outputs a list of dicts, one per file, e.g.

{
    "filename": "Doc1.pdf",
    "filepath": "/Users/you/Documents/Doc1.pdf",
    "filetype": "pdf",
    "mime_type": "application/pdf",
    "preview": "First 2000 chars …",
    "summary": "…",
    "suggested_title": "2023_Federal_Tax_Return",
    "suggested_tags": ["Finance", "Taxes"]
}
"""
import os, json, mimetypes, shutil
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI

# Location of the .env file holding OPENAI_API_KEY. Built from the home
# directory so it is not tied to one machine's user name.
dotenv_path = Path.home() / "Documents" / "File Organizer AI" / ".env"

# Fail early with a clear message when the file is missing.
if not os.path.exists(dotenv_path):
    raise FileNotFoundError("Please create a .env file with your OpenAI API key.")

# Load exactly the file checked above. A bare load_dotenv() searches for a
# .env on its own and may not pick up this one, so the path is passed explicitly.
load_dotenv(dotenv_path=dotenv_path)

# The key must now be present in the environment.
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("Please set the OPENAI_API_KEY environment variable in your .env file.")

# Shared OpenAI client used by the summarising and grouping functions.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

#DOCUMENTS_DIR = Path.home() / "Documents"

def _parse_ai_reply(raw_reply) -> dict:
    """Convert the model's raw reply into a dict with exactly the three expected keys."""
    text = (raw_reply or "").strip()

    # Try the reply as-is first, then the outermost {...} span, which copes
    # with ```json fences or stray prose around the object.
    candidates = [text]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    parsed = None
    for candidate in candidates:
        try:
            loaded = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(loaded, dict):
            parsed = loaded
            break

    if parsed is None:
        # Fallback: keep the whole reply as the summary.
        return {"summary": text, "suggested_title": "", "suggested_tags": []}

    tags = parsed.get("suggested_tags")
    if tags is None:
        tags = []
    elif not isinstance(tags, (list, tuple)):
        tags = [tags]

    # Only the three documented keys are returned, so unexpected keys in the
    # reply cannot overwrite other fields when the result is merged into a record.
    return {
        "summary": str(parsed.get("summary") or ""),
        "suggested_title": str(parsed.get("suggested_title") or ""),
        # Tags are later joined with ", ", so every tag must be a string.
        "suggested_tags": [str(tag) for tag in tags],
    }


def gpt_summarize_and_tag(text: str) -> dict:
    """
    Ask the chat model (gpt-4o-mini) for a summary, title and tags for `text`.

    Args:
        text: File content or preview to describe.

    Returns:
        A dict that always has the keys 'summary' (str), 'suggested_title'
        (str) and 'suggested_tags' (list of str). When the reply is not a
        JSON object, the whole reply becomes 'summary' and the other two
        values are empty.
    """
    system = (
        "You are an assistant that summarizes files and suggests filenames & tags. "
        "Always reply **only** with valid JSON having keys "
        "'summary' (2-3 sentences), "
        "'suggested_title' (concise filename, no extension, underscores OK), "
        "'suggested_tags' (list of 2-3 short category strings)."
    )
    user = f"Here is the file content or preview:\n{text}"
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.3,
        messages=[{"role": "system", "content": system},
                  {"role": "user", "content": user}],
    )
    # The message content can be None, so parsing is delegated to a helper
    # that tolerates missing, fenced or malformed replies.
    return _parse_ai_reply(resp.choices[0].message.content)

def build_file_record(path: Path) -> dict:
    """
    Assemble the Phase 3 input record for a single file.

    The record holds the file's name, full path, lower-cased extension
    (without the dot), guessed MIME type ("" when unknown) and preview,
    followed by whatever keys the AI helper returns; AI keys win on a clash.
    """
    snippet = preview_file(path)
    ai_fields = gpt_summarize_and_tag(snippet)
    guessed_mime = mimetypes.guess_type(str(path))[0]

    file_fields = {
        "filename": path.name,
        "filepath": str(path),
        "filetype": path.suffix.lstrip(".").lower(),
        "mime_type": guessed_mime or "",
        "preview": snippet,
    }
    # AI output goes last so it merges in summary / suggested_title / suggested_tags.
    return {**file_fields, **ai_fields}

if __name__ == "__main__":
    # Results file for Phase 3. It lives inside the scanned folder and has an
    # accepted extension (.json), so it must be excluded from the scan below.
    out_path = DOCUMENTS_DIR / "phase2_file_info.json"

    # Build one record per file. Skipping out_path keeps a previous run's
    # output from being summarised and, later, moved along with the documents.
    file_info_list = [
        build_file_record(p)
        for p in list_top_level_files(DOCUMENTS_DIR)
        if p != out_path
    ]

    # Pretty-print to console
    for info in file_info_list:
        print(json.dumps(info, indent=2))
        print("-" * 70)

    # Persist to JSON for Phase 3
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(file_info_list, f, indent=2)
    print(f"\n✔︎ Saved structured data for {len(file_info_list)} files → {out_path}")


{
  "filename": ".RData",
  "filepath": "C:\\Users\\gabel\\Documents\\.RData",
  "filetype": "",
  "mime_type": "",
  "preview": "[unknown file]",
  "summary": "The file content is unknown, and therefore no specific details can be provided. Further information is needed to summarize the content accurately.",
  "suggested_title": "unknown_file_content",
  "suggested_tags": [
    "unknown",
    "file",
    "content"
  ]
}
----------------------------------------------------------------------
{
  "filename": ".Rhistory",
  "filepath": "C:\\Users\\gabel\\Documents\\.Rhistory",
  "filetype": "",
  "mime_type": "",
  "preview": "[unknown file]",
  "summary": "The file content is not available for review or summarization. Please provide a valid file or content for analysis.",
  "suggested_title": "unknown_file",
  "suggested_tags": [
    "unknown",
    "file",
    "content"
  ]
}
----------------------------------------------------------------------
{
  "filename": "04.12.2023_22.06.45_REC.mp

In [None]:
#Phase 3 – move files to new location
import json
# Read the Phase 2 records from the location Phase 2 writes them to
# (DOCUMENTS_DIR / "phase2_file_info.json") rather than a path hard-coded
# to one user's machine.
with open(DOCUMENTS_DIR / "phase2_file_info.json", "r", encoding="utf-8") as f:
    files_info = json.load(f)

def suggest_file_groups(files_info):
    """
    Ask the chat model to propose folders and assign each file to one.

    Args:
        files_info: List of record dicts; 'filename', 'summary' and
            'suggested_tags' are read from each. Missing values are treated
            as empty so one incomplete record does not abort the request.

    Returns:
        The model's raw reply text, requested in the form
        "Groups:\\n- FolderName: [1, 3, 5]" with 1-based file numbers.
        An empty string is returned, without calling the API, when
        `files_info` is empty.
    """
    if not files_info:
        return ""

    def describe(position, record):
        # Tags are joined as text, so coerce a lone value or non-strings first.
        tags = record.get("suggested_tags") or []
        if not isinstance(tags, (list, tuple)):
            tags = [tags]
        tag_text = ", ".join(str(tag) for tag in tags)
        filename = record.get("filename", "")
        summary = record.get("summary", "")
        return f"{position}. {filename} — {summary} [Tags: {tag_text}]"

    file_descriptions = "\n".join(
        describe(position, record)
        for position, record in enumerate(files_info, start=1)
    )
    prompt = (
        "Here are files and their summaries:\n\n"
        f"{file_descriptions}\n\n"
        "Please suggest 2–5 folder names to group these files by purpose or topic. "
        "Then, assign each file number to a folder. Output format:\n\n"
        "Groups:\n- FolderName1: [1, 3, 5]\n- FolderName2: [2, 4]"
    )

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

# Sends the file summaries to the model (one API call); keeps the raw reply text for parsing.
suggested_folders = suggest_file_groups(files_info)


1. .RData — The file content is unknown, and therefore no specific details can be provided. Further information is needed to summarize the content accurately. [Tags: unknown, file, content]
2. .Rhistory — The file content is not available for review or summarization. Please provide a valid file or content for analysis. [Tags: unknown, file, content]
3. 04.12.2023_22.06.45_REC.mp4 — The file is a video in MP4 format. It may contain visual and audio content suitable for various purposes such as entertainment, education, or documentation. [Tags: video, mp4, media]
4. 6824.txt — The writer expresses deep feelings of despair and stagnation in life, feeling disconnected and overwhelmed by their current situation. They reflect on their struggles with addiction, family sadness, and the recent loss of a beloved pet, Charlotte. This emotional turmoil leads to a sense of searching for purpose and tranquility amidst chaos. [Tags: mental_health, personal_reflection, grief]
5. best_model.pth — The f

In [None]:
import re
from collections import defaultdict

def parse_group_response(response_text):
    """
    Parse the model's grouping reply into {folder name: [file indices]}.

    Lines of the form "- FolderName: [1, 3, 5]" are recognised; "*" or "•"
    bullets and markdown bold/backticks around the name are tolerated. All
    other lines are ignored.

    Args:
        response_text: Raw reply text (None is treated as empty).

    Returns:
        defaultdict(list) mapping each folder name to zero-based indices.
        File numbers below 1 are dropped, because converting them would give
        a negative index that silently selects from the end of the list. A
        file number that appears more than once is kept only for the first
        folder that lists it.
    """
    folder_map = defaultdict(list)
    assigned = set()

    for line in (response_text or "").splitlines():
        match = re.match(r"[-*•]\s*(.+?)\s*:\s*\[(.*?)\]", line.strip())
        if not match:
            continue

        folder = match.group(1).strip("*` ")
        if not folder:
            continue

        # Pull out digit runs instead of splitting on commas, so a trailing
        # comma or stray text inside the brackets cannot raise ValueError.
        for number in re.findall(r"\d+", match.group(2)):
            index = int(number) - 1  # reply is 1-based, list indices are 0-based
            if index < 0 or index in assigned:
                continue
            assigned.add(index)
            folder_map[folder].append(index)

    return folder_map

# Folder name -> list of indices into files_info, parsed from the model's reply text.
folder_map = parse_group_response(suggested_folders)


def move_files_to_groups(files_info, group_map, base_dir=None):
    """
    Move each listed file into a sub-folder named after its group.

    Args:
        files_info: List of record dicts; only 'filepath' is read.
        group_map: Mapping of folder name -> iterable of indices into
            `files_info`.
        base_dir: Folder in which the group folders are created. Defaults to
            DOCUMENTS_DIR when omitted.

    Returns:
        None. One "Moved ..." or "Skipped ..." line is printed per entry.

    Folder names come from model output, so characters that are invalid in
    file names (including path separators) are replaced with "_" and
    surrounding dots and spaces are removed; this keeps every group folder
    directly inside the base folder. Entries are skipped, not raised on,
    when the index is out of range, the file was already moved, the source
    no longer exists, or the destination already exists (nothing is
    overwritten).
    """
    root = Path(base_dir) if base_dir is not None else DOCUMENTS_DIR
    moved_indices = set()

    for folder, indices in group_map.items():
        safe_folder = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", str(folder)).strip(" .")
        if not safe_folder:
            print(f"Skipped group {folder!r}: not a usable folder name")
            continue

        target_dir = root / safe_folder
        target_dir.mkdir(parents=True, exist_ok=True)

        for i in indices:
            # Reject anything that is not a valid non-negative position; a
            # negative index would otherwise pick a file from the end of the list.
            if not isinstance(i, int) or not 0 <= i < len(files_info):
                print(f"Skipped index {i}: no such entry in files_info")
                continue
            if i in moved_indices:
                print(f"Skipped index {i}: already moved to another folder")
                continue

            src_path = Path(files_info[i]["filepath"])
            dest_path = target_dir / src_path.name

            if not src_path.is_file():
                print(f"Skipped {src_path.name}: source file not found")
                continue
            if dest_path.exists():
                print(f"Skipped {src_path.name}: {dest_path} already exists")
                continue

            shutil.move(str(src_path), str(dest_path))
            moved_indices.add(i)
            print(f"Moved {src_path.name} → {target_dir}")

# Moves the files on disk as soon as this cell runs — inspect folder_map before executing.
move_files_to_groups(files_info, folder_map)

defaultdict(<class 'list'>, {'Multimedia_Content': [2, 10, 13, 21, 27, 28, 36, 42], 'Data_Analysis_Scripts': [12, 5, 6, 23, 37, 39, 40, 43, 46], 'Machine_Learning_Models': [14, 30, 31, 32, 33, 34, 35], 'Textual_Document_Analysis': [3, 17, 18, 19, 24, 25, 26, 41, 44, 45], 'Unreadable_or_Unknown_Files': [0, 1, 4, 8, 11, 15, 20, 22, 29, 38]})
[{'filename': '.RData', 'filepath': 'C:\\Users\\gabel\\Documents\\.RData', 'filetype': '', 'mime_type': '', 'preview': '[unknown file]', 'summary': 'The file content is unknown, and therefore no specific details can be provided. Further information is needed to summarize the content accurately.', 'suggested_title': 'unknown_file_content', 'suggested_tags': ['unknown', 'file', 'content']}, {'filename': '.Rhistory', 'filepath': 'C:\\Users\\gabel\\Documents\\.Rhistory', 'filetype': '', 'mime_type': '', 'preview': '[unknown file]', 'summary': 'The file content is not available for review or summarization. Please provide a valid file or content for analys