In [None]:
import json, mimetypes
from pathlib import Path
import fitz               # PyMuPDF
from itertools import chain
from collections import defaultdict
import pandas as pd
import docx

# Root directory that gets scanned; point this at the folder you want to organize.
DOCUMENTS_DIR = Path(r"C:\Users\gabel\OneDrive - Virginia Tech")

# Lower-case file suffixes (leading dot included) that the scan keeps.
EXPECTED_EXTENSIONS = {
    ".csv",
    ".docx",
    ".ipynb",
    ".md",
    ".pdf",
    ".py",
    ".r",
    ".rmd",
    ".txt",
    ".xls",
    ".xlsx",
}


def is_hidden(path: Path) -> bool:
    """Tell whether at least one component of *path* begins with a dot.

    Every entry of ``path.parts`` is inspected, so both directory names and
    the final file name count.
    """
    for component in path.parts:
        if component.startswith("."):
            return True
    return False

def list_files(root: Path):
    """Return a list of the files under `root` that are worth previewing.

    The tree is walked recursively with ``root.rglob("*")``. A path is kept
    when its lower-cased suffix is in ``EXPECTED_EXTENSIONS``, no component
    *below* `root` starts with a dot, and it is a regular file.

    The hidden check is applied to the path relative to `root`; checking the
    absolute path would discard every file whenever `root` itself (or one of
    its parents) has a dot-prefixed name.
    """
    matches = []
    for candidate in root.rglob("*"):
        # Cheap string checks first; is_file() touches the filesystem.
        if candidate.suffix.lower() not in EXPECTED_EXTENSIONS:
            continue
        if is_hidden(candidate.relative_to(root)):
            continue
        if candidate.is_file():
            matches.append(candidate)
    return matches

def getting_info_excel_csv_files(full_path: Path, max_chars: int = 2000) -> str:
    """Return a textual preview/placeholder for Excel and CSV files.

    The preview has three parts: a ``Columns:`` line, a ``Rows:`` line with the
    index labels, and the first five rows rendered by ``DataFrame.to_string``.

    Args:
        full_path: Location of the ``.xlsx``, ``.xls`` or ``.csv`` file.
        max_chars: Maximum length of the returned preview.

    Returns:
        The preview text, ``"[Unknown file type]"`` for any other suffix, or
        ``"[Error reading file: ...]"`` when loading or formatting fails.
    """
    # Listing every index label of a long table would use up the whole
    # character budget before any data row is shown, so only the first few
    # labels are spelled out and the total row count is appended instead.
    max_row_labels = 20
    try:
        suffix = full_path.suffix.lower()
        if suffix in {".xlsx", ".xls"}:
            df = pd.read_excel(full_path)
        elif suffix == ".csv":
            df = pd.read_csv(full_path)
        else:
            return "[Unknown file type]"

        # Labels are not guaranteed to be strings (e.g. numeric spreadsheet
        # headers), so convert each one before joining.
        col_text = ", ".join(map(str, df.columns.tolist()))
        row_labels = df.index.tolist()
        row_text = ", ".join(map(str, row_labels[:max_row_labels]))
        if len(row_labels) > max_row_labels:
            row_text += f", ... ({len(row_labels)} rows total)"

        head_text = df.head().to_string(index=False)
        preview = f"Columns: {col_text}\nRows: {row_text}\n{head_text}"
        return preview[:max_chars]

    except Exception as exc:
        return f"[Error reading file: {exc}]"
    
def get_info_from_docx(full_path: Path) -> str:
    """Build a short text preview of a Word document.

    Paragraphs whose text is empty or whitespace-only are dropped; the rest
    are joined with newlines and cut to 2000 characters. Any failure while
    opening or reading the document yields an ``[Error reading file: ...]``
    placeholder instead of raising.
    """
    try:
        kept = []
        for para in docx.Document(full_path).paragraphs:
            if para.text.strip():
                kept.append(para.text)
        text = "\n".join(kept)
    except Exception as exc:
        return f"[Error reading file: {exc}]"
    return text[:2000]  # cap the preview at 2000 characters
 

def get_file_preview(path: Path, full_path, max_chars: int = 2000) -> str:
    """Return a textual preview/placeholder for arbitrary file types.

    Args:
        path: Kept for backward compatibility with existing callers. It is no
            longer used for reading, because callers may pass a path relative
            to the scanned root, which would be resolved against the current
            working directory instead.
        full_path: Location of the file; every read goes through this path.
        max_chars: Maximum length of the returned preview.

    Returns:
        Up to `max_chars` characters of preview text. For types that cannot be
        previewed, a ``"[<mime> file]"`` / ``"[unknown file]"`` placeholder.
        If anything raises, ``"[Error reading file: ...]"``.
    """
    # Suffixes read as plain text even when mimetypes has no text/* entry for
    # them (e.g. .r and .rmd).
    plain_text_suffixes = {".txt", ".py", ".md", ".rmd", ".r"}
    try:
        full_path = Path(full_path)
        suffix = full_path.suffix.lower()

        if suffix == ".ipynb":
            with open(full_path, "r", encoding="utf-8") as f:
                nb = json.load(f)
            # Only markdown and code cells are kept; each is labelled by kind.
            cells = [
                ("[Markdown]" if c["cell_type"] == "markdown" else "[Code]") +
                "\n" + "".join(c["source"]).strip()
                for c in nb.get("cells", [])
                if c["cell_type"] in {"markdown", "code"}
            ]
            return "\n\n".join(cells)[:max_chars]

        # The helpers apply their own cap; slicing again makes max_chars
        # effective for these types as well.
        if suffix in {".xlsx", ".xls", ".csv"}:
            return getting_info_excel_csv_files(full_path)[:max_chars]
        if suffix == ".docx":
            return get_info_from_docx(full_path)[:max_chars]

        mime, _ = mimetypes.guess_type(str(full_path))
        if suffix in plain_text_suffixes or (mime and mime.startswith("text")):
            # Read only as many characters as the preview needs.
            with open(full_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read(max(max_chars, 0))

        if suffix == ".pdf" or mime == "application/pdf":
            with fitz.open(full_path) as doc:
                # Only the first page is used for the preview.
                return doc[0].get_text()[:max_chars]

        return f"[{mime or 'unknown'} file]"

    except Exception as exc:
        return f"[Error reading file: {exc}]"

# Scan DOCUMENTS_DIR and work with paths relative to it.
files = list_files(DOCUMENTS_DIR)
rel_paths = [file.relative_to(DOCUMENTS_DIR) for file in files]

# Partition by depth: more than one path part means the file lives inside a
# sub-folder, exactly one part means it sits directly in DOCUMENTS_DIR.
# (Testing the part count directly avoids a list-membership scan per file.)
sub_paths = [p for p in rel_paths if len(p.parts) > 1]
rel_paths = [p for p in rel_paths if len(p.parts) == 1]

# Files inside sub-folders: grouped by top-level folder, keyed by the path
# below that folder.
file_data_subdirectories = defaultdict(dict)

for rel_path in sub_paths:                     # rel_path is like  FolderA\proj\code.py
    full_path = DOCUMENTS_DIR / rel_path
    top_folder = rel_path.parts[0]
    inner_path = Path(*rel_path.parts[1:])

    preview = get_file_preview(full_path, full_path)

    file_data_subdirectories[top_folder][str(inner_path)] = {
        "filename" : full_path.name,
        "filepath" : str(full_path),
        "filetype" : full_path.suffix.lstrip(".").lower(),
        "mime_type": mimetypes.guess_type(str(full_path))[0] or "",
        "preview"  : preview,
        "summary"  : preview[:100],
        "subdirectory": 1
    }

# Files that sit directly in DOCUMENTS_DIR, keyed by file name.
file_data_no_subdirectories = {}

for file in rel_paths:
    full_path = DOCUMENTS_DIR / file

    # Pass the absolute path for both arguments: a relative path would be
    # resolved against the current working directory when the file is read.
    preview = get_file_preview(full_path, full_path)
    file_data_no_subdirectories[str(file)] = {
        "filename": file.name,
        "filepath": str(file),          # stays relative to DOCUMENTS_DIR for root-level files
        "filetype": file.suffix.lstrip(".").lower(),
        "mime_type": mimetypes.guess_type(str(file))[0] or "",
        "preview": preview,            # full preview (you can truncate later)
        "summary": preview[:100],       # quick stub summary for now
        "subdirectory": 0
    }

combined = {
    "no_subdirectories": file_data_no_subdirectories,
    "subdirectories"   : file_data_subdirectories
}

for value in combined.values():
    print(value)

# Persist the records so the later steps can reload them.
with open(DOCUMENTS_DIR / "full_data_file_info.json", "w", encoding="utf-8") as f:
    json.dump(combined, f, indent=2)


  df = pd.read_csv(full_path)


{'abn_behaviour_disprortionality.csv': {'filename': 'abn_behaviour_disprortionality.csv', 'filepath': 'abn_behaviour_disprortionality.csv', 'filetype': 'csv', 'mime_type': 'application/vnd.ms-excel', 'preview': 'Columns: Drug, ROR & (95% CI), IC (95% CI)\nRows: 0, 1, 2, 3, 4, 5, 6\n        Drug        ROR & (95% CI)           IC (95% CI)\n Montelukast 17.81 (15.76 - 20.13)    1.94 (1.87 - 2.02)\n  Cetirizine    0.07 (0.04 - 0.12) -3.09 (-3.87 - -2.30)\nFexofenadine    0.14 (0.11 - 0.19) -2.53 (-2.91 - -2.15)\n  Loratadine    0.15 (0.13 - 0.19) -2.32 (-2.61 - -2.03)\n Zafirlukast     0.00 (0.00 - nan)     -inf (-inf - nan)', 'summary': 'Columns: Drug, ROR & (95% CI), IC (95% CI)\nRows: 0, 1, 2, 3, 4, 5, 6\n        Drug        ROR & (95% ', 'subdirectory': 0}, 'aggression_disprortionality.csv': {'filename': 'aggression_disprortionality.csv', 'filepath': 'aggression_disprortionality.csv', 'filetype': 'csv', 'mime_type': 'application/vnd.ms-excel', 'preview': 'Columns: Drug, ROR & (95% CI)

In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# The OpenAI API key is read from a .env file.
# Point dotenv_path at your own .env file before running.
dotenv_path = r"C:\Users\gabel\Documents\File Organizer AI\.env"
if os.path.isfile(dotenv_path):
    load_dotenv(dotenv_path)                 # explicit path, no directory search
else:
    raise FileNotFoundError(f"Missing .env at {dotenv_path}")

OPENAI_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_KEY is None or OPENAI_KEY == "":
    raise ValueError("Set OPENAI_API_KEY in your .env file")

client = OpenAI(api_key=OPENAI_KEY)

def gpt_summarize_and_tag(text: str) -> dict:
    """
    Ask the chat model (gpt-4o-mini) for JSON with keys 'summary',
    'suggested_title' and 'suggested_tags', and return it as a Python dict.

    If the reply is empty, is not valid JSON, or is JSON but not an object,
    a fallback dict is returned: the first 200 characters of `text` as
    'summary', an empty 'suggested_title' and an empty 'suggested_tags' list.
    """
    system = (
        "You are an assistant that summarizes files and suggests filenames & tags. "
        "Use the preview and summary text to create a concise summary, if this is not available, use the title to infer a summary. "
        "Always reply **only** with valid JSON having keys "
        "'summary' (2-3 sentences), "
        "'suggested_title' (concise filename, no extension, underscores OK), "
        "'suggested_tags' (list of 3-5 short category strings)."
    )
    user = f"Here is the file content or preview:\n{text}"
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.3,
        messages=[{"role": "system", "content": system},
                  {"role": "user", "content": user}],
    )

    fallback = {
        "summary": text[:200],  # fallback to first 200 chars
        "suggested_title": "",
        "suggested_tags": [],
    }

    # The message content can be None; treat that like an empty reply.
    raw = (resp.choices[0].message.content or "").strip()

    # Replies are sometimes wrapped in a Markdown fence (```json ... ```);
    # peel it off so the JSON inside can still be parsed.
    if raw.startswith("```"):
        raw = raw.strip("`").strip()
        if raw[:4].lower() == "json":
            raw = raw[4:]

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return fallback

    # Callers merge the result with dict.update(), so anything other than a
    # JSON object is unusable.
    if not isinstance(parsed, dict):
        return fallback
    return parsed
    
#Recurse through an arbitrary nested dict
def enrich_leaves_with_gpt(node, *, preview_key="preview"):
    """
    Walk a (possibly nested) dictionary and add GPT metadata to file records.

    A dict that contains `preview_key` is treated as a file record: unless it
    already has all of 'summary', 'suggested_title' and 'suggested_tags', the
    preview text is sent to gpt_summarize_and_tag() and the result is merged
    into the record. A dict without `preview_key` is descended into, value by
    value. Anything that is not a dict is ignored.

    The structure is modified in place; nothing is returned.
    """
    if not isinstance(node, dict):
        return

    if preview_key not in node:
        # Container level: keep walking down.
        for child in node.values():
            enrich_leaves_with_gpt(child, preview_key=preview_key)
        return

    # File record: skip the API call when the metadata is already complete.
    required = ("summary", "suggested_title", "suggested_tags")
    if all(key in node for key in required):
        return
    node.update(gpt_summarize_and_tag(node[preview_key]))


# Read back the JSON file that holds the preview records
combined = json.loads(
    (DOCUMENTS_DIR / "full_data_file_info.json").read_text(encoding="utf-8")
)

# Add summary / title / tags to every file record (modifies `combined` in place)
enrich_leaves_with_gpt(combined)

# Save the enriched records under a new file name
with open(DOCUMENTS_DIR / "full_data_with_ai.json", "w", encoding="utf-8") as f:
    json.dump(combined, f, indent=2)
print("Added GPT metadata to all leaf records")


✔︎ Added GPT metadata to all leaf records


In [None]:
import itertools

# 1) Reload the GPT-enriched nested dictionary from disk
nested = json.loads(
    (DOCUMENTS_DIR / "full_data_with_ai.json").read_text(encoding="utf-8")
)

# Flatten to a list of file records to be more easily accessible
def leaf_records(node):
    """Generate every dict that carries a 'filename' key (a file record).

    Dicts without that key are searched through their values, in order;
    non-dict values are ignored. A record is yielded as-is and not searched
    any further.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if not isinstance(current, dict):
            continue
        if "filename" in current:
            yield current
            continue
        # Push children reversed so they are popped in their original order.
        pending.extend(reversed(list(current.values())))

files_info = [*leaf_records(nested)]   # flat list of file records, handy for indexing and queries


def suggest_file_groups(file_records):
    """Ask GPT for folder names and a file-to-folder assignment.

    Each record is listed on its own numbered line (starting at 1) with its
    filename, summary and tags. The model's reply is returned as stripped
    text; it is not parsed here.
    """
    numbered_lines = []
    for number, record in enumerate(file_records, start=1):
        head = f"{number}. {record['filename']} — {record['summary']} "
        tags = ", ".join(record.get("suggested_tags", []))
        numbered_lines.append(f"{head}[Tags: {tags}]")
    file_descriptions = "\n".join(numbered_lines)

    instructions = [
        "Below are files with short summaries and tags.\n",
        "Suggest 3-5 folder names that logically group them. ",
        "Then map each file number to a folder. ",
        "Respond ONLY in this format:\n\n",
        "Groups:\n",
        "- <FolderName1>: [file#, file#, ...]\n",
        "- <FolderName2>: [...]\n\n",
    ]
    prompt = "".join(instructions) + f"{file_descriptions}"

    resp = client.chat.completions.create(
        model="gpt-4o-mini",           # or gpt-4o
        temperature=0.3,
        messages=[{"role": "user", "content": prompt}]
    )
    reply = resp.choices[0].message.content
    return reply.strip()

# Ask GPT for folder groupings of all file records and show the raw reply
suggested_folders = suggest_file_groups(file_records=files_info)
print(suggested_folders)

Groups:
- Drug Safety Analysis: [1, 2, 3, 4, 5, 7, 9, 15, 17, 18, 19, 20, 27, 28, 29]
- Research and Evaluation Documents: [6, 8, 12, 13, 21, 22, 24, 25, 26]
- Data Analysis and Code: [10, 11, 30]
- Mental Health Resources: [14] 
- Error and Dependency Issues: [16, 23]


In [22]:
import re
from collections import defaultdict
import shutil

def parse_group_response(response_text):
    """Parse the model's ``- <Folder>: [n, n, ...]`` lines into a folder map.

    Args:
        response_text: Raw reply text, one group per line.

    Returns:
        A ``defaultdict(list)`` mapping each folder name to zero-based indices
        (the 1-based file numbers from the reply, minus one).

    Lines that do not look like a group are ignored. Within a group, tokens
    that are not integers are skipped, and so are numbers below 1: a "0"
    would otherwise become index -1 and silently select the last record.
    A number that was already assigned to an earlier folder is skipped too,
    because one file can only be moved once.
    """
    line_pattern = re.compile(r"-\s*(.+?)\s*:\s*\[(.*?)\]")
    folder_map = defaultdict(list)
    assigned = set()

    for line in response_text.splitlines():
        match = line_pattern.match(line.strip())
        if not match:
            continue
        folder, raw_numbers = match.groups()
        for token in raw_numbers.split(","):
            try:
                number = int(token.strip())
            except ValueError:
                continue
            index = number - 1
            if index < 0 or index in assigned:
                continue
            assigned.add(index)
            folder_map[folder].append(index)
    return folder_map

folder_map = parse_group_response(response_text=suggested_folders)
print(folder_map)

def _unique_destination(target_dir, filename):
    """Return a path in `target_dir` for `filename` that does not exist yet."""
    candidate = target_dir / filename
    stem, suffix = candidate.stem, candidate.suffix
    counter = 1
    while candidate.exists():
        candidate = target_dir / f"{stem} ({counter}){suffix}"
        counter += 1
    return candidate


def move_files_to_folders(folder_map, files_info, root=None):
    """Move each listed file into the folder it was assigned to.

    Args:
        folder_map: Mapping of folder name -> list of indices into `files_info`.
        files_info: File records with 'filename', 'filepath' and
            'subdirectory' keys. Records with ``subdirectory == 0`` are looked
            up by file name directly under the root; all others are located
            through their 'filepath'.
        root: Directory in which the folders are created. Defaults to
            ``DOCUMENTS_DIR``.

    Problem cases are reported with ``print`` and skipped, so one bad entry
    does not abort the remaining moves:
      * a folder name that is empty after path-illegal characters are replaced,
      * an index outside `files_info`,
      * a source file that no longer exists,
      * a file that is already inside the target folder.
    When the target folder already holds a file with the same name, the moved
    file gets a " (n)" suffix so the existing file is not replaced.
    """
    base_dir = DOCUMENTS_DIR if root is None else Path(root)

    for folder, indices in folder_map.items():
        # Folder names come from model output; replace characters that cannot
        # appear in a single directory name and trim trailing dots/spaces.
        safe_folder = re.sub(r'[<>:"/\\|?*]', "_", folder).strip(" .")
        if not safe_folder:
            print(f"Skipping group with unusable folder name: {folder!r}")
            continue

        target_dir = base_dir / safe_folder
        target_dir.mkdir(exist_ok=True)

        for index in indices:
            if not 0 <= index < len(files_info):
                print(f"Skipping index {index}: no such file record")
                continue
            file_info = files_info[index]

            if file_info["subdirectory"] == 0:
                old_path = base_dir / file_info["filename"]
            else:
                old_path = Path(file_info["filepath"])

            if not old_path.is_file():
                print(f"Skipping {file_info['filename']}: {old_path} not found")
                continue
            if old_path.resolve().parent == target_dir.resolve():
                print(f"Skipping {file_info['filename']}: already in {target_dir}")
                continue

            new_path = _unique_destination(target_dir, file_info["filename"])
            shutil.move(str(old_path), str(new_path))
            print(f"Moving {file_info['filename']} to {new_path}")

# Carry out the moves described by the parsed folder map
move_files_to_folders(folder_map=folder_map, files_info=files_info)

defaultdict(<class 'list'>, {'Drug Safety Analysis': [0, 1, 2, 3, 4, 6, 8, 14, 16, 17, 18, 19, 26, 27, 28], 'Research and Evaluation Documents': [5, 7, 11, 12, 20, 21, 23, 24, 25], 'Data Analysis and Code': [9, 10, 29], 'Mental Health Resources': [13], 'Error and Dependency Issues': [15, 22]})
Moving abn_behaviour_disprortionality.csv to C:\Users\gabel\OneDrive - Virginia Tech\Drug Safety Analysis\abn_behaviour_disprortionality.csv
Moving aggression_disprortionality.csv to C:\Users\gabel\OneDrive - Virginia Tech\Drug Safety Analysis\aggression_disprortionality.csv
Moving agitation_disprortionality.csv to C:\Users\gabel\OneDrive - Virginia Tech\Drug Safety Analysis\agitation_disprortionality.csv
Moving anger_disprortionality.csv to C:\Users\gabel\OneDrive - Virginia Tech\Drug Safety Analysis\anger_disprortionality.csv
Moving anxiety_disprortionality.csv to C:\Users\gabel\OneDrive - Virginia Tech\Drug Safety Analysis\anxiety_disprortionality.csv
Moving compl_suicide_disprortionality.csv 