In [None]:
import os
from pathlib import Path
import json
import mimetypes
import fitz # PyMuPDF
import pandas as pd
import docx
from collections import defaultdict

DOCUMENTS_DIR = Path(r"C:\Users\gabel\OneDrive - Virginia Tech") #Set path to the directory you want to search
EXPECTED_EXTENSIONS = {
    ".pdf", ".txt", ".py", ".md", ".ipynb", ".rmd", ".r", ".csv",
    ".xlsx", ".xls", ".docx"
}


def is_hidden(path: Path) -> bool:
    """Return True if *any* part of the path (file or folder) starts with a dot."""
    return any(part.startswith(".") for part in path.parts)

def list_files_and_dirs(root: Path):
    """
    Recursively yield files and directories under `root`,
    skipping hidden paths and unwanted extensions for files.
    Returns lists of (full_path, is_dir).
    """
    all_items = []
    for p in root.rglob("*"):
        if is_hidden(p):
            continue

        if p.is_file():
            if p.suffix.lower() in EXPECTED_EXTENSIONS:
                all_items.append((p, False)) # (full_path, is_file)
        elif p.is_dir():
            all_items.append((p, True)) # (full_path, is_dir)
    return all_items

def getting_info_excel_csv_files(full_path: Path) -> str:
    """Return a textual preview/placeholder for Excel and CSV files."""
    try:
        if full_path.suffix.lower() in {".xlsx", ".xls"}:
            df = pd.read_excel(full_path)
        elif full_path.suffix.lower() == ".csv":
            df = pd.read_csv(full_path)
        else:
            return "[Unknown file type]"

        preview = df.head().to_string(index=False)
        col_names = df.columns.tolist()
        row_names = df.index.tolist()
        preview = f"Columns: {', '.join(col_names)}\nRows: {', '.join(map(str, row_names))}\n{preview}"
        return preview[:2000]

    except Exception as exc:
        return f"[Error reading file: {exc}]"

def get_info_from_docx(full_path: Path) -> str:
    """Return a textual preview/placeholder for Word documents."""
    try:
        doc = docx.Document(full_path)
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        return "\n".join(paragraphs)[:2000]
    except Exception as exc:
        return f"[Error reading file: {exc}]"

def get_file_preview(path: Path, max_chars: int = 2000) -> str:
    """Return a textual preview/placeholder for arbitrary file types."""
    try:
        if path.suffix.lower() == ".ipynb":
            with open(path, "r", encoding="utf-8") as f:
                nb = json.load(f)
            cells = [
                ("[Markdown]" if c["cell_type"] == "markdown" else "[Code]") +
                "\n" + "".join(c["source"]).strip()
                for c in nb.get("cells", [])
                if c["cell_type"] in {"markdown", "code"}
            ]
            return "\n\n".join(cells)[:max_chars]

        elif path.suffix.lower() in {".xlsx", ".xls", ".csv"}:
            return getting_info_excel_csv_files(path)
        elif path.suffix.lower() == ".docx":
            return get_info_from_docx(path)

        mime, _ = mimetypes.guess_type(str(path))
        if mime and mime.startswith("text"):
            return path.read_text(encoding="utf-8", errors="ignore")[:max_chars]

        if mime == "application/pdf":
            with fitz.open(path) as doc:
                return doc[0].get_text()[:max_chars]

        return f"[{mime or 'unknown'} file]"

    except Exception as exc:
        return f"[Error reading file: {exc}]"



def build_full_nested_directory_structure(root_dir: Path):
    """
    Builds a single nested dictionary representing the entire directory structure,
    including files and their attributes.

    Args:
        root_dir (Path): The absolute path to the root directory to scan.

    Returns:
        dict: A single nested dictionary representing the directory tree,
              with file attributes at the leaf nodes.
    """
    # Initialize the main dictionary with the root directory name as the top key
    # Handle cases where root_dir might be a drive letter
    root_name = root_dir.name if root_dir.name else str(root_dir)
    full_structure = {root_name: {}}

    # Iterate through all items (files and directories) within the root_dir
    # sorted by path length to ensure parent directories are added before children
    all_items = sorted(list_files_and_dirs(root_dir), key=lambda x: len(x[0].parts))

    for item_path, is_directory in all_items:
        try:
            # Get the path relative to the root_dir
            relative_path_str = os.path.relpath(str(item_path), str(root_dir))
            # Split the relative path into components (keys)
            # Filter out empty strings which can result from split on leading/trailing slashes
            path_components = [part for part in Path(relative_path_str).parts if part]

            if not path_components: # This can happen if item_path is the root_dir itself, which we already handled
                continue

            current_level = full_structure[root_name]

            # Traverse or create nested dictionaries for directory parts
            for i, component in enumerate(path_components):
                if i == len(path_components) - 1: # This is the last component of the path
                    if is_directory:
                        # If it's a directory, ensure it's a dictionary
                        if component not in current_level:
                            current_level[component] = {}
                        # If it exists and is a file, this would be an error, but our logic prevents it
                    else:
                        # If it's a file, assign its attributes
                        preview = get_file_preview(item_path)
                        current_level[component] = {
                            "filename": item_path.name,
                            "filepath": str(item_path),
                            "filetype": item_path.suffix.lstrip(".").lower(),
                            "mime_type": mimetypes.guess_type(str(item_path))[0] or "",
                            "preview": preview,
                            "summary": preview[:100],
                            "is_directory": False # Explicitly state it's a file
                        }
                else:
                    # If it's a directory part, ensure it's a dictionary for nesting
                    if component not in current_level:
                        current_level[component] = {}
                    current_level = current_level[component] # Move deeper

        except ValueError as e:
            # This can happen if item_path is not truly a subpath of root_dir (e.g., different drive)
            print(f"Warning: Could not process {item_path} relative to {root_dir}. Error: {e}")
        except Exception as e:
            print(f"An unexpected error occurred processing {item_path}: {e}")

    return full_structure

# --- Main Execution ---
if __name__ == "__main__":
    print(f"--- Scanning directory: {DOCUMENTS_DIR} ---")
    nested_directory_data = build_full_nested_directory_structure(DOCUMENTS_DIR)

    print("\n--- Generated Nested Directory Structure (JSON Output) ---")
    # Use json.dumps for pretty printing the dictionary
    output_json= DOCUMENTS_DIR / "full_data_file_info.json"
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(nested_directory_data, f, indent=4, ensure_ascii=False)
    print(f"Output saved to: {output_json}")
    #print(json.dumps(nested_directory_data, indent=4))

--- Scanning directory: C:\Users\gabel\OneDrive - Virginia Tech ---


  df = pd.read_csv(full_path)



--- Generated Nested Directory Structure (JSON Output) ---
Output saved to: C:\Users\gabel\OneDrive - Virginia Tech\full_data_file_info.json


In [27]:
import os
from dotenv import load_dotenv
from openai import OpenAI

#Load the OpenAI API key from the .env file
#Make sure to set the path to your .env file
dotenv_path = r"C:\Users\gabel\Documents\File Organizer AI\.env"
if not os.path.isfile(dotenv_path):
    raise FileNotFoundError(f"Missing .env at {dotenv_path}")
load_dotenv(dotenv_path)                     # <- pass explicit path
OPENAI_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_KEY:
    raise ValueError("Set OPENAI_API_KEY in your .env file")
client = OpenAI(api_key=OPENAI_KEY)

def gpt_summarize_and_tag(text: str) -> dict:
    """
    Ask GPT-4 to return JSON with keys 'summary', 'suggested_title', 'suggested_tags'.
    The model response is parsed and returned as a Python dict.
    """
    system = (
        "You are an assistant that summarizes files and suggests filenames & tags. "
        "Use the preview and summary text to create a concise summary, if this is not available, use the title to infer a summary."
        "Always reply **only** with valid JSON having keys "
        "'summary' (2-3 sentences), "
        "'suggested_title' (concise filename, no extension, underscores OK), "
        "'suggested_tags' (list of 3-5 short category strings)."
    )
    user = f"Here is the file content or preview:\n{text}"
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.3,
        messages=[{"role": "system", "content": system},
                  {"role": "user", "content": user}],
    )
    try:
        return json.loads(resp.choices[0].message.content.strip())
    except json.JSONDecodeError:
        # fallback: wrap whole text as 'summary' if parsing fails
        return {
            "summary": text[:200],  # fallback to first 200 chars
            "suggested_title": "",
            "suggested_tags": [],
        }
    
#Recurse through an arbitrary nested dict
def enrich_leaves_with_gpt(node, *, preview_key="preview"):
    """
    Recursively traverse a (possibly) nested dictionary.

    • If a sub-dict contains `preview_key`, we treat it as a file record,
      call gpt_summarize_and_tag(preview_text) and merge the result.
    • Otherwise, keep walking.

    The function mutates the original structure in-place and
    returns nothing.
    """
    if isinstance(node, dict):
        # Leaf condition: this dict already has a preview
        if preview_key in node:
            # --- call GPT only if we haven't done so yet --------------
            if not {"summary", "suggested_title", "suggested_tags"}.issubset(node):
                ai = gpt_summarize_and_tag(node[preview_key])
                node.update(ai)          # merge summary/title/tags in-place
        else:
            # Not a leaf → recurse into all values that are dicts
            for value in node.values():
                enrich_leaves_with_gpt(value, preview_key=preview_key)


#Loads Json file with all the data
with open(DOCUMENTS_DIR / "full_data_file_info.json", encoding="utf-8") as f:
    combined = json.load(f)

#Adds summary / title / tags to each leaf record
enrich_leaves_with_gpt(combined)     

#Write the updated data to a new file
with open(DOCUMENTS_DIR / "full_data_with_ai.json", "w", encoding="utf-8") as f:
    json.dump(combined, f, indent=2)
print("Added GPT metadata to all leaf records")


Added GPT metadata to all leaf records


In [28]:
import itertools

# 1) Load the nested dictionary as necessary
with open(DOCUMENTS_DIR / "full_data_with_ai.json", encoding="utf-8") as f:
    nested = json.load(f)

# Flatten to a list of file records to be more easily accessible
def leaf_records(node):
    """Yield each dict that has a 'filename' key (leaf record)."""
    if isinstance(node, dict):
        if "filename" in node:         # <-- leaf condition
            yield node
        else:
            for v in node.values():
                yield from leaf_records(v)

files_info = list(leaf_records(nested))   #Now converted to a list, good for processing and complex queries


def suggest_file_groups(file_records):
    """Ask GPT to propose folder names + assignments for all records."""
    file_descriptions = "\n".join(
        f"{i+1}. {rec['filename']} — {rec['summary']} "
        f"[Tags: {', '.join(rec.get('suggested_tags', []))}]"
        for i, rec in enumerate(file_records)
    )

    prompt = (
        "Below are files with short summaries and tags.\n"
        "Suggest 3-5 folder names that logically group them. "
        "Then map each file number to a folder. "
        "Respond ONLY in this format:\n\n"
        "Groups:\n"
        "- <FolderName1>: [file#, file#, ...]\n"
        "- <FolderName2>: [...]\n\n"
        f"{file_descriptions}"
    )

    resp = client.chat.completions.create(
        model="gpt-4o-mini",           # or gpt-4o
        temperature=0.3,
        messages=[{"role": "user", "content": prompt}]
    )
    return resp.choices[0].message.content.strip()

#Call the function to get the suggested folder groupings
suggested_folders = suggest_file_groups(files_info)
print(suggested_folders)

Groups:
- Data Analysis: [1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18]
- Error Handling: [3, 19, 20, 21, 22, 23]
- Documentation: [17, 24, 25, 26, 27, 28, 29, 30]


In [31]:
import re
from collections import defaultdict
import shutil

def parse_group_response(response_text):
    folder_map = defaultdict(list)
    lines = response_text.splitlines()
    for line in lines:
        match = re.match(r"- (.+?): \[(.+?)\]", line.strip())
        if match:
            folder, indices = match.groups()
            indices = [int(i.strip()) - 1 for i in indices.split(",")]
            folder_map[folder].extend(indices)
    return folder_map

folder_map = parse_group_response(suggested_folders)

def move_files_to_folders(folder_map, files_info):
    for folder, indices in folder_map.items():
        target_dir = DOCUMENTS_DIR / folder
        target_dir.mkdir(exist_ok=True)
        for index in indices:
            file_info = files_info[index]
            if(file_info["is_directory"] == 0):
                # Move the file to the target directory
                old_path = DOCUMENTS_DIR / file_info['filepath']
                new_path = target_dir / file_info["filepath"]
                shutil.move(str(old_path), str(new_path))
                print(f"Moving {file_info['filename']} to {new_path}")
            else:
                old_path = file_info["filepath"]
                new_path = target_dir / file_info["filename"]
                shutil.move(str(old_path), str(new_path))
                print(f"Moving {file_info['filename']} to {new_path}")

# Call the function to move files based on the folder map
move_files_to_folders(folder_map, files_info)

Moving faers_2004_2024q1.csv to C:\Users\gabel\OneDrive - Virginia Tech\FAERS Research, Analysis and Paper\Data Analysis and Code\faers_2004_2024q1.csv
Moving FAERS_paper_code.ipynb to C:\Users\gabel\OneDrive - Virginia Tech\FAERS Research, Analysis and Paper\Data Analysis and Code\FAERS_paper_code.ipynb
Moving abn_behaviour_disprortionality.csv to C:\Users\gabel\OneDrive - Virginia Tech\FAERS Research, Analysis and Paper\Drug Safety Analysis\abn_behaviour_disprortionality.csv
Moving aggression_disprortionality.csv to C:\Users\gabel\OneDrive - Virginia Tech\FAERS Research, Analysis and Paper\Drug Safety Analysis\aggression_disprortionality.csv
Moving agitation_disprortionality.csv to C:\Users\gabel\OneDrive - Virginia Tech\FAERS Research, Analysis and Paper\Drug Safety Analysis\agitation_disprortionality.csv
Moving anxiety_disprortionality.csv to C:\Users\gabel\OneDrive - Virginia Tech\FAERS Research, Analysis and Paper\Drug Safety Analysis\anxiety_disprortionality.csv
Moving compl_sui