epstein-docs
diff --git a/‎.eleventy.js‎
Lines changed: 78 additions & 9 deletions b/‎.eleventy.js‎
Lines changed: 78 additions & 9 deletions
diff --git a/‎README.md‎
Lines changed: 49 additions & 16 deletions b/‎README.md‎
Lines changed: 49 additions & 16 deletions
@@ -19,6 +19,21 @@ module.exports = function(eleventyConfig) {
     console.log('ℹ️  No dedupe.json found - entities will not be deduplicated');
   }
 
+  // Load the optional raw-type -> canonical-type map from dedupe_types.json; it stays {} when the file is missing or cannot be read/parsed
+  let typeDedupeMap = {};
+  const typeDedupeFile = path.join(__dirname, 'dedupe_types.json');
+  if (fs.existsSync(typeDedupeFile)) {
+    try {
+      const data = JSON.parse(fs.readFileSync(typeDedupeFile, 'utf8'));
+      typeDedupeMap = data.mappings || {}; // only the "mappings" key is used; a file without it yields an empty map
+      console.log('✅ Loaded document type mappings from dedupe_types.json');
+    } catch (e) { // best-effort: warn and continue with the empty map instead of failing the build
+      console.warn('⚠️  Could not load dedupe_types.json:', e.message);
+    }
+  } else {
+    console.log('ℹ️  No dedupe_types.json found - document types will not be deduplicated');
+  }
+
   // Helper function to apply deduplication mapping
   function applyDedupe(entityType, entityName) {
     if (!entityName) return entityName;
@@ -28,18 +43,24 @@ module.exports = function(eleventyConfig) {
   // Helper function to normalize document types (for grouping)
   function normalizeDocType(docType) {
     if (!docType) return null;
-    return String(docType).toLowerCase().trim();
+    const trimmed = String(docType).trim();
+
+    // Map the trimmed raw type to its canonical name; the key is matched exactly (case-sensitive) and unmapped types pass through
+    const canonical = typeDedupeMap[trimmed] || trimmed;
+
+    return canonical.toLowerCase().trim(); // lowercased so the grouping key does not depend on the canonical name's casing
   }
 
   // Helper function to format document types for display (title case)
   function formatDocType(docType) {
     if (!docType) return 'Unknown';
-    return String(docType)
-      .toLowerCase()
-      .trim()
-      .split(' ')
-      .map(word => word.charAt(0).toUpperCase() + word.slice(1))
-      .join(' ');
+    const trimmed = String(docType).trim();
+
+    // Look up the canonical display name; the key is matched exactly, and a type without a mapping keeps its trimmed original text
+    const canonical = typeDedupeMap[trimmed] || trimmed;
+
+    // Return the name unchanged. NOTE(review): no title-casing happens here any more, so an unmapped type is displayed exactly as extracted
+    return canonical;
   }
 
   // Helper function to normalize dates to consistent format
@@ -301,8 +322,21 @@ module.exports = function(eleventyConfig) {
     if (fs.existsSync(analysesFile)) {
       try {
         const data = JSON.parse(fs.readFileSync(analysesFile, 'utf8'));
-        console.log(`✅ Loaded ${data.analyses?.length || 0} document analyses`);
-        return data.analyses || [];
+        const analyses = data.analyses || [];
+
+        // Apply document type deduplication to analyses
+        if (Object.keys(typeDedupeMap).length > 0) {
+          analyses.forEach(analysis => {
+            if (analysis.analysis?.document_type) {
+              const original = analysis.analysis.document_type;
+              const canonical = typeDedupeMap[original] || original;
+              analysis.analysis.document_type = canonical;
+            }
+          });
+        }
+
+        console.log(`✅ Loaded ${analyses.length} document analyses`);
+        return analyses;
       } catch (e) {
         console.warn('⚠️  Could not load analyses.json:', e.message);
         return [];
@@ -312,6 +346,41 @@ module.exports = function(eleventyConfig) {
     return [];
   });
 
+  // Global data "analysisDocumentTypes": sorted, de-duplicated canonical document types from analyses.json ([] when the file is missing or loading it throws)
+  eleventyConfig.addGlobalData("analysisDocumentTypes", () => {
+    const analysesFile = path.join(__dirname, 'analyses.json');
+    if (!fs.existsSync(analysesFile)) {
+      return [];
+    }
+
+    try {
+      const data = JSON.parse(fs.readFileSync(analysesFile, 'utf8'));
+      const analyses = data.analyses || [];
+
+      // A Set keeps each canonical type once, however many analyses share it
+      const typesSet = new Set();
+      analyses.forEach(analysis => {
+        if (analysis.analysis?.document_type) { // skip entries with a missing or empty document_type
+          let docType = analysis.analysis.document_type;
+
+          // Swap in the canonical name when one is mapped; the lookup key is the stored value exactly as-is (not trimmed)
+          if (Object.keys(typeDedupeMap).length > 0) {
+            docType = typeDedupeMap[docType] || docType;
+          }
+
+          typesSet.add(docType);
+        }
+      });
+
+      const uniqueTypes = Array.from(typesSet).sort(); // default sort(): UTF-16 code unit order, so uppercase sorts before lowercase
+      console.log(`✅ Found ${uniqueTypes.length} unique canonical document types for filters`);
+      return uniqueTypes;
+    } catch (e) {
+      console.warn('⚠️  Could not load document types:', e.message); // best-effort: warn and fall back to an empty list
+      return [];
+    }
+  });
+
   // Add global data - load all pages and group into documents
   eleventyConfig.addGlobalData("documents", getDocuments);
 
 
@@ -34,13 +34,15 @@ This project automatically processes thousands of scanned document pages using A
 ├── process_images.py       # Python script to OCR images using AI
 ├── cleanup_failed.py       # Python script to clean up failed processing
 ├── deduplicate.py          # Python script to deduplicate entities
+├── deduplicate_types.py    # Python script to deduplicate document types
 ├── analyze_documents.py    # Python script to generate AI summaries
 ├── requirements.txt         # Python dependencies
 ├── .env.example            # Example environment configuration
 ├── downloads/              # Place document images here
 ├── results/                # Extracted JSON data per document
 ├── processing_index.json   # Processing progress tracking (generated)
 ├── dedupe.json             # Entity deduplication mappings (generated)
+├── dedupe_types.json       # Document type deduplication mappings (generated)
 ├── analyses.json           # AI document analyses (generated)
 ├── src/                    # 11ty source files for website
 ├── .eleventy.js            # Static site generator configuration
@@ -133,6 +135,37 @@ This will:
 }
 ```
 
+**Deduplicate Document Types:**
+
+The LLM may also extract document types with inconsistent casing or wording (e.g., "deposition", "Deposition", "DEPOSITION TRANSCRIPT"). To merge these variants, run the type deduplication script:
+
+```bash
+python deduplicate_types.py
+```
+
+This will:
+- Collect all document types from `./results/`
+- Use AI to merge similar types into canonical forms
+- Create a `dedupe_types.json` mapping file
+- The website build will automatically use this mapping
+
+**Example dedupe_types.json:**
+```json
+{
+  "stats": {
+    "original_types": 45,
+    "canonical_types": 12,
+    "reduction_percentage": 73.3
+  },
+  "mappings": {
+    "deposition": "Deposition",
+    "DEPOSITION": "Deposition",
+    "deposition transcript": "Deposition",
+    "court filing": "Court Filing"
+  }
+}
+```
+
 ### 5. Analyze Documents (Optional but Recommended)
 
 Generate AI summaries and insights for each document:
@@ -209,32 +242,18 @@ This is an open archive project. Contributions welcome:
 - Add additional document sources
 - Improve entity extraction
 
-## Support This Project
-
-If you find this archive useful, consider supporting its maintenance and hosting:
-
-**Bitcoin**: `bc1qmahlh5eql05w30cgf5taj3n23twmp0f5xcvnnz`
-
 ## Deployment
 
 The site is automatically deployed to GitHub Pages on every push to the main branch.
 
 ### GitHub Pages Setup
 
-1. Push this repository to GitHub: `https://github.com/epstein-docs/epstein-docs`
+1. Push this repository to GitHub: `https://github.com/epstein-docs/epstein-docs.github.io`
 2. Go to Settings → Pages
 3. Source: GitHub Actions
 4. The workflow will automatically build and deploy the site
 
-The site will be available at: `https://epstein-docs.github.io/epstein-docs/`
-
-## License
-
-This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
-
-The code in this repository is open source and free to use. The documents themselves are public records.
-
-**Repository**: https://github.com/epstein-docs/epstein-docs
+The site will be available at: `https://epstein-docs.github.io/`
 
 ## Future: Relationship Graphs
 
@@ -278,3 +297,17 @@ The deduplication step is essential for accurate relationship mapping - without
 ## Disclaimer
 
 This is an independent archival project. Documents are sourced from public releases. The maintainers make no representations about completeness or accuracy of the archive.
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+The code in this repository is open source and free to use. The documents themselves are public records.
+
+**Repository**: https://github.com/epstein-docs/epstein-docs.github.io
+
+## Support This Project
+
+If you find this archive useful, consider supporting its maintenance and hosting:
+
+**Bitcoin**: `bc1qmahlh5eql05w30cgf5taj3n23twmp0f5xcvnnz`