Skip to content

Commit 4e6eb44

Browse files
nickpnickp
authored and committed
97%
1 parent 86bda40 commit 4e6eb44

File tree

1,657 files changed

+207779
-21618
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

1,657 files changed

+207779
-21618
lines changed

.eleventy.js

Lines changed: 78 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,21 @@ module.exports = function(eleventyConfig) {
1919
console.log('ℹ️ No dedupe.json found - entities will not be deduplicated');
2020
}
2121

22+
// Load document type deduplication mappings if available
23+
let typeDedupeMap = {};
24+
const typeDedupeFile = path.join(__dirname, 'dedupe_types.json');
25+
if (fs.existsSync(typeDedupeFile)) {
26+
try {
27+
const data = JSON.parse(fs.readFileSync(typeDedupeFile, 'utf8'));
28+
typeDedupeMap = data.mappings || {};
29+
console.log('✅ Loaded document type mappings from dedupe_types.json');
30+
} catch (e) {
31+
console.warn('⚠️ Could not load dedupe_types.json:', e.message);
32+
}
33+
} else {
34+
console.log('ℹ️ No dedupe_types.json found - document types will not be deduplicated');
35+
}
36+
2237
// Helper function to apply deduplication mapping
2338
function applyDedupe(entityType, entityName) {
2439
if (!entityName) return entityName;
@@ -28,18 +43,24 @@ module.exports = function(eleventyConfig) {
2843
// Helper function to normalize document types (for grouping)
2944
function normalizeDocType(docType) {
3045
if (!docType) return null;
31-
return String(docType).toLowerCase().trim();
46+
const trimmed = String(docType).trim();
47+
48+
// Apply deduplication mapping if available
49+
const canonical = typeDedupeMap[trimmed] || trimmed;
50+
51+
return canonical.toLowerCase().trim();
3252
}
3353

3454
// Helper function to format document types for display (title case)
3555
function formatDocType(docType) {
3656
if (!docType) return 'Unknown';
37-
return String(docType)
38-
.toLowerCase()
39-
.trim()
40-
.split(' ')
41-
.map(word => word.charAt(0).toUpperCase() + word.slice(1))
42-
.join(' ');
57+
const trimmed = String(docType).trim();
58+
59+
// Apply deduplication mapping if available
60+
const canonical = typeDedupeMap[trimmed] || trimmed;
61+
62+
// Return the canonical name (already in proper case from dedupe script)
63+
return canonical;
4364
}
4465

4566
// Helper function to normalize dates to consistent format
@@ -301,8 +322,21 @@ module.exports = function(eleventyConfig) {
301322
if (fs.existsSync(analysesFile)) {
302323
try {
303324
const data = JSON.parse(fs.readFileSync(analysesFile, 'utf8'));
304-
console.log(`✅ Loaded ${data.analyses?.length || 0} document analyses`);
305-
return data.analyses || [];
325+
const analyses = data.analyses || [];
326+
327+
// Apply document type deduplication to analyses
328+
if (Object.keys(typeDedupeMap).length > 0) {
329+
analyses.forEach(analysis => {
330+
if (analysis.analysis?.document_type) {
331+
const original = analysis.analysis.document_type;
332+
const canonical = typeDedupeMap[original] || original;
333+
analysis.analysis.document_type = canonical;
334+
}
335+
});
336+
}
337+
338+
console.log(`✅ Loaded ${analyses.length} document analyses`);
339+
return analyses;
306340
} catch (e) {
307341
console.warn('⚠️ Could not load analyses.json:', e.message);
308342
return [];
@@ -312,6 +346,41 @@ module.exports = function(eleventyConfig) {
312346
return [];
313347
});
314348

349+
// Get unique canonical document types from analyses
350+
eleventyConfig.addGlobalData("analysisDocumentTypes", () => {
351+
const analysesFile = path.join(__dirname, 'analyses.json');
352+
if (!fs.existsSync(analysesFile)) {
353+
return [];
354+
}
355+
356+
try {
357+
const data = JSON.parse(fs.readFileSync(analysesFile, 'utf8'));
358+
const analyses = data.analyses || [];
359+
360+
// Collect unique canonical types
361+
const typesSet = new Set();
362+
analyses.forEach(analysis => {
363+
if (analysis.analysis?.document_type) {
364+
let docType = analysis.analysis.document_type;
365+
366+
// Apply deduplication if available
367+
if (Object.keys(typeDedupeMap).length > 0) {
368+
docType = typeDedupeMap[docType] || docType;
369+
}
370+
371+
typesSet.add(docType);
372+
}
373+
});
374+
375+
const uniqueTypes = Array.from(typesSet).sort();
376+
console.log(`✅ Found ${uniqueTypes.length} unique canonical document types for filters`);
377+
return uniqueTypes;
378+
} catch (e) {
379+
console.warn('⚠️ Could not load document types:', e.message);
380+
return [];
381+
}
382+
});
383+
315384
// Add global data - load all pages and group into documents
316385
eleventyConfig.addGlobalData("documents", getDocuments);
317386

README.md

Lines changed: 49 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,15 @@ This project automatically processes thousands of scanned document pages using A
3434
├── process_images.py # Python script to OCR images using AI
3535
├── cleanup_failed.py # Python script to clean up failed processing
3636
├── deduplicate.py # Python script to deduplicate entities
37+
├── deduplicate_types.py # Python script to deduplicate document types
3738
├── analyze_documents.py # Python script to generate AI summaries
3839
├── requirements.txt # Python dependencies
3940
├── .env.example # Example environment configuration
4041
├── downloads/ # Place document images here
4142
├── results/ # Extracted JSON data per document
4243
├── processing_index.json # Processing progress tracking (generated)
4344
├── dedupe.json # Entity deduplication mappings (generated)
45+
├── dedupe_types.json # Document type deduplication mappings (generated)
4446
├── analyses.json # AI document analyses (generated)
4547
├── src/ # 11ty source files for website
4648
├── .eleventy.js # Static site generator configuration
@@ -133,6 +135,37 @@ This will:
133135
}
134136
```
135137

138+
**Deduplicate Document Types:**
139+
140+
The LLM may also extract document types with inconsistent formatting (e.g., "deposition", "Deposition", "DEPOSITION TRANSCRIPT"). Run the type deduplication script:
141+
142+
```bash
143+
python deduplicate_types.py
144+
```
145+
146+
This will:
147+
- Collect all document types from `./results/`
148+
- Use AI to merge similar types into canonical forms
149+
- Create a `dedupe_types.json` mapping file
150+
- Let the website build pick up this mapping automatically
151+
152+
**Example dedupe_types.json:**
153+
```json
154+
{
155+
"stats": {
156+
"original_types": 45,
157+
"canonical_types": 12,
158+
"reduction_percentage": 73.3
159+
},
160+
"mappings": {
161+
"deposition": "Deposition",
162+
"DEPOSITION": "Deposition",
163+
"deposition transcript": "Deposition",
164+
"court filing": "Court Filing"
165+
}
166+
}
167+
```
168+
136169
### 5. Analyze Documents (Optional but Recommended)
137170

138171
Generate AI summaries and insights for each document:
@@ -209,32 +242,18 @@ This is an open archive project. Contributions welcome:
209242
- Add additional document sources
210243
- Improve entity extraction
211244

212-
## Support This Project
213-
214-
If you find this archive useful, consider supporting its maintenance and hosting:
215-
216-
**Bitcoin**: `bc1qmahlh5eql05w30cgf5taj3n23twmp0f5xcvnnz`
217-
218245
## Deployment
219246

220247
The site is automatically deployed to GitHub Pages on every push to the main branch.
221248

222249
### GitHub Pages Setup
223250

224-
1. Push this repository to GitHub: `https://github.com/epstein-docs/epstein-docs`
251+
1. Push this repository to GitHub: `https://github.com/epstein-docs/epstein-docs.github.io`
225252
2. Go to Settings → Pages
226253
3. Source: GitHub Actions
227254
4. The workflow will automatically build and deploy the site
228255

229-
The site will be available at: `https://epstein-docs.github.io/epstein-docs/`
230-
231-
## License
232-
233-
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
234-
235-
The code in this repository is open source and free to use. The documents themselves are public records.
236-
237-
**Repository**: https://github.com/epstein-docs/epstein-docs
256+
The site will be available at: `https://epstein-docs.github.io/`
238257

239258
## Future: Relationship Graphs
240259

@@ -278,3 +297,17 @@ The deduplication step is essential for accurate relationship mapping - without
278297
## Disclaimer
279298

280299
This is an independent archival project. Documents are sourced from public releases. The maintainers make no representations about completeness or accuracy of the archive.
300+
301+
## License
302+
303+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
304+
305+
The code in this repository is open source and free to use. The documents themselves are public records.
306+
307+
**Repository**: https://github.com/epstein-docs/epstein-docs.github.io
308+
309+
## Support This Project
310+
311+
If you find this archive useful, consider supporting its maintenance and hosting:
312+
313+
**Bitcoin**: `bc1qmahlh5eql05w30cgf5taj3n23twmp0f5xcvnnz`

0 commit comments

Comments
 (0)