Skip to content

Commit f63bc71

Browse files
nickpnickp
authored and committed
initial
0 parents  commit f63bc71

File tree

4,285 files changed

+329261
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

4,285 files changed

+329261
-0
lines changed

.eleventy.js

Lines changed: 216 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,216 @@
1+
const fs = require('fs');
2+
const path = require('path');
3+
4+
module.exports = function(eleventyConfig) {
5+
// Copy results directory to output
6+
eleventyConfig.addPassthroughCopy({ "./results": "documents" });
7+
8+
// Cache the documents data - only compute once
9+
let cachedDocuments = null;
10+
11+
function getDocuments() {
12+
if (cachedDocuments) {
13+
return cachedDocuments;
14+
}
15+
const resultsDir = path.join(__dirname, './results');
16+
const pages = [];
17+
18+
/**
 * Recursively walks `dir` and appends one record per `.json` file to the
 * enclosing `pages` array.
 *
 * Each record holds the file's path relative to the walk root (`path`), the
 * file name without its trailing `.json` extension (`filename`), the folder
 * relative to the walk root (`folder`, or 'root' for files directly in the
 * root), followed by every top-level key of the parsed JSON.
 *
 * A file that cannot be read or parsed is logged and skipped so that one bad
 * file does not abort the walk.
 *
 * @param {string} dir - Directory to scan.
 * @param {string} [relativePath=''] - Path of `dir` relative to the walk root.
 */
function readDocuments(dir, relativePath = '') {
  const jsonExt = '.json';
  const entries = fs.readdirSync(dir, { withFileTypes: true });

  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);
    const relPath = path.join(relativePath, entry.name);

    if (entry.isDirectory()) {
      readDocuments(fullPath, relPath);
      continue;
    }
    if (!entry.name.endsWith(jsonExt)) {
      continue;
    }

    try {
      const content = JSON.parse(fs.readFileSync(fullPath, 'utf8'));
      pages.push({
        path: relPath,
        // Strip only the trailing extension. String.prototype.replace with a
        // string pattern removes the FIRST '.json' anywhere in the name, which
        // mangles names such as 'a.json.b.json'.
        filename: entry.name.slice(0, -jsonExt.length),
        folder: relativePath || 'root',
        // NOTE(review): spread last, so JSON keys named path/filename/folder
        // override the computed values above — confirm this is intended.
        ...content
      });
    } catch (e) {
      console.error(`Error reading ${fullPath}:`, e.message);
    }
  }
}
42+
43+
readDocuments(resultsDir);
44+
45+
// Normalize function to handle LLM inconsistencies in document numbers
46+
/**
 * Canonicalises a document number so that formatting variants map to the
 * same grouping key.
 *
 * The value is stringified and lower-cased. Every run of characters outside
 * a-z / 0-9 (hyphens included) is treated as a separator, and the remaining
 * alphanumeric runs are joined with single hyphens, so the result never has
 * leading, trailing or doubled hyphens.
 *
 * @param {*} docNum - Raw document number.
 * @returns {string|null} `null` for falsy input; otherwise the normalised
 *   key, which is '' when the input contains no ASCII letters or digits.
 */
const normalizeDocNum = (docNum) => {
  if (!docNum) {
    return null;
  }
  const alphanumericRuns = String(docNum)
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((run) => run.length > 0);
  return alphanumericRuns.join('-');
};
55+
56+
// Group pages by NORMALIZED document_number to handle LLM variations
57+
// Maps a grouping key to the list of pages that belong to that document.
// The key is the normalized document_number, or a filename-derived fallback
// when no usable document number is available.
const documentMap = new Map();

pages.forEach(page => {
  const rawDocNum = page.document_metadata?.document_number;
  let groupKey = rawDocNum ? normalizeDocNum(rawDocNum) : null;

  if (!groupKey) {
    if (!rawDocNum) {
      console.warn(`Page ${page.filename} has no document_number, using filename as fallback`);
    } else {
      // A number made only of punctuation/non-ASCII characters normalizes to
      // ''. Without this fallback every such page would be merged into one
      // bogus document whose key is the empty string.
      console.warn(`Page ${page.filename} has document_number "${rawDocNum}" that normalizes to an empty key, using filename as fallback`);
    }
    groupKey = normalizeDocNum(page.filename) || page.filename;
  }

  if (!documentMap.has(groupKey)) {
    documentMap.set(groupKey, []);
  }
  documentMap.get(groupKey).push(page);
});
82+
83+
// Convert to array and sort pages within each document
84+
// Turn each [key, pages] group into one document record: pages sorted by page
// number, entities merged across pages, metadata taken from the first page.
const documents = Array.from(documentMap.entries()).map(([normalizedDocNum, docPages]) => {
  const entityKinds = ['people', 'organizations', 'locations', 'dates', 'reference_numbers'];

  // Explicit radix so the value is always read as decimal; missing or
  // unparsable page numbers sort as 0.
  const pageNumberOf = (page) =>
    Number.parseInt(page.document_metadata?.page_number, 10) || 0;

  // Sorts the group's array in place (the same array is exposed as `pages`).
  docPages.sort((a, b) => pageNumberOf(a) - pageNumberOf(b));

  // One Set per entity kind so values repeated across pages are kept once.
  const allEntities = Object.fromEntries(entityKinds.map(kind => [kind, new Set()]));

  docPages.forEach(page => {
    if (!page.entities) return;
    entityKinds.forEach(kind => {
      const value = page.entities[kind];
      // Calling .forEach on a non-array (e.g. a bare string or number in the
      // source JSON) would throw and abort the whole build. Accept arrays,
      // treat a lone non-empty string as a single item, ignore anything else.
      let items = [];
      if (Array.isArray(value)) {
        items = value;
      } else if (typeof value === 'string' && value) {
        items = [value];
      }
      items.forEach(item => allEntities[kind].add(item));
    });
  });

  // After sorting, the first page supplies the document-level metadata.
  const firstPage = docPages[0];

  // Every distinct folder that holds a page of this document.
  const folders = [...new Set(docPages.map(p => p.folder))];

  // Every distinct raw document number seen on the pages (for display).
  const rawDocNums = [...new Set(docPages.map(p => p.document_metadata?.document_number).filter(Boolean))];

  return {
    unique_id: normalizedDocNum, // Grouping key, used for unique URLs
    document_number: rawDocNums.length === 1 ? rawDocNums[0] : normalizedDocNum, // Original if consistent, else the key
    raw_document_numbers: rawDocNums, // All variations found
    pages: docPages,
    page_count: docPages.length,
    document_metadata: firstPage.document_metadata,
    entities: Object.fromEntries(entityKinds.map(kind => [kind, Array.from(allEntities[kind])])),
    full_text: docPages.map(p => p.full_text).join('\n\n--- PAGE BREAK ---\n\n'),
    folder: folders.join(', '), // All folders if the document spans several
    folders: folders // Same information as an array
  };
});
140+
141+
cachedDocuments = documents;
142+
return documents;
143+
}
144+
145+
// Add global data - load all pages and group into documents
146+
eleventyConfig.addGlobalData("documents", getDocuments);
147+
148+
// Build indices from grouped documents
149+
// Registers the "indices" global data: for people, organizations, locations,
// dates and document types, a list of { name, docs, count } entries sorted by
// descending count, built from the grouped documents.
eleventyConfig.addGlobalData("indices", () => {
  const documentsData = getDocuments();

  const entityKinds = ['people', 'organizations', 'locations', 'dates'];
  const entityBuckets = {};
  for (const kind of entityKinds) {
    entityBuckets[kind] = new Map();
  }
  const documentTypes = new Map();

  // Appends `doc` to the list stored under `key`, creating the list on first use.
  const appendTo = (bucket, key, doc) => {
    if (!bucket.has(key)) {
      bucket.set(key, []);
    }
    bucket.get(key).push(doc);
  };

  for (const doc of documentsData) {
    for (const kind of entityKinds) {
      const values = doc.entities?.[kind];
      if (values) {
        values.forEach((value) => appendTo(entityBuckets[kind], value, doc));
      }
    }

    const docType = doc.document_metadata?.document_type;
    if (docType) {
      appendTo(documentTypes, docType, doc);
    }
  }

  // Converts a name -> docs Map into entries ordered by how many docs each has.
  const toRankedList = (bucket) =>
    Array.from(bucket.entries())
      .map(([name, docs]) => ({ name, docs, count: docs.length }))
      .sort((a, b) => b.count - a.count);

  return {
    people: toRankedList(entityBuckets.people),
    organizations: toRankedList(entityBuckets.organizations),
    locations: toRankedList(entityBuckets.locations),
    dates: toRankedList(entityBuckets.dates),
    documentTypes: toRankedList(documentTypes)
  };
});
207+
208+
return {
209+
dir: {
210+
input: "src",
211+
output: "_site",
212+
includes: "_includes"
213+
},
214+
pathPrefix: "/"
215+
};
216+
};

.env.example

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
# OpenAI-compatible API Configuration
2+
OPENAI_API_URL=http://....
3+
OPENAI_API_KEY=abcd1234
4+
OPENAI_MODEL=meta-llama/Llama-4-Maverick-17B-128E-Instruct

.github/workflows/deploy.yml

Lines changed: 51 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,51 @@
1+
# Builds the site with `npm run build` and publishes ./_site to GitHub Pages.
name: Deploy to GitHub Pages

# Runs on every push to main and on manual dispatch.
on:
  push:
    branches: [main]
  workflow_dispatch:

permissions:
  contents: read
  pages: write
  id-token: write

# Runs share one "pages" group; an in-progress run is never cancelled.
concurrency:
  group: pages
  cancel-in-progress: false

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node
        uses: actions/setup-node@v4
        with:
          node-version: "20"
          cache: npm

      - name: Install dependencies
        run: npm ci

      - name: Build with Eleventy
        run: npm run build

      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: ./_site

  # Starts only after the build job has finished.
  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-latest
    needs: build
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4

.gitignore

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,26 @@
1+
# Downloaded images
2+
downloads/
3+
4+
# Environment variables
5+
.env
6+
7+
# Dependencies
8+
node_modules/
9+
10+
# Build output
11+
_site/
12+
13+
# Python cache
14+
__pycache__/
15+
*.pyc
16+
*.pyo
17+
18+
# IDE
19+
.vscode/
20+
.idea/
21+
*.swp
22+
*.swo
23+
24+
# OS
25+
.DS_Store
26+
Thumbs.db

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2025 Epstein Files Archive
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

0 commit comments

Comments
 (0)