## Get the dependencies

Reference the required NuGet packages and import the namespaces needed for web requests, JSON handling, file I/O, and the Azure services. This prepares the notebook for the Document Intelligence and Azure OpenAI steps that follow.

In [None]:
#r "nuget:System.Text.Json, 8.0.1"
#r "nuget:DotNetEnv, 3.0.0"
#r "nuget:Azure.AI.OpenAI, 1.0.0-beta.14"
#r "nuget:Azure.AI.DocumentIntelligence, 1.0.0-beta.2"

using System.Net;
using System.Net.Http;
using System.Text.Json.Nodes;
using System.Text.Json;
using System.IO; 
using Azure;
using Azure.AI.DocumentIntelligence;
using Azure.AI.OpenAI;   

## Deploy the required services and update the code below to match

In [None]:
// Azure OpenAI settings. The endpoint and key are intentionally blank: fill them in
// with the values of your own deployment before running this cell.
string openAIEndpoint = "";
string openAIApiKey = "";
string openAIModelDeployment = "gpt-4-turbo";
// NOTE(review): not read by any code visible in this part of the notebook — confirm it is still needed.
string openAIApiVersion = "2022-02-15-preview";

// Azure AI Document Intelligence settings (also blank until you supply your own values).
string documentIntelligenceEndpoint = "";
string documentIntelligenceApiKey = "";

// Key-authenticated clients shared by the cells below.
DocumentIntelligenceClient documentIntelligenceClient = new(
    new Uri(documentIntelligenceEndpoint),
    new AzureKeyCredential(documentIntelligenceApiKey));
OpenAIClient openAIClient = new(
    new Uri(openAIEndpoint),
    new AzureKeyCredential(openAIApiKey));

## Convert the input PDFs into single markdown files in one shot

In [None]:
// Converts every PDF found in the input folder into a Markdown file with the same base name.
// Requires the documentIntelligenceClient created in the configuration cell.

string inputFolderPath = @".\Input"; 
string outputFolderPath = @".\ToMarkdownDocIntelligence";

// CreateDirectory does nothing when the folder already exists, so no separate check is needed.
Directory.CreateDirectory(outputFolderPath);

foreach (string sourcePdf in Directory.GetFiles(inputFolderPath, "*.pdf"))
{
    // The Markdown file keeps the PDF's base name and lands in the output folder.
    string markdownName = $"{Path.GetFileNameWithoutExtension(sourcePdf)}.md";
    string markdownPath = Path.Combine(outputFolderPath, markdownName);

    // Already converted on a previous run: do not call the service again.
    if (File.Exists(markdownPath))
    {
        Console.WriteLine($"Skipping {sourcePdf}, Markdown output already exists.");
        continue;
    }

    // The whole PDF is sent inline as the request payload.
    byte[] pdfBytes = File.ReadAllBytes(sourcePdf);
    var request = new AnalyzeDocumentContent
    {
        Base64Source = BinaryData.FromBytes(pdfBytes)
    };

    // Run the layout model and wait for completion, asking for Markdown-formatted content.
    var operation = await documentIntelligenceClient.AnalyzeDocumentAsync(
        WaitUntil.Completed,
        "prebuilt-layout",
        request,
        outputContentFormat: ContentFormat.Markdown);

    // Persist the returned Markdown and report progress.
    File.WriteAllText(markdownPath, operation.Value.Content);
    Console.WriteLine($"Processed {sourcePdf} and saved Markdown to {markdownName}");
}


## Convert the single page images from the PDFs into individual markdown files

**Note: this step reads the page images from the subfolders of `.\ToImages` (`.jpeg` files), so generate them with PdfToTextPages.ipynb before running this cell.**

In [None]:
// Converts every JPEG page image found in the subfolders of the input folder into its own
// Markdown file, mirroring the subfolder structure under the output folder.
// Requires the documentIntelligenceClient created in the configuration cell.

string inputFolderPath = @".\ToImages"; 
string outputFolderPath = @".\ToMarkdownDocIntelligencePages";

// CreateDirectory does nothing when the folder already exists, so no separate check is needed.
Directory.CreateDirectory(outputFolderPath);

foreach (string imageFolder in Directory.GetDirectories(inputFolderPath))
{
    // Mirror the input subfolder under the output folder.
    string pagesFolder = Path.Combine(outputFolderPath, Path.GetFileName(imageFolder));
    Directory.CreateDirectory(pagesFolder);

    foreach (string pageImage in Directory.GetFiles(imageFolder, "*.jpeg"))
    {
        // The Markdown file keeps the image's base name.
        string markdownName = $"{Path.GetFileNameWithoutExtension(pageImage)}.md";
        string markdownPath = Path.Combine(pagesFolder, markdownName);

        // Already converted on a previous run: do not call the service again.
        if (File.Exists(markdownPath))
        {
            Console.WriteLine($"Skipping {pageImage}, Markdown output already exists.");
            continue;
        }

        // The whole image is sent inline as the request payload.
        byte[] imageBytes = File.ReadAllBytes(pageImage);
        var request = new AnalyzeDocumentContent
        {
            Base64Source = BinaryData.FromBytes(imageBytes)
        };

        // Run the layout model with the high-resolution OCR feature enabled,
        // wait for completion, and ask for Markdown-formatted content.
        var requestedFeatures = new DocumentAnalysisFeature[] { DocumentAnalysisFeature.OcrHighResolution };
        var operation = await documentIntelligenceClient.AnalyzeDocumentAsync(
            WaitUntil.Completed,
            "prebuilt-layout",
            request,
            features: requestedFeatures,
            outputContentFormat: ContentFormat.Markdown);

        // Persist the returned Markdown and report progress.
        File.WriteAllText(markdownPath, operation.Value.Content);
        Console.WriteLine($"Processed {pageImage} and saved Markdown to {markdownName}");
    }
}


## Stitch the individual pages into a single markdown file

In [None]:
// Stitches the per-page Markdown files of every subfolder of outputFolderPath into a single
// Markdown file named after the subfolder and saved next to it.
// NOTE: outputFolderPath is not declared in this cell; it must still hold the value assigned
// by the page-conversion cell when this cell runs.

// Orders two file names "naturally": runs of ASCII digits are compared by numeric value, so
// "page2" sorts before "page10"; all other characters are compared case-insensitively.
// Returns a negative value, zero, or a positive value like any Comparison<string>.
int CompareNaturally(string left, string right)
{
    int i = 0;
    int j = 0;

    while (i < left.Length && j < right.Length)
    {
        bool leftIsDigit = left[i] >= '0' && left[i] <= '9';
        bool rightIsDigit = right[j] >= '0' && right[j] <= '9';

        if (leftIsDigit && rightIsDigit)
        {
            // Consume the complete digit run on both sides.
            int leftStart = i;
            while (i < left.Length && left[i] >= '0' && left[i] <= '9')
            {
                i++;
            }

            int rightStart = j;
            while (j < right.Length && right[j] >= '0' && right[j] <= '9')
            {
                j++;
            }

            // Compare as numbers without parsing (no overflow on long runs): after dropping
            // leading zeros, the longer run is the larger number; equal lengths compare digit by digit.
            string leftDigits = left.Substring(leftStart, i - leftStart).TrimStart('0');
            string rightDigits = right.Substring(rightStart, j - rightStart).TrimStart('0');

            if (leftDigits.Length != rightDigits.Length)
            {
                return leftDigits.Length.CompareTo(rightDigits.Length);
            }

            int numberOrder = string.CompareOrdinal(leftDigits, rightDigits);
            if (numberOrder != 0)
            {
                return numberOrder;
            }
        }
        else
        {
            int charOrder = char.ToUpperInvariant(left[i]).CompareTo(char.ToUpperInvariant(right[j]));
            if (charOrder != 0)
            {
                return charOrder;
            }

            i++;
            j++;
        }
    }

    // A name that is a prefix of the other sorts first; the ordinal comparison is a final
    // tie-break that keeps the order deterministic (e.g. "01" versus "1").
    int remainingOrder = (left.Length - i).CompareTo(right.Length - j);
    return remainingOrder != 0 ? remainingOrder : string.CompareOrdinal(left, right);
}

// Iterate over each subfolder in the output folder
foreach (string outputSubfolderPath in Directory.GetDirectories(outputFolderPath))
{
    // Get the name of the subfolder to use as the final Markdown file name
    string folderName = Path.GetFileName(outputSubfolderPath);
    string finalMarkdownFilePath = Path.Combine(outputFolderPath, folderName + ".md");

    // Get all Markdown files in the current subfolder. Directory.GetFiles does not guarantee
    // any order, so sort the pages explicitly to stitch them in page order.
    string[] markdownFiles = Directory.GetFiles(outputSubfolderPath, "*.md");
    Array.Sort(
        markdownFiles,
        (left, right) => CompareNaturally(Path.GetFileName(left), Path.GetFileName(right)));

    // Read and concatenate the contents of all Markdown files
    var finalMarkdownContent = new StringBuilder();
    foreach (string markdownFilePath in markdownFiles)
    {
        finalMarkdownContent.AppendLine(File.ReadAllText(markdownFilePath));
        finalMarkdownContent.AppendLine("\n---\n"); // Horizontal-rule separator written after each page
    }

    // Save the concatenated content to the final Markdown file
    File.WriteAllText(finalMarkdownFilePath, finalMarkdownContent.ToString());
    Console.WriteLine($"Created {finalMarkdownFilePath}");
}