Skip to content

Commit

Permalink
feat: docx-pdf conversion (#40)
Browse files Browse the repository at this point in the history
* feat(document): add a way to convert pdf to docx using libreoffice

* feat(documents): add the ability to convert docx to pdf using libreoffice

* test(document): test both new features

* ci(github-actions): install libreoffice on github actions

* refactor(files): add a new method to the file interface to support mime types as they can differ from the known extensions

* refactor(documents): add a new constant to the document package for docx

* refactor(files): add the new method to every struct that implements the file interface

* build(docker): install libreoffice at the release stage

* docs(readme): add a new table on what's new in terms of file conversion

* build(dockerfile): replace debian bookworm with debian trixie to have a more up-to-date image

* refactor(pdf): add two buffers as stdout and stderr to see what's going on when converting pdf to docx

* fix(pdf): use %q to make the filename a double-quoted string safely escaped with Go syntax

* refactor(pdf): print the stderr when it is not empty

* refactor(docx): replicate the changes added to the pdf file
  • Loading branch information
danvergara committed Mar 22, 2024
1 parent 7ba6971 commit 24d6ac1
Show file tree
Hide file tree
Showing 16 changed files with 572 additions and 26 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: '1.21'
go-version: "1.21"
- name: Install libreoffice
run: |
sudo apt-get update
sudo apt-get -y install libreoffice
- name: Install dependencies
run: go get .
- name: Test
Expand Down
8 changes: 7 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,16 @@ COPY . .
RUN GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -o morphos .

# Deploy the application binary into a lean image
FROM debian:bookworm-slim AS release
FROM debian:trixie-slim AS release

WORKDIR /

RUN apt-get update \
&& apt-get install -y --no-install-recommends default-jre libreoffice libreoffice-java-common \
&& apt-get autoremove -y \
&& apt-get purge -y --auto-remove \
&& rm -rf /var/lib/apt/lists/*

COPY --from=builder /app/morphos /bin/morphos
COPY --from=builder /usr/share/fonts /usr/share/fonts

Expand Down
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,16 @@ A modal will pop up with a preview of the converted image.

## Documents X Images

| | PNG | JPEG | GIF | WEBP | TIFF | BMP |
|-------|-------|--------|-------|--------|--------|-------|
| PDF |||||||
| | PNG | JPEG | GIF | WEBP | TIFF | BMP |
| --- | --- | ---- | --- | ---- | ---- | --- |
| PDF | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |

## Documents X Documents

| | DOCX | PDF |
| ---- | ---- | --- |
| PDF | ✅ | |
| DOCX | | ✅ |

## License
The MIT License (MIT). See [LICENSE](LICENSE) file for more details.
4 changes: 2 additions & 2 deletions pkg/files/document_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ func (d *DocumentFactory) NewFile(f string) (File, error) {
switch f {
case documents.PDF:
return documents.NewPdf(d.filename), nil
case documents.DOCX:
return new(documents.Docx), nil
case documents.DOCX, documents.DOCXMIMEType:
return documents.NewDocx(d.filename), nil
default:
return nil, fmt.Errorf("type file file %s not recognized", f)
}
Expand Down
5 changes: 3 additions & 2 deletions pkg/files/documents/documents.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
package documents

const (
DOCX = "docx"
PDF = "pdf"
DOCX = "docx"
DOCXMIMEType = "vnd.openxmlformats-officedocument.wordprocessingml.document"
PDF = "pdf"

imageMimeType = "image/"
imageType = "image"
Expand Down
73 changes: 72 additions & 1 deletion pkg/files/documents/documents_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ type documenter interface {
DocumentType() string
}

func TestPDFToImages(t *testing.T) {
func TestPDFTConvertTo(t *testing.T) {
type input struct {
filename string
mimetype string
Expand Down Expand Up @@ -88,6 +88,77 @@ func TestPDFToImages(t *testing.T) {
mimetype: "application/zip",
},
},
{
name: "pdf to docx",
input: input{
filename: "testdata/bitcoin.pdf",
mimetype: "application/pdf",
targetFileType: "Document",
targetFormat: "docx",
documenter: documents.NewPdf("bitcoin.pdf"),
},
expected: expected{
mimetype: "application/zip",
},
},
}

for _, tc := range tests {
tc := tc
t.Run(tc.name, func(t *testing.T) {
t.Parallel()

inputDoc, err := os.ReadFile(tc.input.filename)
require.NoError(t, err)

detectedFileType := mimetype.Detect(inputDoc)
require.Equal(t, tc.input.mimetype, detectedFileType.String())

outoutFile, err := tc.input.documenter.ConvertTo(
tc.input.targetFileType,
tc.input.targetFormat,
inputDoc,
)

require.NoError(t, err)

detectedFileType = mimetype.Detect(outoutFile)
require.Equal(t, tc.expected.mimetype, detectedFileType.String())
})
}
}

func TestDOCXTConvertTo(t *testing.T) {

type input struct {
filename string
mimetype string
targetFileType string
targetFormat string
documenter documenter
}
type expected struct {
mimetype string
}
var tests = []struct {
name string
input input
expected expected
}{
{

name: "docx to pdf",
input: input{
filename: "testdata/file_sample.docx",
mimetype: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
targetFileType: "Document",
targetFormat: "pdf",
documenter: documents.NewDocx("file_sample.docx"),
},
expected: expected{
mimetype: "application/zip",
},
},
}

for _, tc := range tests {
Expand Down
195 changes: 189 additions & 6 deletions pkg/files/documents/docx.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,200 @@
package documents

import "errors"
import (
"archive/zip"
"bytes"
"errors"
"fmt"
"io"
"log"
"os"
"os/exec"
"path/filepath"
"slices"
"strings"
)

type Docx struct{}
// Docx represents a docx document that can be converted to other formats.
// It implements the File and Document interface from the file package.
type Docx struct {
// filename is the name of the docx file; ConvertTo derives the names of
// its working files and of the resulting PDF from it.
filename string
// compatibleFormats maps a target file type (e.g. "Document") to the
// sub-types Docx can be converted to. It is returned by SupportedFormats.
compatibleFormats map[string][]string
// compatibleMIMETypes maps a target file type to the supported MIME
// sub-types. It is returned by SupportedMIMETypes.
compatibleMIMETypes map[string][]string
// OutDir is not read or written by any code visible in this file.
// NOTE(review): presumably meant as the conversion output directory —
// confirm whether it is still needed.
OutDir string
}

// NewDocx builds a Docx for the given file name. The returned value is
// preconfigured so that the only supported target, for both formats and MIME
// types, is PDF under the "Document" file type.
func NewDocx(filename string) *Docx {
	return &Docx{
		filename:            filename,
		compatibleFormats:   map[string][]string{"Document": {PDF}},
		compatibleMIMETypes: map[string][]string{"Document": {PDF}},
	}
}

func (p *Docx) SupportedFormats() map[string][]string {
return make(map[string][]string)
// SupportedFormats returns the formats Docx can be converted to, keyed by
// target file type (e.g. "Document") with the accepted sub-types as values.
// The map is the one stored on the Docx, not a copy.
func (d *Docx) SupportedFormats() map[string][]string {
return d.compatibleFormats
}

func (p *Docx) ConvertTo(fileType, subType string, fileBytes []byte) ([]byte, error) {
// SupportedMIMETypes returns the MIME types Docx can be converted to, keyed
// by target file type (e.g. "Document") with the accepted sub-types as values.
// The map is the one stored on the Docx, not a copy.
func (d *Docx) SupportedMIMETypes() map[string][]string {
return d.compatibleMIMETypes
}

// ConvertTo converts the docx document held in fileBytes into the format
// identified by fileType and subType.
//
// fileType must be a key of SupportedFormats and subType one of the values
// listed under it; otherwise an error is returned before any work is done.
// The only conversion implemented is Document -> PDF, which is delegated to
// the libreoffice command line tool. On success the returned bytes are a zip
// archive holding a single entry: the generated PDF, named after the
// original file with its extension replaced by ".pdf".
func (d *Docx) ConvertTo(fileType, subType string, fileBytes []byte) ([]byte, error) {
	compatibleFormats, ok := d.SupportedFormats()[fileType]
	if !ok {
		return nil, fmt.Errorf("file type not supported: %s", fileType)
	}

	if !slices.Contains(compatibleFormats, subType) {
		return nil, fmt.Errorf("sub-type not supported: %s", subType)
	}

	if strings.ToLower(fileType) == documentType && subType == PDF {
		return d.zippedPDF(fileBytes)
	}

	return nil, errors.New("not implemented")
}

// zippedPDF writes fileBytes to a private temporary directory, asks
// libreoffice to export it as PDF and returns the PDF wrapped in a zip archive.
func (d *Docx) zippedPDF(fileBytes []byte) ([]byte, error) {
	// Keep only the last path element so a crafted file name such as
	// "../../etc/x.docx" cannot make us write outside the working directory.
	baseName := filepath.Base(d.filename)
	if baseName == "." || baseName == string(filepath.Separator) {
		return nil, fmt.Errorf("invalid docx file name %q", d.filename)
	}
	pdfFileName := strings.TrimSuffix(baseName, filepath.Ext(baseName)) + ".pdf"

	// A unique directory per call keeps concurrent conversions of files with
	// the same name from overwriting each other's input and output.
	workDir, err := os.MkdirTemp("", "docx-to-pdf-*")
	if err != nil {
		return nil, fmt.Errorf(
			"error creating the temporary directory for the conversion: %w",
			err,
		)
	}
	// Best-effort cleanup: a failure to remove the directory must not turn a
	// successful conversion into an error.
	defer os.RemoveAll(workDir)

	docxPath := filepath.Join(workDir, baseName)
	if err := os.WriteFile(docxPath, fileBytes, 0o600); err != nil {
		return nil, fmt.Errorf(
			"error storing the incoming docx file %s: %w",
			d.filename,
			err,
		)
	}

	var (
		stdout bytes.Buffer
		stderr bytes.Buffer
	)

	// Arguments are passed directly to libreoffice instead of through a
	// shell, so no character in the file name is ever interpreted.
	cmd := exec.Command(
		"libreoffice",
		"--headless",
		"--convert-to", "pdf:writer_pdf_Export",
		"--outdir", workDir,
		docxPath,
	)
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		return nil, fmt.Errorf(
			"error converting docx to pdf using libreoffice: %w (stderr: %s)",
			err,
			strings.TrimSpace(stderr.String()),
		)
	}

	// Any output on stderr is treated as a failed conversion, even when the
	// process exits with status 0.
	if stderr.String() != "" {
		return nil, fmt.Errorf(
			"error converting docx to pdf calling libreoffice: %s",
			stderr.String(),
		)
	}

	log.Println(stdout.String())

	// libreoffice names its output after the input file, swapping the
	// extension for ".pdf", and places it in the --outdir directory.
	pdfFile, err := os.Open(filepath.Join(workDir, pdfFileName))
	if err != nil {
		return nil, fmt.Errorf(
			"error at opening the pdf file: %w",
			err,
		)
	}
	defer pdfFile.Close()

	// The archive is assembled in memory; there is no need to round-trip it
	// through another temporary file.
	var archive bytes.Buffer
	zipWriter := zip.NewWriter(&archive)

	entry, err := zipWriter.Create(pdfFileName)
	if err != nil {
		return nil, fmt.Errorf(
			"error at creating the pdf entry in the zip file: %w",
			err,
		)
	}

	if _, err := io.Copy(entry, pdfFile); err != nil {
		return nil, fmt.Errorf(
			"error at writing the pdf file content to the zip writer: %w",
			err,
		)
	}

	// Close writes the zip central directory; without checking its error a
	// truncated archive could be returned as a success.
	if err := zipWriter.Close(); err != nil {
		return nil, fmt.Errorf(
			"error at finalizing the zip file: %w",
			err,
		)
	}

	return archive.Bytes(), nil
}

func (p *Docx) DocumentType() string {
// DocumentType returns the DOCX constant, identifying this document as a
// docx file.
func (d *Docx) DocumentType() string {
return DOCX
}

0 comments on commit 24d6ac1

Please sign in to comment.