#### Setup

In [1]:
import os
import fitz
import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

%load_ext autoreload
%autoreload 2
from clean_extract import extract_text_from_pdf, convert_attachments_to_txt

In [2]:
feedbacks = pd.read_csv(
    "../24212003_requirements_for_artificial_intelligence/patched_feedbacks.csv"
)
attachments = pd.read_csv(
    "../24212003_requirements_for_artificial_intelligence/attachments.csv"
)
df = pd.merge(feedbacks, attachments)
df["filename"] = (
    "../24212003_requirements_for_artificial_intelligence/" + df["filename"]
)

#### Convert attachments to txt

In [3]:
convert_attachments_to_txt(source_df=df)



#### Text cleanup removes some (empty) pages

In [4]:
files = [
    "../24212003_requirements_for_artificial_intelligence/attachments/" + file
    for file in os.listdir(
        "../24212003_requirements_for_artificial_intelligence/attachments/"
    )
    if file.endswith(".pdf")
]
for file in files:
    output = extract_text_from_pdf(
        file,
    )
    pages = len(output.split("\n\n"))
    pages_ = fitz.open(file).page_count  # type: ignore
    if pages != pages_:
        print(f"File {file[file.rindex('/')+1:]}")
        print(
            f"{str(pages).rjust(2)} and {str(pages_).rjust(2)} pages extracted and "
            + "expected, respectively"
        )
        print()

File 2665618.pdf
 6 and  8 pages extracted and expected, respectively

File 2665508.pdf
21 and 22 pages extracted and expected, respectively

File 2665462.pdf
30 and 31 pages extracted and expected, respectively

File 2665436.pdf
 1 and  8 pages extracted and expected, respectively

File 2662925.pdf
 3 and  4 pages extracted and expected, respectively



Why?
- 2665618 and 2662925: non-text page(s) 
- 2665508 and 2665462: empty page
- 2665436: file that can not be read in

#### Multiprocessed conversion leads to massive increase in performance

In [5]:
import multiprocessing

print(f"Local CPU: {multiprocessing.cpu_count()} cores")

Local CPU: 4 cores


In [6]:
if not os.path.isdir("/tmp/pdf_extraction"):
    os.mkdir("/tmp/pdf_extraction")
times = []
for _ in tqdm(range(50)):
    start = time.time()
    convert_attachments_to_txt(source_df=df, target_folder="/tmp/pdf_extraction/")
    times = time.time() - start
np.mean(times)

  0%|          | 0/50 [00:00<?, ?it/s]





12.953213691711426

In [7]:
times = []
for _ in tqdm(range(50)):
    start = time.time()
    convert_attachments_to_txt(
        source_df=df, n_jobs=1, target_folder="/tmp/pdf_extraction/"
    )
    times = time.time() - start
np.mean(times)

  0%|          | 0/50 [00:00<?, ?it/s]



24.381266593933105

##### Results from Google Colab:

- 11.6s without multiprocessing (n=50)
- 9.9 with multiprocessing (n_jobs=2, n=50)
- We can conclude that a larger number of cores is likely to make multiprocessing more beneficial

#### Dataloader demo

In [8]:
from dataloader import Dataloader

In [9]:
df = pd.read_csv(
    "../24212003_requirements_for_artificial_intelligence/patched_feedbacks.csv"
)
df = df
df = Dataloader("document", tokenize=False).from_folder(
    "../24212003_requirements_for_artificial_intelligence/attachments/", df
)
df.head()

As n_jobs=-1 <= 0, enabling multiprocessing with 4 cores!


Unnamed: 0,id,text,language,country,user_type,organization,surname,feedback,status,company_size,...,date_feedback,publication,publication_id,publication_status,tr_number,scope,governance_level,full_name,source,language_detected
0,2665651,Equinet welcomes the opportunity to provide co...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,2021-08-06 23:57:37,anonymous,24212003,closed,,,,,attachment,en
1,2665650,AI Austria welcomes the opportunity to comment...,en,AUT,ngo,AI Austria,Gorzala,AI Austria welcomes the opportunity to comment...,PUBLISHED,small,...,2021-08-06 23:55:26,withinfo,24212003,closed,,,,Jeannette Gorzala,attachment,en
2,2665649,This submission to the AIA consultation is sen...,en,DEU,ngo,Digitalcourage e.V.,,This submission to the AIA consultation is sen...,PUBLISHED,small,...,2021-08-06 23:53:39,anonymous,24212003,closed,,,,,attachment,en
3,2665648,The EU AI Act is an important step in the righ...,en,USA,academic_research_institution,UC Berkeley Center for Human-Compatible AI,,The EU AI Act is an important step in the righ...,PUBLISHED,small,...,2021-08-06 23:53:31,anonymous,24212003,closed,,,,,attachment,en
4,2665647,In response to the European Commission’s reque...,en,USA,company,CrowdStrike,,In response to the European Commission’s reque...,PUBLISHED,large,...,2021-08-06 23:50:16,anonymous,24212003,closed,,,,,attachment,en


In [10]:
len(df)

299

In [11]:
for _, row in df.query("language != language_detected").iterrows():
    print("#"*100)
    print(row["id"], row["language"], row["language_detected"])
    if row["source"] == "attachment":
        print(row["text"].split("\n\n")[1][:200])
    else:
        print(row["text"][:200])
    print("#"*100, "\n")

####################################################################################################
2665617 fr en
About the FFA 
The French Insurance Federation (FFA) represents 280 insurance and reinsurance companies operating in France, accounting for over 
99% of the French insurance market. We represent the i
#################################################################################################### 

####################################################################################################
2665479 nl en
Position Paper on proposed Artificial 
Intelligence Act (AIA) 
Who we are and why our input matters 
We are the Belgian, Dutch, French and German CIO-associations; the communities of Chief 
Informatio
#################################################################################################### 

####################################################################################################
2665472 en pl
ZPP od dawna podkreślał, że przygotowanie adekw

In [13]:
df = pd.read_csv(
    "../24212003_requirements_for_artificial_intelligence/patched_feedbacks.csv"
).iloc[:10]
df = Dataloader("page").from_folder(
    "../24212003_requirements_for_artificial_intelligence/attachments/", df
)
df.head()

As n_jobs=-1 <= 0, enabling multiprocessing with 4 cores!
As n_jobs=-1 <= 0, enabling multiprocessing with 4 cores!


Unnamed: 0,id,text,language,country,user_type,organization,surname,feedback,status,company_size,...,publication,publication_id,publication_status,tr_number,scope,governance_level,full_name,source,language_detected,tokenized
0,2665651,Equinet welcomes the opportunity to provide co...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,anonymous,24212003,closed,,,,,attachment,en,"[equinet, welcome, the, opportunity, to, provi..."
1,2665651,Equinet’s feedback to the European Commission'...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,anonymous,24212003,closed,,,,,attachment,en,"[equinet, feedback, to, the, european, commiss..."
2,2665651,"equality bodies, alongside with other sectoral...",en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,anonymous,24212003,closed,,,,,attachment,en,"[equality, body, alongside, with, other, secto..."
3,2665651,investment in 1) digital literacy for those wh...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,anonymous,24212003,closed,,,,,attachment,en,"[investment, in, 1, digital, literacy, for, th..."
4,2665651,4. Require equality and human rights impact as...,en,BEL,ngo,Equinet,,Equinet welcomes the opportunity to provide co...,PUBLISHED,micro,...,anonymous,24212003,closed,,,,,attachment,en,"[4, require, equality, and, human, right, impa..."
