# Extract text page by page and split it into sentences

In [1]:
from pathlib import Path
import re
import json
# !pip install pymupdf
import fitz  # PyMuPDF

In [2]:
# Input report to process. Every artefact written later in this notebook is
# derived from this path via `with_suffix`, so it lands in the same directory.
# NOTE(review): this is an absolute, machine-specific path, and the directory
# name "reports " carries a trailing space. The recorded output shows the path
# exists, so the space looks intentional — confirm before "fixing" it, and
# consider a configurable data directory instead.
PDF_PATH = Path("/home/lila/github-clones/X_colloctor/data/reports /38. Atria Oyj_annual_2020.pdf")
# Sanity check: displays (exists?, path).
PDF_PATH.exists(), PDF_PATH


(True,
 PosixPath('/home/lila/github-clones/X_colloctor/data/reports /38. Atria Oyj_annual_2020.pdf'))

In [3]:
def extract_pages_pymupdf(pdf_path: Path) -> list[dict]:
    """Read the plain text of every page of a PDF with PyMuPDF.

    Args:
        pdf_path: Location of the PDF file to open.

    Returns:
        One dict per page, in document order, shaped as
        ``{"page": <1-based page number>, "text": <page text>}``.
        A falsy result from ``get_text`` is stored as ``""``.
    """
    with fitz.open(pdf_path) as doc:
        return [
            {"page": number, "text": page.get_text("text") or ""}
            for number, page in enumerate(doc, start=1)
        ]

# Extract the raw text of every page; `pages` is the list of per-page dicts
# that the following cells enrich and split into sentences.
pages = extract_pages_pymupdf(PDF_PATH)
# Quick check: number of pages and the page number stored on the first record.
len(pages), pages[0]["page"]


(140, 1)

In [4]:
# Preview the first 1000 characters of the raw text of the first page
# (line breaks are still present at this stage).
print(pages[0]["text"][:1000])


Front page
Atria in brief
Atria’s key 
indicators
CEO’s review 
 
Strategy  
2021–2025
Strategy  
2016–2020
Operating 
environment
Business area 
reviews
Atria Finland
Atria Sweden
Atria Denmark & 
Estonia
Atria Russia
Research and 
Development 
 
Financial 
Statements 
and Corporate 
Governance
Financial 
Statements 
and Corporate 
Governance
Auditor’s report
Corporate 
Governance 
Statement
Investor reporting
Contact 
information
2020
Annual Report



In [5]:
def normalize_text(s: str) -> str:
    """Flatten extracted page text into a single, cleanly spaced line.

    Steps, in order:
      1. Re-join words hyphenated across a line break
         ("coro-\\nnavirus" -> "coronavirus").
      2. Turn every run of tabs / carriage returns / newlines into one space.
      3. Collapse runs of two or more whitespace characters into one space
         and trim both ends.

    Args:
        s: Raw text as extracted from a page.

    Returns:
        The normalised text.
    """
    dehyphenated = re.sub(r"(\w)-\n(\w)", r"\1\2", s)
    single_line = re.sub(r"[\t\r\n]+", " ", dehyphenated)
    return re.sub(r"\s{2,}", " ", single_line).strip()


In [6]:
# Store a normalised copy of each page's text under the "text_norm" key,
# leaving the raw "text" entry untouched.
for page_rec in pages:
    page_rec.update(text_norm=normalize_text(page_rec["text"]))

# Preview the first 1000 characters of the normalised first page.
print(pages[0]["text_norm"][:1000])


Front page Atria in brief Atria’s key indicators CEO’s review Strategy 2021–2025 Strategy 2016–2020 Operating environment Business area reviews Atria Finland Atria Sweden Atria Denmark & Estonia Atria Russia Research and Development Financial Statements and Corporate Governance Financial Statements and Corporate Governance Auditor’s report Corporate Governance Statement Investor reporting Contact information 2020 Annual Report


In [7]:
# Abbreviations whose periods must not be mistaken for sentence ends.
# Entries without a period ("EU", "EUR") need no protection and are ignored
# by the splitter; they are kept so the list itself is unchanged.
_ABBREV = [
    "Mr.", "Ms.", "Mrs.", "Dr.", "Prof.", "Sr.", "Jr.",
    "e.g.", "i.e.", "etc.", "vs.", "No.", "Fig.", "pp.",
    "Inc.", "Ltd.", "Co.", "Corp.", "St.", "EU", "EUR", "Tel.",
    "Jan.", "Feb.", "Mar.", "Apr.", "Jun.", "Jul.", "Aug.", "Sep.",
    "Sept.", "Oct.", "Nov.", "Dec."
]

def split_sentences(text: str) -> list[str]:
    """Split text into sentences on ``.``, ``!`` or ``?`` followed by whitespace.

    Periods belonging to a known abbreviation (see ``_ABBREV``) are temporarily
    replaced by a placeholder so they do not trigger a split. An abbreviation
    is only recognised when it is not preceded by a word character; otherwise
    "pp." would match inside "app." and swallow a real sentence boundary.

    Args:
        text: Text to split, expected to be already whitespace-normalised.

    Returns:
        The stripped sentences, in order. Fragments shorter than two
        characters are dropped. Empty input yields an empty list.
    """
    if not text:
        return []

    placeholder = "<DOT>"

    # Longest-first so "Sept." is preferred over any shorter overlapping entry.
    dotted = sorted((ab for ab in _ABBREV if "." in ab), key=len, reverse=True)
    protected = text
    if dotted:
        abbrev_re = re.compile(
            r"(?<!\w)(?:" + "|".join(re.escape(ab) for ab in dotted) + r")"
        )
        protected = abbrev_re.sub(
            lambda m: m.group(0).replace(".", placeholder), text
        )

    parts = re.split(r"(?<=[.!?])\s+", protected)

    sents = []
    for part in parts:
        # Restore the protected periods before emitting the sentence.
        part = part.replace(placeholder, ".").strip()
        if len(part) >= 2:
            sents.append(part)
    return sents


In [8]:
# Smoke-test the splitter on the normalised first page.
test_sents = split_sentences(pages[0]["text_norm"])
# Shows the sentence count and up to the first five sentences. The recorded
# output is a single "sentence" because that page has no terminal punctuation.
len(test_sents), test_sents[:5]


(1,
 ['Front page Atria in brief Atria’s key indicators CEO’s review Strategy 2021–2025 Strategy 2016–2020 Operating environment Business area reviews Atria Finland Atria Sweden Atria Denmark & Estonia Atria Russia Research and Development Financial Statements and Corporate Governance Financial Statements and Corporate Governance Auditor’s report Corporate Governance Statement Investor reporting Contact information 2020 Annual Report'])

In [9]:
# One record per sentence: the page it came from, its 1-based position on
# that page, and the sentence text. Pages are processed in document order.
records = [
    {"page": page_rec["page"], "sent_id_on_page": position, "sentence": sentence}
    for page_rec in pages
    for position, sentence in enumerate(split_sentences(page_rec["text_norm"]), start=1)
]

len(records), records[0]


(2462,
 {'page': 1,
  'sent_id_on_page': 1,
  'sentence': 'Front page Atria in brief Atria’s key indicators CEO’s review Strategy 2021–2025 Strategy 2016–2020 Operating environment Business area reviews Atria Finland Atria Sweden Atria Denmark & Estonia Atria Russia Research and Development Financial Statements and Corporate Governance Financial Statements and Corporate Governance Auditor’s report Corporate Governance Statement Investor reporting Contact information 2020 Annual Report'})

In [10]:
# Print the first 20 sentences in the same "[p.NNN] sentence" layout that is
# written to the .sentences.txt file further down.
for r in records[:20]:
    print(f"[p.{r['page']:03d}] {r['sentence']}")


[p.001] Front page Atria in brief Atria’s key indicators CEO’s review Strategy 2021–2025 Strategy 2016–2020 Operating environment Business area reviews Atria Finland Atria Sweden Atria Denmark & Estonia Atria Russia Research and Development Financial Statements and Corporate Governance Financial Statements and Corporate Governance Auditor’s report Corporate Governance Statement Investor reporting Contact information 2020 Annual Report
[p.002] Front page Atria in brief Atria’s key indicators CEO’s review Strategy 2021–2025 Strategy 2016–2020 Operating environment Business area reviews Atria Finland Atria Sweden Atria Denmark & Estonia Atria Russia Research and Development Financial Statements and Corporate Governance Financial Statements and Corporate Governance Auditor’s report Corporate Governance Statement Investor reporting Contact information 2 Atria’s Annual Report 2020 Established in 1903, Atria is one of the leading meat and food companies in the Nordic countries, Russia and Est

In [11]:
# Both outputs are placed next to the PDF; with_suffix swaps the trailing ".pdf".
out_jsonl = PDF_PATH.with_suffix(".sentences.jsonl")
out_txt = PDF_PATH.with_suffix(".sentences.txt")

# JSON Lines: one record per line, non-ASCII characters written verbatim.
with out_jsonl.open("w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(
        json.dumps(rec, ensure_ascii=False) + "\n" for rec in records
    )

# Plain text: "[p.NNN] sentence", one sentence per line.
with out_txt.open("w", encoding="utf-8") as txt_file:
    txt_file.writelines(
        f"[p.{rec['page']:03d}] {rec['sentence']}\n" for rec in records
    )

out_jsonl, out_txt


(PosixPath('/home/lila/github-clones/X_colloctor/data/reports /38. Atria Oyj_annual_2020.sentences.jsonl'),
 PosixPath('/home/lila/github-clones/X_colloctor/data/reports /38. Atria Oyj_annual_2020.sentences.txt'))

# Translation

In [12]:
from pathlib import Path
import re
import csv
import pandas as pd


# Input of the translation stage: the "[p.NNN] sentence" text file written by
# the extraction stage above (same path as `out_txt`).
TXT_PATH = PDF_PATH.with_suffix(".sentences.txt")   # created earlier
# Output of the translation stage: source sentence plus translation, as CSV.
CSV_OUT  = PDF_PATH.with_suffix(".sentences_fi_en.csv")

# Sanity check: displays (path, exists?).
TXT_PATH, TXT_PATH.exists()


(PosixPath('/home/lila/github-clones/X_colloctor/data/reports /38. Atria Oyj_annual_2020.sentences.txt'),
 True)

In [13]:
# If needed (uncomment):
# !pip -q install transformers sentencepiece sacremoses torch --upgrade

import torch
from transformers import pipeline

# Hugging Face model id used for translation.
# NOTE(review): going by its name this is a Finnish -> English model — confirm
# that it matches the language of the report being processed.
MODEL_NAME = "Helsinki-NLP/opus-mt-fi-en"

# Pipeline device index: 0 (first CUDA GPU) when available, otherwise -1 (CPU).
device = 0 if torch.cuda.is_available() else -1
# Global translation pipeline; `translate_sentences` below calls it directly.
translator = pipeline("translation", model=MODEL_NAME, device=device)

# Display which device was selected.
device


  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()


0

In [14]:
# Matches lines written by the extraction stage: "[p.012] sentence text".
line_re = re.compile(r"^\[p\.(\d+)\]\s+(.*)$")

rows = []
with TXT_PATH.open("r", encoding="utf-8") as txt_file:
    for raw_line in txt_file:
        stripped = raw_line.strip()
        if stripped:
            hit = line_re.match(stripped)
            if hit is None:
                # Keep unparseable lines rather than dropping them; page is unknown.
                rows.append({"page": None, "sentence_fi": stripped})
            else:
                rows.append({"page": int(hit.group(1)), "sentence_fi": hit.group(2)})

# One row per sentence; "sentence_fi" holds the source text to translate.
df = pd.DataFrame(rows)
df.head(), len(df)


(   page                                        sentence_fi
 0     1  Front page Atria in brief Atria’s key indicato...
 1     2  Front page Atria in brief Atria’s key indicato...
 2     2  Our company is highly appreciated by our custo...
 3     2  We have been producing food for more than 100 ...
 4     2  Atria’s renewal and growth are based on commer...,
 2462)

In [15]:
def normalize_spaces(s: str) -> str:
    """Trim ``s`` and replace every internal run of whitespace with one space."""
    return " ".join(re.split(r"\s+", s.strip()))

def chunk_text(s: str, max_chars: int = 400) -> list[str]:
    """Split ``s`` into chunks of at most ``max_chars`` characters.

    Text that already fits is returned as a single chunk. Otherwise each cut
    is placed, in order of preference:

      1. right after sentence punctuation (``.``, ``!``, ``?``) that is
         followed by whitespace, so decimal points such as "1.5" are never
         treated as sentence ends;
      2. at the last whitespace in the window, so words are not cut in half;
      3. at exactly ``max_chars`` characters (hard split) if neither exists.

    A cut is only accepted if it leaves a chunk longer than
    ``min(50, max_chars // 2)`` characters, which avoids emitting tiny
    fragments. Once the remaining text fits in ``max_chars`` it is emitted
    whole rather than being split again.

    Args:
        s: Text to split; leading/trailing whitespace is ignored.
        max_chars: Maximum chunk length in characters; must be positive.

    Returns:
        The stripped, non-empty chunks in order. Text that already fits
        (including the empty string) is returned unchanged as ``[s.strip()]``.

    Raises:
        ValueError: If ``max_chars`` is not positive (the loop could never
            make progress in that case).
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    s = s.strip()
    if len(s) <= max_chars:
        return [s]

    min_chunk = min(50, max_chars // 2)
    chunks = []
    start = 0
    while start < len(s):
        end = start + max_chars
        if end >= len(s):
            # The remainder fits: keep it whole.
            chunks.append(s[start:].strip())
            break

        cut = -1
        # Preferred: sentence punctuation followed by whitespace.
        # s[i + 1] is always valid here because i + 1 <= end < len(s).
        for i in range(end - 1, start + min_chunk, -1):
            if s[i] in ".!?" and s[i + 1].isspace():
                cut = i + 1
                break
        if cut == -1:
            # Fallback: break between words.
            for i in range(end, start + min_chunk, -1):
                if s[i].isspace():
                    cut = i
                    break
        if cut == -1:
            # Last resort: hard split at the size limit.
            cut = end

        chunks.append(s[start:cut].strip())
        start = cut
    return [c for c in chunks if c]

# Coerce every source sentence to str and collapse its whitespace (in place).
df["sentence_fi"] = df["sentence_fi"].astype(str).map(normalize_spaces)
# Character length per sentence, used to judge how many rows need chunking.
df["n_chars"] = df["sentence_fi"].map(len)

# Length distribution; the recorded maximum (3411) is far above the
# 400-character chunk size used by the translation step.
df["n_chars"].describe()


count    2462.000000
mean      187.870431
std       266.465382
min         2.000000
25%        80.000000
50%       119.000000
75%       181.000000
max      3411.000000
Name: n_chars, dtype: float64

In [16]:
from math import ceil

def translate_sentences(sentences: list[str], batch_size: int = 32) -> list[str]:
    """Translate sentences with the global ``translator`` pipeline.

    Sentences are processed in groups of ``batch_size``. Each sentence is cut
    with ``chunk_text(..., max_chars=400)``; all chunks of a group are sent to
    the pipeline in one call, and the translated chunks of a sentence are
    joined with single spaces. A progress line is printed after every group.

    Args:
        sentences: Source sentences, one translation is produced per entry.
        batch_size: Number of sentences handled per pipeline call.

    Returns:
        Translated texts, aligned one-to-one with ``sentences``.
    """
    total = len(sentences)
    n_batches = ceil(total / batch_size)
    out = []

    for batch_no, lo in enumerate(range(0, total, batch_size), start=1):
        batch = sentences[lo:lo + batch_size]

        # Chunk every sentence, then flatten while remembering which
        # sentence (by position in the batch) each chunk belongs to.
        pieces_per_item = [chunk_text(text, max_chars=400) for text in batch]
        flat_inputs = [piece for pieces in pieces_per_item for piece in pieces]
        owners = [pos for pos, pieces in enumerate(pieces_per_item) for _ in pieces]

        preds = translator(flat_inputs)

        # Regroup the translated chunks under their originating sentence.
        gathered = [[] for _ in batch]
        for pred, pos in zip(preds, owners):
            gathered[pos].append(pred["translation_text"])

        out.extend(" ".join(parts).strip() for parts in gathered)

        print(f"Translated batch {batch_no}/{n_batches} ({len(out)}/{total})")

    return out

# Translate every source sentence and store the result next to it.
# NOTE(review): the recorded preview shows the source sentences are already in
# English and the "translation" alters them (e.g. "appreciated" -> "applied",
# "creates" -> "creams"). Confirm that running this model over the report is
# intended, since the emotion scores below are computed on `sentence_en`.
sentences = df["sentence_fi"].tolist()
df["sentence_en"] = translate_sentences(sentences, batch_size=32)

# Side-by-side preview of the first ten rows.
df[["page", "sentence_fi", "sentence_en"]].head(10)


Translated batch 1/77 (32/2462)
Translated batch 2/77 (64/2462)
Translated batch 3/77 (96/2462)
Translated batch 4/77 (128/2462)
Translated batch 5/77 (160/2462)
Translated batch 6/77 (192/2462)
Translated batch 7/77 (224/2462)
Translated batch 8/77 (256/2462)
Translated batch 9/77 (288/2462)
Translated batch 10/77 (320/2462)




Translated batch 11/77 (352/2462)
Translated batch 12/77 (384/2462)
Translated batch 13/77 (416/2462)
Translated batch 14/77 (448/2462)
Translated batch 15/77 (480/2462)
Translated batch 16/77 (512/2462)
Translated batch 17/77 (544/2462)
Translated batch 18/77 (576/2462)
Translated batch 19/77 (608/2462)
Translated batch 20/77 (640/2462)
Translated batch 21/77 (672/2462)
Translated batch 22/77 (704/2462)
Translated batch 23/77 (736/2462)
Translated batch 24/77 (768/2462)
Translated batch 25/77 (800/2462)
Translated batch 26/77 (832/2462)
Translated batch 27/77 (864/2462)
Translated batch 28/77 (896/2462)
Translated batch 29/77 (928/2462)
Translated batch 30/77 (960/2462)
Translated batch 31/77 (992/2462)
Translated batch 32/77 (1024/2462)
Translated batch 33/77 (1056/2462)
Translated batch 34/77 (1088/2462)
Translated batch 35/77 (1120/2462)
Translated batch 36/77 (1152/2462)
Translated batch 37/77 (1184/2462)
Translated batch 38/77 (1216/2462)
Translated batch 39/77 (1248/2462)
Transl

Unnamed: 0,page,sentence_fi,sentence_en
0,1,Front page Atria in brief Atria’s key indicato...,Front page Atria in brief Atria’s key indicato...
1,2,Front page Atria in brief Atria’s key indicato...,Front page Atria in brief Atria’s key indicato...
2,2,Our company is highly appreciated by our custo...,Our company is highly applied by our customers...
3,2,We have been producing food for more than 100 ...,We have been producing food for more than 100 ...
4,2,Atria’s renewal and growth are based on commer...,Atria’s renewal and grow are based on commerci...
5,2,"Our main product, good food, creates a better ...","Our main product, good food, creams a better f..."
6,2,"In 2020, our net sales were approximately EUR ...","In 2020, our net sales were approximately EUR ..."
7,2,Atria Plc’s shares have been listed on the Nas...,Atria Plc’s share have been listed on the Nasd...
8,2,AT R I A P RODUC E S S U S TA I NA BL E VA LU E 2,AT R I A P RODUC E S S U S TA I NA BL E VA LU E 2
9,3,Front page Atria in brief Atria’s key indicato...,Front page Atria in brief Atria’s key indicato...


In [17]:
# Give every sentence a stable 1-based id that follows the row order.
df = df.reset_index(drop=True)
df["line_id"] = df.index + 1

# Column subset exported to CSV (the helper column "n_chars" is left out).
df_out = df[["line_id", "page", "sentence_fi", "sentence_en"]]

# Write the translation table; CSV_OUT is the ".sentences_fi_en.csv" path here.
df_out.to_csv(CSV_OUT, index=False, encoding="utf-8")
CSV_OUT, CSV_OUT.exists()


(PosixPath('/home/lila/github-clones/X_colloctor/data/reports /38. Atria Oyj_annual_2020.sentences_fi_en.csv'),
 True)

In [18]:
# Reproducible spot check (fixed random_state) of ten source/translation pairs.
df_out.sample(10, random_state=0)


Unnamed: 0,line_id,page,sentence_fi,sentence_en
1831,1832,97,All business transactions that are entered int...,All business transactions that are entered int...
2090,2091,117,128 Remuneration ................................,128 Remuneration ................................
148,149,10,Channels • Demand and new opportunities in the...,Channels • Demand and new opportunities in the...
910,911,50,Atria’s operations could also become the targe...,Atria’s operations would also become the targe...
2207,2208,126,The members of the Board of Directors are obli...,The members of the Board of Directors are requ...
1587,1588,79,The discount’s impact on the financial period ...,The discount’s impact on the financial period ...
1656,1657,87,The maximum credit risk for loans and other re...,The maximum credit risk for loans and other cl...
1203,1204,59,"In 2020, Atria Group continued to implement th...","In 2020, Atria Group continued to implement th..."
1047,1048,55,"If materialised, the translation difference re...","If materialised, the translation difference re..."
2389,2390,134,Shortly about the remuneration in 2020 The Ann...,Shortly about the restoration in 2020 The Annu...


# Emotion analysis

In [19]:
# If needed (uncomment):
# !pip -q install transformers torch --upgrade

import json
import torch
import pandas as pd
from transformers import pipeline
from pathlib import Path


In [20]:
# Input: the translation CSV written by the previous section.
CSV_IN  = PDF_PATH.with_suffix(".sentences_fi_en.csv")
# Output of this section.
# NOTE(review): this rebinds CSV_OUT, which the translation section used for
# the ".sentences_fi_en.csv" file. Re-running the earlier export cell after
# this one would write to the emotions path instead — consider a distinct name.
CSV_OUT = PDF_PATH.with_suffix(".sentences_fi_en_emotions.csv")


# Reload from disk so this section can run without redoing the translation.
df = pd.read_csv(CSV_IN)
df.head(), df.columns, len(df)

(   line_id  page                                        sentence_fi  \
 0        1     1  Front page Atria in brief Atria’s key indicato...   
 1        2     2  Front page Atria in brief Atria’s key indicato...   
 2        3     2  Our company is highly appreciated by our custo...   
 3        4     2  We have been producing food for more than 100 ...   
 4        5     2  Atria’s renewal and growth are based on commer...   
 
                                          sentence_en  
 0  Front page Atria in brief Atria’s key indicato...  
 1  Front page Atria in brief Atria’s key indicato...  
 2  Our company is highly applied by our customers...  
 3  We have been producing food for more than 100 ...  
 4  Atria’s renewal and grow are based on commerci...  ,
 Index(['line_id', 'page', 'sentence_fi', 'sentence_en'], dtype='object'),
 2462)

In [21]:
# Hugging Face model id of the emotion classifier.
MODEL_EMO = "SamLowe/roberta-base-go_emotions"

# Pipeline device index: 0 (first CUDA GPU) when available, otherwise -1 (CPU).
device = 0 if torch.cuda.is_available() else -1

# Global classification pipeline; `analyze_emotions` below calls it directly.
emo_pipe = pipeline(
    task="text-classification",
    model=MODEL_EMO,
    device=device,
    top_k=None,             # return ALL labels
    truncation=True,        # avoid crashing on long inputs
)

# Display which device was selected.
device




0

In [22]:
from math import ceil

def analyze_emotions(texts: list[str], batch_size: int = 32):
    """Score texts with the global ``emo_pipe`` classifier, batch by batch.

    Missing values (as detected by ``pd.isna``) are scored as empty strings;
    everything else is converted with ``str``. A progress line is printed
    after each batch.

    Args:
        texts: Texts to classify.
        batch_size: Number of texts passed to the pipeline per call.

    Returns:
        A tuple of three lists aligned with ``texts``:
          top_label: list[str]    -- highest-scoring label ("" if no scores)
          top_score: list[float]  -- score of that label (0.0 if no scores)
          scores_json: list[str]  -- JSON object mapping every label to its score
    """
    top_labels, top_scores, scores_json = [], [], []

    total = len(texts)
    n_batches = ceil(total / batch_size)

    for b in range(n_batches):
        lo = b * batch_size
        # Guarantee plain strings; NaN (e.g. from read_csv) becomes "".
        cleaned = ["" if pd.isna(t) else str(t) for t in texts[lo:lo + batch_size]]

        # The pipeline yields, per input, a list of {"label": ..., "score": ...}.
        for label_scores in emo_pipe(cleaned):
            score_by_label = {entry["label"]: float(entry["score"]) for entry in label_scores}

            if score_by_label:
                best_label = max(score_by_label, key=score_by_label.get)
                best_score = score_by_label[best_label]
            else:
                best_label, best_score = "", 0.0

            top_labels.append(best_label)
            top_scores.append(best_score)
            scores_json.append(json.dumps(score_by_label, ensure_ascii=False))

        print(f"Emotion batch {b+1}/{n_batches} ({len(top_labels)}/{total})")

    return top_labels, top_scores, scores_json

# Classify the English column; rows that read_csv loaded as NaN are handled
# inside analyze_emotions.
texts_en = df["sentence_en"].tolist()

# Unpack the three aligned result lists straight into new columns.
df["top_emotion"], df["top_emotion_score"], df["emotion_scores_json"] = analyze_emotions(
    texts_en,
    batch_size=32
)

# Preview the dominant emotion of the first ten sentences.
df[["sentence_en", "top_emotion", "top_emotion_score"]].head(10)


Emotion batch 1/77 (32/2462)
Emotion batch 2/77 (64/2462)
Emotion batch 3/77 (96/2462)
Emotion batch 4/77 (128/2462)
Emotion batch 5/77 (160/2462)
Emotion batch 6/77 (192/2462)
Emotion batch 7/77 (224/2462)
Emotion batch 8/77 (256/2462)
Emotion batch 9/77 (288/2462)
Emotion batch 10/77 (320/2462)




Emotion batch 11/77 (352/2462)
Emotion batch 12/77 (384/2462)
Emotion batch 13/77 (416/2462)
Emotion batch 14/77 (448/2462)
Emotion batch 15/77 (480/2462)
Emotion batch 16/77 (512/2462)
Emotion batch 17/77 (544/2462)
Emotion batch 18/77 (576/2462)
Emotion batch 19/77 (608/2462)
Emotion batch 20/77 (640/2462)
Emotion batch 21/77 (672/2462)
Emotion batch 22/77 (704/2462)
Emotion batch 23/77 (736/2462)
Emotion batch 24/77 (768/2462)
Emotion batch 25/77 (800/2462)
Emotion batch 26/77 (832/2462)
Emotion batch 27/77 (864/2462)
Emotion batch 28/77 (896/2462)
Emotion batch 29/77 (928/2462)
Emotion batch 30/77 (960/2462)
Emotion batch 31/77 (992/2462)
Emotion batch 32/77 (1024/2462)
Emotion batch 33/77 (1056/2462)
Emotion batch 34/77 (1088/2462)
Emotion batch 35/77 (1120/2462)
Emotion batch 36/77 (1152/2462)
Emotion batch 37/77 (1184/2462)
Emotion batch 38/77 (1216/2462)
Emotion batch 39/77 (1248/2462)
Emotion batch 40/77 (1280/2462)
Emotion batch 41/77 (1312/2462)
Emotion batch 42/77 (1344/246

Unnamed: 0,sentence_en,top_emotion,top_emotion_score
0,Front page Atria in brief Atria’s key indicato...,neutral,0.968818
1,Front page Atria in brief Atria’s key indicato...,neutral,0.954714
2,Our company is highly applied by our customers...,neutral,0.463996
3,We have been producing food for more than 100 ...,desire,0.628325
4,Atria’s renewal and grow are based on commerci...,neutral,0.458415
5,"Our main product, good food, creams a better f...",admiration,0.795339
6,"In 2020, our net sales were approximately EUR ...",neutral,0.905178
7,Atria Plc’s share have been listed on the Nasd...,neutral,0.916008
8,AT R I A P RODUC E S S U S TA I NA BL E VA LU E 2,neutral,0.965846
9,Front page Atria in brief Atria’s key indicato...,neutral,0.963276


In [23]:
# Persist the sentences with their emotion columns; CSV_OUT is the
# ".sentences_fi_en_emotions.csv" path at this point.
df.to_csv(CSV_OUT, index=False, encoding="utf-8")
CSV_OUT, CSV_OUT.exists()


(PosixPath('/home/lila/github-clones/X_colloctor/data/reports /38. Atria Oyj_annual_2020.sentences_fi_en_emotions.csv'),
 True)

In [24]:
# Frequency of each dominant emotion (at most the 20 most common labels).
df["top_emotion"].value_counts().head(20)


top_emotion
neutral           2301
approval            77
admiration          30
disappointment      17
desire              15
optimism             9
realization          5
joy                  2
gratitude            2
caring               2
sadness              1
confusion            1
Name: count, dtype: int64

## Expand the emotion scores

In [25]:
import pandas as pd
import json

# df = pd.read_csv("...sentences_fi_en_emotions.csv")  # if not already loaded

def json_to_dict(x):
    """Coerce a cell of the JSON score column into a dict.

    Missing values (per ``pd.isna``) become ``{}``, dicts are returned as-is,
    and anything else is parsed with ``json.loads``.
    """
    if pd.isna(x):
        return {}
    return x if isinstance(x, dict) else json.loads(x)

# Parse the per-row JSON strings into dicts (missing cells become {}).
scores_dicts = df["emotion_scores_json"].apply(json_to_dict)

# Turn the dicts into a DataFrame with one column per emotion label.
emo_df = pd.json_normalize(scores_dicts)

# Prefix the label columns with "emo_" to avoid clashing with existing columns.
emo_df = emo_df.add_prefix("emo_")

# Place the score columns beside the original ones, dropping the raw JSON
# column. concat(axis=1) aligns on the index, so both frames must share it.
df_wide = pd.concat([df.drop(columns=["emotion_scores_json"]), emo_df], axis=1)

df_wide.head()


Unnamed: 0,line_id,page,sentence_fi,sentence_en,top_emotion,top_emotion_score,emo_neutral,emo_approval,emo_realization,emo_annoyance,...,emo_surprise,emo_love,emo_gratitude,emo_caring,emo_embarrassment,emo_grief,emo_nervousness,emo_remorse,emo_relief,emo_pride
0,1,1,Front page Atria in brief Atria’s key indicato...,Front page Atria in brief Atria’s key indicato...,neutral,0.968818,0.968818,0.01234,0.00905,0.005001,...,0.001493,0.001287,0.001218,0.000908,0.000745,0.000538,0.000512,0.000454,0.000422,0.000421
1,2,2,Front page Atria in brief Atria’s key indicato...,Front page Atria in brief Atria’s key indicato...,neutral,0.954714,0.954714,0.031382,0.017173,0.004535,...,0.001244,0.00127,0.001022,0.000592,0.000607,0.00036,0.000319,0.000312,0.000551,0.000584
2,3,2,Our company is highly appreciated by our custo...,Our company is highly applied by our customers...,neutral,0.463996,0.463996,0.270396,0.012174,0.007221,...,0.000664,0.001522,0.00908,0.007746,0.000384,0.000378,0.000165,0.000482,0.002234,0.009241
3,4,2,We have been producing food for more than 100 ...,We have been producing food for more than 100 ...,desire,0.628325,0.269043,0.056491,0.009645,0.006358,...,0.001031,0.003514,0.003298,0.004081,0.00039,0.00049,0.000739,0.000801,0.000933,0.001136
4,5,2,Atria’s renewal and growth are based on commer...,Atria’s renewal and grow are based on commerci...,neutral,0.458415,0.458415,0.222293,0.032592,0.009401,...,0.001361,0.000744,0.015638,0.002543,0.00076,0.000567,0.000209,0.000529,0.002518,0.009101


In [26]:
# Treat a missing emotion score as 0.0. Only the "emo_*" score columns are
# filled: a blanket fillna(0.0) would also turn a missing page number or a
# missing sentence text into 0.0.
emo_cols = [col for col in df_wide.columns if str(col).startswith("emo_")]
df_wide = df_wide.fillna({col: 0.0 for col in emo_cols})


In [27]:
from pathlib import Path

# Wide export sits next to the emotions CSV: same file name with its final
# extension removed and "_wide.csv" appended.
CSV_WIDE = CSV_OUT.with_name(CSV_OUT.stem + "_wide.csv")

df_wide.to_csv(CSV_WIDE, index=False, encoding="utf-8")
CSV_WIDE


PosixPath('/home/lila/github-clones/X_colloctor/data/reports /38. Atria Oyj_annual_2020.sentences_fi_en_emotions_wide.csv')