In [1]:
import tiktoken
import PyPDF2

from langchain_community.document_loaders import PyPDFLoader

from llm.models.text_summary import get_text_summary

In [2]:
# PDF to summarise; the path is relative to the notebook's working directory.
file_path = './data/upload_data/PPIA_45_2024_.pdf'

In [3]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages are read in order through ``PyPDF2.PdfReader`` and concatenated with
    no separator between them. A page whose ``extract_text()`` yields a falsy
    value (``None`` or ``""``) contributes nothing to the result.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction has to happen while the file handle is still open.
        page_texts = [pdf_page.extract_text() or "" for pdf_page in pdf_reader.pages]
    return "".join(page_texts)


In [4]:
# Raw text of the whole PDF as a single string (pages concatenated without a separator).
text = extract_text_from_pdf(file_path)

In [5]:
# Show only the first 1,000 characters: the full extraction is the entire
# document and would bloat the saved notebook output.
text[:1000]

' www.risi.com | 1PPI Asia\nPAPER, PACKAGING AND FIBER MARKET NEWS AND PRICES FOR ASIA\nIn This Issue:\nPrice Watch: China Pulp 2\nPrice Watch: Asia RCP  3\nPrice Watch: China P&B 7\nChina’s Yueyang to buy Hunan Juntai  11\nRengo, Hansol to raise board prices  13China-driven demand bumps prices for US OCC \nimports into Southeast Asia, Taiwan\nMetsä Fibre has extended the maintenance downtime that started on \nNovember 8 at its Kemi pulp and bioproduct mill in northern Finland \nfrom one week to four weeks. But the protracted stoppage will not affect \nthe company’s customer deliveries, for which China is its major market.\nThe company found during the routine maintenance a part of  \nthe recovery boiler at the site needed to be repaired, thus extending  \nthe shut, Metsä Fibre said.\nA company spokesman said that the facility, which has \na capacity of 1.5 million tonnes per year of bleached and \nunbleached softwood and hardwood kraft pulp, has maintained \na higher-than-average leve

In [6]:
# Load the same PDF through LangChain: one Document per page, each carrying
# 'source' and 'page' metadata (see the displayed output of `pages`).
loader = PyPDFLoader(file_path)
# Materialise the lazy iterator directly instead of appending in a manual loop;
# this also avoids leaving a stray `page` variable in the notebook namespace.
pages = list(loader.lazy_load())

In [7]:
# Preview the first two page Documents rather than dumping every page.
pages[:2]

[Document(metadata={'source': './data/upload_data/PPIA_45_2024_.pdf', 'page': 0}, page_content=' www.risi.com | 1\nPPI Asia\nPAPER, PACKAGING AND FIBER MARKET NEWS AND PRICES FOR ASIA\nIn This Issue:\nPrice Watch: China Pulp 2\nPrice Watch: Asia RCP  3\nPrice Watch: China P&B 7\nChina’s Yueyang to buy Hunan Juntai 11\nRengo, Hansol to raise board prices  13\nChina-driven demand bumps prices for US OCC \nimports into Southeast Asia, Taiwan\nMetsä Fibre has extended the maintenance downtime that started on \nNovember 8 at its Kemi pulp and bioproduct mill in northern Finland \nfrom one week to four weeks. But the protracted stoppage will not affect \nthe company’s customer deliveries, for which China is its major market.\nThe company found during the routine maintenance a part of  \nthe recovery boiler at the site needed to be repaired, thus extending  \nthe shut, Metsä Fibre said.\nA company spokesman said that the facility, which has \na capacity of 1.5 million tonnes per year of bleac

In [8]:
def count_tokens(text: str, model_name: str) -> int:
    """Return the number of tokens ``text`` occupies for the model ``model_name``.

    The tokenizer is looked up from the model name by tiktoken, so the encoding
    follows the model (e.g. 'gpt-4' / 'gpt-3.5-turbo' use 'cl100k_base', while
    'gpt-4o' uses 'o200k_base').

    Args:
        text: The text to tokenize. May be empty.
        model_name: An OpenAI model name known to tiktoken.

    Returns:
        The token count as an int.

    Raises:
        KeyError: If tiktoken cannot map ``model_name`` to an encoding.
    """
    encoding = tiktoken.encoding_for_model(model_name)
    # By default encode() raises ValueError when the text contains the literal
    # spelling of a special token (e.g. "<|endoftext|>"). Extracted document
    # text is arbitrary, so treat such sequences as ordinary text instead.
    tokens = encoding.encode(text, disallowed_special=())
    return len(tokens)


In [9]:
# Token count of the full extracted text under the gpt-4o tokenizer.
count_tokens(text, model_name='gpt-4o')

19026

In [10]:
from langchain_text_splitters import CharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [11]:
# Splitter configuration: target chunk size of 2000 as measured by len()
# (i.e. characters), no overlap between consecutive chunks, and separators
# interpreted as literal strings rather than regular expressions.
rec_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=0,
    chunk_size=2000,
)

In [12]:
# Split every page Document into chunks; each chunk keeps its page's metadata
# (visible in the 'page' key of the last chunk displayed below).
splitted_doc = rec_splitter.split_documents(pages)

In [13]:
# Inspect the final chunk.
splitted_doc[-1]

Document(metadata={'source': './data/upload_data/PPIA_45_2024_.pdf', 'page': 17}, page_content='www.risi.com | 18\nDecember 4, 2024 \n11am EST  |  8am PST  |  4pm GMT\nRegister now: \nfmrkts.com/lessons-learned-from-london-pulp-week\nThis webinar offers insights into the future of the global pulp \nmarket in 2025. Participants will learn about Fastmarkets’ \npricing methodology for various pulp grades, key takeaways \nfrom the recent London Pulp Week and the impact of mill \nclosures and downtime on the market. \nAdditionally, the webinar will delve into  \npredictions for the year ahead, focusing on  \ntrends and price drivers. By understanding how \nmacroeconomic factors and global challenges will shape \nthe market, attendees can gain valuable knowledge to \nnavigate the evolving pulp landscape.\nWebinar: Lessons learned \nfrom London Pulp Week \nand predictions for 2025')

In [14]:
# Zero-based page index carried by the final chunk.
# NOTE(review): assumes chunks are in page order, so this is the highest index.
max_page = splitted_doc[-1].metadata['page']
max_page

17

In [15]:
# Regroup the chunks by source page in a single pass over `splitted_doc`
# (instead of rescanning every chunk once per page).
# Every page index 0..max_page gets an entry, even if it produced no chunk.
chunks_by_page = {page: [] for page in range(max_page + 1)}
for chunk in splitted_doc:
    chunks_by_page.setdefault(chunk.metadata['page'], []).append(chunk.page_content)

# Join with a newline rather than "": the splitter strips whitespace at chunk
# edges by default, so a bare concatenation fuses the last word of one chunk
# with the first word of the next.
page_content = {page: "\n".join(parts) for page, parts in chunks_by_page.items()}

In [16]:
# Summarise each page independently, keyed by page index.
# Kept as an explicit loop so summaries already computed stay in `page_summary`
# if a later get_text_summary call fails.
page_summary = {}
for page_no in page_content:
    page_summary[page_no] = get_text_summary(page_content[page_no])

In [17]:
# Concatenate the per-page summaries in page order. A blank line between them
# keeps each summary's leading markdown heading at the start of a line instead
# of being glued to the last sentence of the previous summary.
summary_all = "\n\n".join(page_summary.values())

In [18]:
# print() rather than a bare expression so the newlines render as line breaks
# instead of appearing as "\n" in the string repr.
print(summary_all)

重要資訊整理：

1. **價格觀察**：
   - 中國紙漿價格（Price Watch: China Pulp）
   - 亞洲廢紙價格（Price Watch: Asia RCP）
   - 中國紙與紙板價格（Price Watch: China P&B）

2. **企業動態**：
   - 中國岳陽紙業將收購湖南君泰（China’s Yueyang to buy Hunan Juntai）
   - Rengo和Hansol將提高紙板價格（Rengo, Hansol to raise board prices）

3. **市場需求與價格變動**：
   - 由於中國相關的再生紙漿和紙板廠在東南亞的需求強勁，推動了美國舊瓦楞紙箱（OCC）進口價格上漲，逆轉了自11月中旬以來的價格下跌趨勢。
   - 美國雙重分揀OCC（DS OCC 12）價格上漲至每噸180-185美元，對中國買家及印尼和馬來西亞的客戶，泰國、越南和台灣的價格為每噸175-180美元。
   - 歐洲OCC 98/2在泰國、越南和台灣的報價為每噸145-150美元，在印尼和馬來西亞的報價高出5美元。

4. **Metsä Fibre工廠維護**：
   - Metsä Fibre將芬蘭Kemi紙漿和生物產品工廠的維護停機時間從一週延長至四週，但不影響客戶交付。
   - 該工廠每年生產150萬噸漂白和未漂白的軟木和硬木牛皮紙漿，維持較高的庫存水平以確保及時交付。
   - 該工廠將於2025年6月至7月停機兩個月，以安裝新的蒸發器，替換在3月21日爆炸中受損的蒸發器。

5. **會議資訊**：
   - 2025年歐洲森林產品會議將於3月3日至5日在葡萄牙里斯本舉行（Forest Products Europe Conference 2025）。

這些資訊對於造紙業界的需求變化、原料價格和同業生態有重要參考價值。### 重要資訊整理

#### 中國進口價格 (美元/噸, CIF)
- **漂白針葉牛皮漿**
  - NBSK (北美和斯堪的納維亞): $765 (與上月和去年同期持平)
  - Radiata pine (智利): $760 - 780 (同比上升0.7%)
  - 俄羅斯BSK: $700 - 720 (同比下降3.4%)

- **漂白闊葉牛皮漿**
  - 

In [19]:
# Second pass: summarise the concatenated per-page summaries into a single
# document-level summary.
doc_summary = get_text_summary(summary_all)

In [21]:
# print() rather than a bare expression so the newlines render as line breaks
# instead of appearing as "\n" in the string repr.
print(doc_summary)

### 重要資訊整理

1. **價格觀察**：
   - **中國紙漿價格**（Price Watch: China Pulp）
   - **亞洲廢紙價格**（Price Watch: Asia RCP）
   - **中國紙與紙板價格**（Price Watch: China P&B）

2. **企業動態**：
   - **中國岳陽紙業將收購湖南君泰**（China’s Yueyang to buy Hunan Juntai）
   - **Rengo和Hansol將提高紙板價格**（Rengo, Hansol to raise board prices）

3. **市場需求與價格變動**：
   - **美國舊瓦楞紙箱（OCC）價格上漲**：由於中國相關的再生紙漿和紙板廠在東南亞的需求強勁，推動了美國OCC進口價格上漲，逆轉了自11月中旬以來的價格下跌趨勢。
   - **美國雙重分揀OCC（DS OCC 12）價格**：上漲至每噸180-185美元，對中國買家及印尼和馬來西亞的客戶，泰國、越南和台灣的價格為每噸175-180美元。
   - **歐洲OCC 98/2價格**：在泰國、越南和台灣的報價為每噸145-150美元，在印尼和馬來西亞的報價高出5美元。

4. **Metsä Fibre工廠維護**：
   - **維護停機時間延長**：Metsä Fibre將芬蘭Kemi紙漿和生物產品工廠的維護停機時間從一週延長至四週，但不影響客戶交付。
   - **生產能力**：該工廠每年生產150萬噸漂白和未漂白的軟木和硬木牛皮紙漿，維持較高的庫存水平以確保及時交付。
   - **未來停機計劃**：該工廠將於2025年6月至7月停機兩個月，以安裝新的蒸發器，替換在3月21日爆炸中受損的蒸發器。

5. **會議資訊**：
   - **2025年歐洲森林產品會議**：將於3月3日至5日在葡萄牙里斯本舉行（Forest Products Europe Conference 2025）。

6. **中國進口價格（美元/噸, CIF）**：
   - **漂白針葉牛皮漿**：
     - NBSK (北美和斯堪的納維亞): $765
     - Radiata pine (智利): $760 - 780
     - 俄羅斯BSK: $700 - 7