# Processing

Source: [MET Data](https://pacompendium.com/wp-content/uploads/2024/03/1_2024-adult-compendium_1_2024.pdf)

In [1]:
# Notebook shell command: install PyPDF2, which the next cell imports to read the PDF.
!pip install PyPDF2 --quiet

In [2]:
import re
from typing import Iterable, List

import pandas as pd
from PyPDF2 import PdfReader

In [9]:
# Open the compendium PDF; the relative path is resolved against the current working directory.
reader = PdfReader("1_2024-adult-compendium_1_2024.pdf")

def read_rows(pages: Iterable) -> List[str]:
    """Return the text lines of every page as one flat list.

    Args:
        pages: Iterable of page objects exposing ``extract_text()`` that
            returns a string (for example ``PdfReader.pages``).

    Returns:
        All lines of all pages, in page order, with non-breaking spaces
        (``"\\xa0"``) removed.
    """
    rows = []
    for page in pages:
        # Non-breaking spaces are removed outright (not turned into regular
        # spaces) before the page text is cut into lines.
        text = page.extract_text().replace("\xa0", "")
        rows.extend(text.split("\n"))
    return rows

def fix_trailing_rows(rows: List[str]) -> List[str]:
    """Merge wrapped continuation lines back onto the row they belong to.

    A line is treated as the continuation of the previous row when it is
    entirely lowercase (``str.islower``) or contains one of the hard-coded
    markers ``"Kumina"`` / ``"210)"`` (presumably wrapped fragments that
    are not all-lowercase -- confirm against the PDF).

    The first element of ``rows`` is never examined and the first collected
    row is dropped from the result, so the first two input lines are
    discarded (presumably title/header lines -- confirm against the PDF).

    Args:
        rows: Raw text lines in document order.

    Returns:
        The lines with every continuation joined to its parent row by a
        single space; each logical row appears exactly once.
    """
    merged_rows = []
    for row in rows[1:]:
        is_continuation = row.islower() or "Kumina" in row or '210)' in row
        if is_continuation and merged_rows:
            # Extend the accumulated row in place so the truncated parent is
            # not left behind as a duplicate and multi-line wraps chain up.
            merged_rows[-1] = f"{merged_rows[-1]} {row}"
        else:
            merged_rows.append(row)
    return merged_rows[1:]
    
def clean_rows(rows: List[str]) -> List[list]:
    """Parse text lines into ``[activity, code, MET, description]`` records.

    Each line is split on single spaces. The first all-numeric token is
    taken as the activity code; the tokens before it form the activity
    name, the token after it is the MET value and the remaining tokens form
    the description.

    Lines that contain no numeric token, or nothing after it, are skipped
    because no record can be built from them.

    Args:
        rows: Text lines, one candidate record per line.

    Returns:
        One ``[activity, code, met, description]`` list per parsed line.
        ``met`` is a ``float`` when the token contains exactly one decimal
        number (``digits.digits``); otherwise the raw token string is kept.
    """
    records = []
    for row in rows:
        tokens = row.split(" ")
        code_idx = next(
            (i for i, token in enumerate(tokens) if token.isnumeric()),
            None,
        )
        # Without a code token followed by a MET token there is nothing to
        # parse; skipping avoids reusing an index left over from another row.
        if code_idx is None or code_idx + 1 >= len(tokens):
            continue

        activity = " ".join(tokens[:code_idx])
        code = tokens[code_idx]
        met = tokens[code_idx + 1]
        description = " ".join(tokens[code_idx + 2:])

        # The MET token may have the first description word glued onto it
        # (e.g. "6.8bicycling,"): split the number off and give the rest
        # back to the description.
        met_parts = re.split(r'(\d+\.\d+)', met)[1:]
        if len(met_parts) == 2:
            met_value, fused_text = met_parts
            met = float(met_value)
            description = f"{fused_text} {description}".strip()

        records.append([activity, code, met, description])
    return records

# Run the pipeline: extract raw lines, merge wrapped lines, then parse into records.
rows = read_rows(reader.pages)
rows_non_trailing = fix_trailing_rows(rows)
cleaned_rows = clean_rows(rows_non_trailing)

In [10]:
# Build the table from the parsed records; the labels follow the order of
# the four fields in each record (the first label was misspelled "Actvitiy").
df = pd.DataFrame(cleaned_rows, columns=["Activity", "Code", "MET", "Description"])

In [12]:
# Write the table to met.csv in the working directory; with pandas' defaults
# the DataFrame index is written as the first, unnamed column.
df.to_csv("met.csv")