In [None]:
!pip install pdfplumber

In [None]:
import pdfplumber
import pandas as pd
import re

In [None]:
def rename_descriptions(df: pd.DataFrame) -> pd.DataFrame:
    """Rename descriptions to make them more readable.

    Every description that starts with a known merchant prefix is replaced
    by that merchant's canonical shop name (e.g. "AMZN..." -> "AMAZON").
    Matching is case-sensitive and prefix-only. Descriptions that match no
    prefix, and missing descriptions, are left untouched.

    The frame is modified in place and the same object is returned, so the
    function can be used both as a statement and inside ``DataFrame.pipe``.

    Args
        df (pd.DataFrame): dataframe of activity from bank statement; must
            have a ``description`` column of strings
    Returns
        df (pd.DataFrame): the same dataframe with renamed descriptions
    """
    # Canonical shop name -> prefixes under which it appears on statements.
    shop_prefixes = {
        "AMAZON": ["AMZN", "AMAZON"],
        "TESCO": ["TESCO"],
        "ASDA": ["ASDA"],
        "CO-OP": ["CO-OP"],
        "ARGOS": ["ARGOS"],
        "H&M": ["HANDM", "HMHENNE"],
        "LIDL": ["LIDL"],
        "COSTA": ["COSTA"],
        "BOOHOO": ["BOOHOO"],
        "ZARA": ["ZARA"],
        "NEXT": ["NEXT"],
        "NEWLOOK": ["NEWLOOK"],
        "SAINSBURY": ["SAINSBURY"],
    }

    for shop, prefixes in shop_prefixes.items():
        for prefix in prefixes:
            # na=False keeps the mask strictly boolean: without it a missing
            # description yields NaN in the mask and .loc refuses to index.
            matches = df["description"].str.startswith(prefix, na=False)
            df.loc[matches, "description"] = shop

    return df

# Transaction rows begin with two "DDMON" tokens (e.g. "03JUL 04JUL ").
# The first character of the third token selects the column layout used
# when unpacking the row.
_ROW_THIRD_TOKEN_NUMERIC = re.compile(r"^([0-9]{2}[A-Z]{3}\s){2}[0-9]")
_ROW_THIRD_TOKEN_ALPHA = re.compile(r"^([0-9]{2}[A-Z]{3}\s){2}[A-Z]")


def read_bank_statement(filename: str) -> pd.DataFrame:
    """Read bank statement from pdf.

    Every text line on every page is tested against the two row patterns;
    lines matching neither are skipped. For a matching line the first
    token is taken as the transaction date and the last token as the
    price. The description is the single space-separated token at index 3
    (numeric third token) or index 2 (alphabetic third token), so
    multi-word descriptions keep only their first word.

    Args
        filename (str): filename for bank statement pdf
    Returns
        df (pd.DataFrame): bank transaction activity extracted from pdf,
            with string columns ``transaction_date``, ``description`` and
            ``price`` exactly as they appear in the PDF text
    Raises
        ValueError: if a matching line has too few space-separated tokens
            to unpack
    """
    results = {
        "transaction_date": [],
        "description": [],
        "price": []
    }

    # The context manager closes the underlying file handle even when
    # parsing raises part-way through the document.
    with pdfplumber.open(filename) as pdf:
        for page in pdf.pages:
            for text in page.extract_text_lines():
                line = text['text']
                if _ROW_THIRD_TOKEN_NUMERIC.match(line):
                    # Skips the second date and the numeric token
                    # (presumably a reference number -- verify against a
                    # real statement).
                    transaction_date, _, _, description, *_, price = line.split(" ")
                elif _ROW_THIRD_TOKEN_ALPHA.match(line):
                    # No numeric token: the description follows the dates.
                    transaction_date, _, description, *_, price = line.split(" ")
                else:
                    continue
                results['transaction_date'].append(transaction_date)
                results['description'].append(description)
                results['price'].append(price)

    return pd.DataFrame(results)
    
# Statement dates are "DDMON" with no year, so one is appended before
# parsing. Update this when processing a statement from another year.
STATEMENT_YEAR = "2023"

df = read_bank_statement("tesco_july.pdf")

df = (
    df.assign(
        # Strip the currency symbol and thousands separators before the
        # float cast below.
        price=df.price.str.replace("[£,]", "", regex=True),
        # Dots are escaped so that only the literal fragments ".COM",
        # "WWW." and ".CO.UK" are removed; an unescaped "." matches any
        # character and would eat real letters (e.g. "ECOMMERCE" -> "MERCE").
        description=df.description.str.upper().str.replace(
            r"\.COM|WWW\.|\.CO\.UK|HTTPS://", "", regex=True
        ),
        transaction_date=pd.to_datetime(
            df.transaction_date + STATEMENT_YEAR, format='%d%b%Y'
        ),
    )
    .astype(
        {
            "price": "float64",
            "description": "string",
            # NOTE(review): the parsed dates are cast back to strings, so
            # the sort below is lexical -- presumably ISO-formatted and
            # therefore chronological; confirm.
            "transaction_date": "string"
        }
    )
    .pipe(rename_descriptions)
    .sort_values(by="transaction_date")
)

# Negative prices are treated as incomings, positive ones as outgoings.
incomings = df.price < 0
outgoings = df.price > 0

total_outgoings = round(df.loc[outgoings, "price"].sum(), 2)
total_incomings = round(df.loc[incomings, "price"].sum(), 2)

In [None]:
# Preview the first rows of the cleaned, date-sorted transactions.
df.head()