# Part 1: Working with Regex

In [1]:
import re

import matplotlib
import numpy as np
import pandas as pd
from nbresult import ChallengeResult

%matplotlib inline

## Intro to Regex

In [14]:
# A phone number written without separators: exactly ten digits in a row.
pattern = r"\d{10}"
text_to_search = "Hello my name is sebastien, you can call me on 0664533519. See you!"

# Compiling first and calling .findall on the compiled object returns the same
# list of matches as re.findall(pattern, text).
re.compile(pattern).findall(text_to_search)

['0664533519']

In [33]:
# Two digits, a dash, two digits, a dash, four digits.
date_pattern = r"\d{2}-\d{2}-\d{4}"
text_to_search = "Receipt Number 103402 ||| 15-02-2017"

# The six-digit receipt number is not matched: it contains no dashes.
re.findall(pattern=date_pattern, string=text_to_search)

['15-02-2017']

In [34]:
# A zip code here is any run of five digits.
zip_code_pattern = r"\d{5}"
text_to_search = "I moved to Paris 75011, it's closer to my workplace."

# With no capture group in the pattern, each match's full text is what
# re.findall would return, so collecting match.group() gives the same list.
[match.group() for match in re.finditer(zip_code_pattern, text_to_search)]

['75011']

In [35]:
# Patterns without a capture group: a match is the whole text, label included.
# The literal word "Quantity", one or more spaces, then one or more digits.
quantity_pattern = r"Quantity +\d+"
# The literal words "Total Amount", one or more spaces, one or more digits,
# a literal dot, exactly two digits, a single space and the euro sign.
amount_pattern = r"Total Amount +\d+\.\d{2} €"

## Groups

In [36]:
# Capture-group demo: the parentheses wrap only the digits, so the result
# contains the receipt number alone, without the "Receipt Number " label.
pattern = r"Receipt Number (\d+)"
text_to_search = "Receipt Number 103402 ||| 15-02-2017"
re.compile(pattern).findall(text_to_search)

['103402']

In [37]:
# Same two patterns as the non-group versions, but with a capture group around
# the number so that re.findall returns only the number, not the label.
# Captures the digits that follow "Quantity" and one or more spaces.
quantity_group_pattern = r"Quantity +(\d+)"
# Captures "digits, dot, two digits" between "Total Amount" plus spaces and " €".
amount_group_pattern = r"Total Amount +(\d+\.\d{2}) €"

In [38]:
# Hand the six patterns to the challenge checker.
# ChallengeResult is imported once, in the notebook's import cell.
# NOTE(review): write() presumably persists these values for the test suite and
# check() runs that suite (the recorded output is a pytest session) — confirm
# against the nbresult package.
result = ChallengeResult(
    'patterns',
    zipcode_re=zip_code_pattern,
    date_re=date_pattern,
    quantity_re=quantity_pattern,
    amount_re=amount_pattern,
    quantity_grp_re=quantity_group_pattern,
    amount_grp_re=amount_group_pattern,
)
result.write()
print(result.check())


platform darwin -- Python 3.10.6, pytest-7.1.3, pluggy-1.0.0 -- /Users/francoisgirard/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /Users/francoisgirard/code/francoisgirard51/02-Data-Toolkit/02-Data-Sourcing/data-text-extraction-with-regex/tests
plugins: asyncio-0.19.0, anyio-3.6.2
asyncio: mode=strict
[1mcollecting ... [0mcollected 6 items

test_patterns.py::TestPatterns::test_amount_group_pattern [32mPASSED[0m[32m         [ 16%][0m
test_patterns.py::TestPatterns::test_amount_pattern [32mPASSED[0m[32m               [ 33%][0m
test_patterns.py::TestPatterns::test_date_pattern [32mPASSED[0m[32m                 [ 50%][0m
test_patterns.py::TestPatterns::test_quantity_group_pattern [32mPASSED[0m[32m       [ 66%][0m
test_patterns.py::TestPatterns::test_quantity_pattern [32mPASSED[0m[32m             [ 83%][0m
test_patterns.py::TestPatterns::test_zip_code_pattern [32mPASSED[0m[32m             [100%][0m



💯 You can commit your code:

[1;32mgi

# Part 2: From a text file to a DataFrame

In [88]:
# Read the whole receipts file into one string; it is cut into individual
# receipts further down.
filepath = "data/receipts.txt"
with open(filepath, mode="r", encoding="utf-8") as f:
    receipts_str = f.read()

# Preview the first 500 characters to see how a receipt is laid out.
print(receipts_str[0:500])




Receipt Number 102790 ||| 02-01-2017 
------------------------------------

Quantity                         163
Total Amount               3097.00 €

************************************


Receipt Number 102862 ||| 05-01-2017 
------------------------------------

Quantity                         110
Total Amount                935.00 €

************************************


Receipt Number 103086 ||| 23-01-2017 
-----


In [89]:
# Cut the raw text into one chunk per receipt, using the separator row as the
# delimiter, then show how many chunks came out.
# NOTE(review): the recorded preview output draws the separator row with
# asterisks, while this splits on "=" and the recorded count is 100 — confirm
# which character data/receipts.txt really uses.
receipts_list = receipts_str.split(sep="====================================")
len(receipts_list)

100

In [90]:
# Column-oriented container for the extracted values: one independent, empty
# list per column, in the order date / quantity / amount.
receipts_dict = {column: [] for column in ("date", "quantity", "amount")}

In [91]:
# Look at a handful of receipts only: printing every chunk floods the notebook
# output and buries the cells that follow. Five are enough to see the layout.
for receipt in receipts_list[:5]:
    print(receipt)




Receipt Number 102790 ||| 02-01-2017 
------------------------------------

Quantity                         163
Total Amount               3097.00 €



************************************


Receipt Number 102862 ||| 05-01-2017 
------------------------------------

Quantity                         110
Total Amount                935.00 €



************************************


Receipt Number 103086 ||| 23-01-2017 
------------------------------------

Quantity                         156
Total Amount               2808.00 €



************************************


Receipt Number 103193 ||| 31-01-2017 
------------------------------------

Quantity                         182
Total Amount               4368.00 €



************************************


Receipt Number 103270 ||| 06-02-2017 
------------------------------------

Quantity                          97
Total Amount               1988.50 €



************************************


Receipt Number 103402 ||| 15-02-2017 

In [92]:
# Step 1 of building the extraction loop: pull the date out of every receipt.
# The pattern is the same for every receipt, so it is defined once, before the loop.
date_pattern = r"\d{2}-\d{2}-\d{4}"

for receipt in receipts_list:
    # First date-shaped match in the chunk; raises IndexError if there is none.
    date = re.findall(date_pattern, receipt)[0]

In [93]:
# Step 2: extract the date and the total amount from every receipt.
# Both patterns are loop-invariant, so they are defined once, before the loop.
date_pattern = r"\d{2}-\d{2}-\d{4}"
amount_pattern = r"Total Amount +(\d+\.\d{2}) €"

for receipt in receipts_list:
    # [0] keeps the first match; raises IndexError if a chunk has no match.
    date = re.findall(date_pattern, receipt)[0]
    # The capture group returns the number only, as a string.
    amount = re.findall(amount_pattern, receipt)[0]

In [94]:
# Step 3: extract date, total amount and quantity from every receipt.
# All three patterns are loop-invariant, so they are defined once, before the loop.
# Note that amount_pattern and quantity_pattern are rebound here to the
# capture-group versions.
date_pattern = r"\d{2}-\d{2}-\d{4}"
amount_pattern = r"Total Amount +(\d+\.\d{2}) €"
quantity_pattern = r"Quantity +(\d+)"

for receipt in receipts_list:
    # [0] keeps the first match; raises IndexError if a chunk has no match.
    date = re.findall(date_pattern, receipt)[0]
    amount = re.findall(amount_pattern, receipt)[0]
    quantity = re.findall(quantity_pattern, receipt)[0]

In [95]:
 # Sanity check: show whatever `date`, `amount` and `quantity` currently hold,
 # i.e. the values assigned by the most recently executed extraction loop.
 # NOTE(review): this statement is indented by one stray space; the recorded
 # output shows the cell still ran, but the indent serves no purpose and could
 # be dropped.
 print(f"date: {date}, amount: {amount}, quantity: {quantity}")

date: 15-12-2017, amount: 1850.00, quantity: 148


In [96]:
# Fill receipts_dict with one date / amount / quantity entry per receipt.

# The patterns do not depend on the receipt, so they are defined once up front.
date_pattern = r"\d{2}-\d{2}-\d{4}"
amount_pattern = r"Total Amount +(\d+\.\d{2}) €"
quantity_pattern = r"Quantity +(\d+)"

# Empty the columns in place first, so that re-running this cell rebuilds the
# dict instead of appending a second copy of every receipt.
for column in receipts_dict.values():
    column.clear()

for receipt in receipts_list:
    # Extract all three values before appending anything: if a chunk has no
    # match, [0] raises IndexError and the columns stay the same length.
    date = re.findall(date_pattern, receipt)[0]
    amount = re.findall(amount_pattern, receipt)[0]
    quantity = re.findall(quantity_pattern, receipt)[0]

    # Values are stored exactly as matched, i.e. as strings.
    receipts_dict["date"].append(date)
    receipts_dict["amount"].append(amount)
    receipts_dict["quantity"].append(quantity)

In [97]:
# Build the DataFrame straight from the column-oriented dict: each key becomes
# a column and each list supplies that column's values.
receipts_df = pd.DataFrame(receipts_dict)

# Show the first five rows.
receipts_df.head(5)

Unnamed: 0,date,quantity,amount
0,02-01-2017,163,3097.0
1,05-01-2017,110,935.0
2,23-01-2017,156,2808.0
3,31-01-2017,182,4368.0
4,06-02-2017,97,1988.5


In [98]:
# Hand the Part 2 artefacts to the challenge checker.
# ChallengeResult is imported once, in the notebook's import cell, so it is
# not imported a second time here.
result = ChallengeResult(
    'receipts',
    raw=receipts_list,
    receipts=receipts_dict,
    df_size=receipts_df.shape,
    receipt=receipts_df.iloc[-1, :],  # last row of the DataFrame
)
result.write()
print(result.check())


platform darwin -- Python 3.10.6, pytest-7.1.3, pluggy-1.0.0 -- /Users/francoisgirard/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /Users/francoisgirard/code/francoisgirard51/02-Data-Toolkit/02-Data-Sourcing/data-text-extraction-with-regex/tests
plugins: asyncio-0.19.0, anyio-3.6.2
asyncio: mode=strict
[1mcollecting ... [0mcollected 4 items

test_receipts.py::TestReceipts::test_receipt [32mPASSED[0m[32m                      [ 25%][0m
test_receipts.py::TestReceipts::test_receipts_df_size [32mPASSED[0m[32m             [ 50%][0m
test_receipts.py::TestReceipts::test_receipts_dict [32mPASSED[0m[32m                [ 75%][0m
test_receipts.py::TestReceipts::test_receipts_list [32mPASSED[0m[32m                [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/receipts.pickle

[32mgit[39m commit -m [33m'Completed receipts step'[39m

[32mgit[39m push origin master



# Part 3: Data Visualization

In [85]:
# Parse the day-month-year strings in the "date" column into datetimes.
# This cell needs `receipts_df`, which is built in Part 2, so it must run
# after that cell (run the notebook top to bottom).
receipts_df['date'] = pd.to_datetime(
    receipts_df['date'],
    format="%d-%m-%Y",
)

NameError: name 'receipts_df' is not defined