# Chapter 7 Guide

## 7.1

In [31]:
import re  #A

# Example log entries  #B
logs = [  #C
    "ERROR 2025-06-09 12:34:56 Server failed to respond",  #D
    "INFO 2025-06-09 12:35:56 User logged in",  #E
    "WARNING 2025-06-09 12:36:56 Disk space low"  #F
]

# Multi-part pattern with capture groups  #G
pattern = r"(ERROR|INFO|WARNING)\s(\d{4}-\d{2}-\d{2})\s(\d{2}:\d{2}:\d{2})"  #H

# Extract log type, date, and time from each log entry  #I
for log in logs:  #J
    match = re.search(pattern, log)  #K
    if match:  #L
        log_type, date, time = match.groups()  #M
        print(f"Type: {log_type}, Date: {date}, Time: {time}")  #N

Type: ERROR, Date: 2025-06-09, Time: 12:34:56
Type: INFO, Date: 2025-06-09, Time: 12:35:56


## 7.2

In [32]:
import openai  #A
import os  #B
from dotenv import load_dotenv  #C
from pydantic import BaseModel  #D
from typing import Optional  #E

# Load API key from .env file  #F
load_dotenv()  #G
openai.api_key = os.getenv("OPENAI_API_KEY")  #H

# Define the data model for extracted output  #I
class LogExtraction(BaseModel):  #J
    log_type: Optional[str]  #J.1
    date: Optional[str]  #K
    time: Optional[str]  #L

# Example log entries  #M
logs = [  #N
    "ERROR 2025-06-09 12:34:56 Server failed to respond",  #O
    "INFO 2025-06-09 12:35:56 User logged in",  #P
    "WARNING 2025-06-09 12:36:56 Disk space low"  #Q
]

# Prompt for AI to extract log_type, date, and time  #R
row_prompts = [  #S
    "You are a data extraction assistant. Extract the following from the log entry:\n"
    "- log_type: The type of log message (e.g., ERROR, INFO, WARNING)\n"
    "- date: Extract the date in YYYY-MM-DD format\n"
    "- time: Extract the time in HH:MM:SS format\n"
    "Return the result as a JSON object matching the LogExtraction structure."
    for log in logs  #T
]

# Process each log entry  #U
for log, prompt in zip(logs, row_prompts):  #V
    try:  #W
        # Make the API call  #X
        completion = openai.beta.chat.completions.parse(  #Y
            model="gpt-4o",  #Z
            messages=[  #AA
                {"role": "system", "content": prompt},  #AB
                {"role": "user", "content": log}  #AC
            ],  #AD
            response_format=LogExtraction  #AE
        )

        extracted = completion.choices[0].message.parsed.dict()  #AF
        print(extracted)  #AG

    except Exception as e:  #AH
        print(f"Error processing log entry: {e}")  #AI


{'log_type': 'ERROR', 'date': '2025-06-09', 'time': '12:34:56'}
{'log_type': 'INFO', 'date': '2025-06-09', 'time': '12:35:56'}


### json data

In [33]:
json_data = {
  "library": {
    "name": "City Library",
    "location": "Downtown",
    "books": [
      {
        "title": "Python Programming",
        "author": {
          "first_name": "John",
          "last_name": "Doe"
        },
        "genres": ["Programming", "Technology"],
        "published_year": 2020
      },
      {
        "title": "Data Science 101",
        "author": {
          "first_name": "Jane",
          "last_name": "Smith"
        },
        "genres": ["Data Science", "AI"],
        "published_year": 2019
      }
    ]
  }
}


## 7.3

In [18]:
import pandas as pd  #A

# Function to convert nested JSON to a DataFrame  #B
def json_to_dataframe(json_data):  #C
    """Convert nested JSON to a pandas DataFrame."""  #D
    records = []  #E
    for book in json_data['library']['books']:  #F
        record = {  #G
            'Library Name': json_data['library']['name'],  #H
            'Location': json_data['library']['location'],  #I
            'Title': book['title'],  #J
            'Author': f"{book['author']['first_name']} {book['author']['last_name']}",  #K
            'Genres': ', '.join(book['genres']),  #L
            'Published Year': book['published_year']  #M
        }
        records.append(record)  #N
    return pd.DataFrame(records)  #O

df = json_to_dataframe(json_data) #P
display(df) #Q

Unnamed: 0,Library Name,Location,Title,Author,Genres,Published Year
0,City Library,Downtown,Python Programming,John Doe,"Programming, Technology",2020
1,City Library,Downtown,Data Science 101,Jane Smith,"Data Science, AI",2019


## 7.4

In [38]:
import openai  #A
import os  #B
from dotenv import load_dotenv  #C
from pydantic import BaseModel  #D
import pandas as pd  #E

load_dotenv()  #F
openai.api_key = os.getenv("OPENAI_API_KEY")  #G

class LibraryBook(BaseModel):  #H
    library_name: str  #I
    location: str  #J
    title: str  #K
    author: str  #L
    genres: str  #M
    published_year: int  #N

system_prompt = f"Extract data to match this class:\n{LibraryBook.schema_json(indent=2)}"  #O

structured_data = []  #P

for book in json_data["library"]["books"]:  #Q
    payload = {  #R
        "library_name": json_data["library"]["name"],  #S
        "location": json_data["library"]["location"],  #T
        **book  #U
    }

    try:  #V
        completion = openai.beta.chat.completions.parse(  #W
            model="gpt-4o",  #X
            messages=[  #Y
                {"role": "system", "content": system_prompt},  #Z
                {"role": "user", "content": f"{payload}"}  #AA
            ],
            response_format=LibraryBook  #AB
        )
        structured_data.append(completion.choices[0].message.parsed.dict())  #AC
    except Exception as e:  #AD
        print(f"Error: {e}")  #AE

df = pd.DataFrame(structured_data)  #AF
display(df)  #AG


Unnamed: 0,library_name,location,title,author,genres,published_year
0,City Library,Downtown,Python Programming,John Doe,"Programming, Technology",2020
1,City Library,Downtown,Data Science 101,Jane Smith,"Data Science, AI",2019


### normalization and entity resolution data

In [39]:
customers = [
    {"name": "John Smith", "email": "john.smith@acme.com"},
    {"name": "Jane Smythe", "email": "jane.smythe@alpha.io"},
    {"name": "Jonathan Smith", "email": "jonathan@acme.com"},
    {"name": "Johnny S.", "email": "johnny@acmeco.com"}
]
incoming_email = "jonny_smith@acmeco.com"

# 7.5

In [41]:
import re  #A
from rapidfuzz import fuzz, process  #B

# Normalize the email by removing special characters and lowercasing  #C
def normalize_email(email):  #D
    prefix = email.split('@')[0]  #E
    normalized = re.sub(r'\W+', '', prefix.lower())  #F
    return normalized  #G

# Normalize the incoming email and customer emails  #H
normalized_incoming = normalize_email(incoming_email)  #I
customer_candidates = [  #J
    {
        "original": customer,  #K
        "normalized": normalize_email(customer["email"])  #L
    }
    for customer in customers  #M
]

# Score similarity using fuzzy matching  #N
matches = [  #O
    {
        "name": c["original"]["name"],  #P
        "email": c["original"]["email"],  #Q
        "score": fuzz.ratio(normalized_incoming, c["normalized"])  #R
    }
    for c in customer_candidates  #S
]

# Sort and print top match  #T
top_match = sorted(matches, key=lambda m: m["score"], reverse=True)[0]  #U
print(f"Best match: {top_match['name']} ({top_match['email']}) - Score: {top_match['score']}")  #V


Best match: John Smith (john.smith@acme.com) - Score: 80.0


## 7.6

In [44]:
import openai  #A
import os  #B
from dotenv import load_dotenv  #C
from pydantic import BaseModel  #D

load_dotenv()  #E
openai.api_key = os.getenv("OPENAI_API_KEY")  #F

# Define a structured output model for entity resolution  #G
class EmailMatch(BaseModel):  #H
    name: str  #I
    email: str  #J
    confidence: float  #K
    reasoning: str  #L

# Define the system prompt with resolution instruction  #M
system_prompt = """
You are an entity resolution assistant. A new customer email has arrived: 'jonny_smith@acmeco.com'.
Compare it to the list of known customers. Identify the best match based on email similarity and name inference.
Provide the closest match with a confidence score (0 to 1) and explain your reasoning.
""".strip()  #N

# Format the full input payload  #O
user_message = {
    "incoming_email": "jonny_smith@acmeco.com",
    "customers": customers
}  #P

# Call the OpenAI API with schema enforcement  #Q
completion = openai.beta.chat.completions.parse(  #R
    model="gpt-4o",  #S
    messages=[  #T
        {"role": "system", "content": system_prompt},  #U
        {"role": "user", "content": f"{user_message}"}  #V
    ],
    response_format=EmailMatch  #W
)

# Extract and display the structured response  #X
match = completion.choices[0].message.parsed.dict()  #Y

# Format and Print  #Z
print(f"Best match: {match['name']} ({match['email']})")
print(f"Confidence: {match['confidence']:.2f}")
print("Reasoning:")
print(match['reasoning']) 


Best match: Johnny S. (johnny@acmeco.com)
Confidence: 0.95
Reasoning:
The incoming email 'jonny_smith@acmeco.com' closely resembles 'johnny@acmeco.com' in both local part and domain. The local part 'jonny_smith' seems like a variation of 'johnny', which is a common nickname for 'John' or 'Jonathan', and shares the same last name structure. The domain '@acmeco.com' is identical, furthering the match strength. The slight spelling variation aside, this indicates a very high probability that both emails belong to the same customer, Johnny S.


### Time Series and Date-Time Transformations 

In [17]:
transactions = [
    {"account": "A001", "transaction_date": "2025-01-31T16:00:00Z", "terms": "NET30", "amount_due": 1200},
    {"account": "A001", "transaction_date": "2025-02-28T12:45:00Z", "terms": "NET60", "amount_due": 800},
    {"account": "A001", "transaction_date": "2025-03-15T09:30:00Z", "terms": "NET30", "amount_due": 1500},
    {"account": "A001", "transaction_date": "2025-06-01T11:00:00Z", "terms": "NET15", "amount_due": 950},
    {"account": "A001", "transaction_date": "2025-07-04T10:00:00Z", "terms": "NET30", "amount_due": 700},

    {"account": "A002", "transaction_date": "2025-01-10T14:00:00Z", "terms": "NET45", "amount_due": 300},
    {"account": "A002", "transaction_date": "2025-02-12T08:30:00Z", "terms": "NET30", "amount_due": 600},
    {"account": "A002", "transaction_date": "2025-03-29T17:15:00Z", "terms": "NET60", "amount_due": 1100},
    {"account": "A002", "transaction_date": "2025-04-20T13:00:00Z", "terms": "NET30", "amount_due": 950},
    {"account": "A002", "transaction_date": "2025-06-15T07:00:00Z", "terms": "NET30", "amount_due": 800},

    {"account": "A003", "transaction_date": "2025-03-01T18:30:00Z", "terms": "NET30", "amount_due": 400},
    {"account": "A003", "transaction_date": "2025-04-01T09:00:00Z", "terms": "NET90", "amount_due": 2500},
    {"account": "A003", "transaction_date": "2025-05-15T11:30:00Z", "terms": "NET30", "amount_due": 600},
    {"account": "A003", "transaction_date": "2025-07-01T08:00:00Z", "terms": "NET60", "amount_due": 1000},
    {"account": "A003", "transaction_date": "2025-08-05T16:45:00Z", "terms": "NET30", "amount_due": 750},

    {"account": "A004", "transaction_date": "2025-02-20T15:00:00Z", "terms": "NET15", "amount_due": 900},
    {"account": "A004", "transaction_date": "2025-03-15T10:45:00Z", "terms": "NET30", "amount_due": 850},
    {"account": "A004", "transaction_date": "2025-04-10T14:15:00Z", "terms": "NET45", "amount_due": 1200},
    {"account": "A004", "transaction_date": "2025-06-30T09:00:00Z", "terms": "NET30", "amount_due": 1000},
    {"account": "A004", "transaction_date": "2025-09-01T13:20:00Z", "terms": "NET60", "amount_due": 1300}
]


## 7.7

In [6]:
import pandas as pd  #A
from pandas.tseries.offsets import BDay  #B
import pytz  #C
from datetime import datetime  #D

# Load transaction data into a DataFrame  #E
df = pd.DataFrame(transactions)  #F
df["transaction_date"] = pd.to_datetime(df["transaction_date"], utc=True)  #G

# A: Plain date without time  #H
df["date_only"] = df["transaction_date"].dt.date  #I

# B–D: Timestamps converted to PST, EST, and GST respectively  #J
df["timestamp_pst"] = df["transaction_date"].dt.tz_convert("US/Pacific")  #K
df["timestamp_est"] = df["transaction_date"].dt.tz_convert("US/Eastern")  #L
df["timestamp_gst"] = df["transaction_date"].dt.tz_convert("Asia/Dubai")  #M

# E: Extracted month and year for reporting breakdowns  #N
df["month"] = df["transaction_date"].dt.month  #O
df["year"] = df["transaction_date"].dt.year  #P

# F: Custom fiscal quarter based on internal calendar (Q1 = Feb–Apr)  #Q
def get_fiscal_quarter(date):  #R
    fiscal_month = (date.month - 1) % 12 + 1  #S
    if fiscal_month in [2, 3, 4]: return "Q1"  #T
    elif fiscal_month in [5, 6, 7]: return "Q2"  #U
    elif fiscal_month in [8, 9, 10]: return "Q3"  #V
    else: return "Q4"  #W

df["fiscal_quarter"] = df["transaction_date"].apply(get_fiscal_quarter)  #X

# 2: Due date calculation using business days only  #Y
def get_due_date(row):  #Z
    term_days = int(row["terms"].replace("NET", ""))  #AA
    return row["transaction_date"] + BDay(term_days)  #AB

df["due_date"] = df.apply(get_due_date, axis=1)  #AC

# 3: Percent contribution of each transaction to its account's total balance  #AD
account_totals = df.groupby("account")["amount_due"].transform("sum")  #AE
df["contribution_pct"] = (df["amount_due"] / account_totals * 100).round(2)  #AF

# Display key columns for verification  #AG
display(df)

Unnamed: 0,account,transaction_date,terms,amount_due,date_only,timestamp_pst,timestamp_est,timestamp_gst,month,year,fiscal_quarter,due_date,contribution_pct
0,A001,2025-01-31 16:00:00+00:00,NET30,1200,2025-01-31,2025-01-31 08:00:00-08:00,2025-01-31 11:00:00-05:00,2025-01-31 20:00:00+04:00,1,2025,Q4,2025-03-14 16:00:00+00:00,23.3
1,A001,2025-02-28 12:45:00+00:00,NET60,800,2025-02-28,2025-02-28 04:45:00-08:00,2025-02-28 07:45:00-05:00,2025-02-28 16:45:00+04:00,2,2025,Q1,2025-05-23 12:45:00+00:00,15.53
2,A001,2025-03-15 09:30:00+00:00,NET30,1500,2025-03-15,2025-03-15 02:30:00-07:00,2025-03-15 05:30:00-04:00,2025-03-15 13:30:00+04:00,3,2025,Q1,2025-04-25 09:30:00+00:00,29.13
3,A001,2025-06-01 11:00:00+00:00,NET15,950,2025-06-01,2025-06-01 04:00:00-07:00,2025-06-01 07:00:00-04:00,2025-06-01 15:00:00+04:00,6,2025,Q2,2025-06-20 11:00:00+00:00,18.45
4,A001,2025-07-04 10:00:00+00:00,NET30,700,2025-07-04,2025-07-04 03:00:00-07:00,2025-07-04 06:00:00-04:00,2025-07-04 14:00:00+04:00,7,2025,Q2,2025-08-15 10:00:00+00:00,13.59
5,A002,2025-01-10 14:00:00+00:00,NET45,300,2025-01-10,2025-01-10 06:00:00-08:00,2025-01-10 09:00:00-05:00,2025-01-10 18:00:00+04:00,1,2025,Q4,2025-03-14 14:00:00+00:00,8.0
6,A002,2025-02-12 08:30:00+00:00,NET30,600,2025-02-12,2025-02-12 00:30:00-08:00,2025-02-12 03:30:00-05:00,2025-02-12 12:30:00+04:00,2,2025,Q1,2025-03-26 08:30:00+00:00,16.0
7,A002,2025-03-29 17:15:00+00:00,NET60,1100,2025-03-29,2025-03-29 10:15:00-07:00,2025-03-29 13:15:00-04:00,2025-03-29 21:15:00+04:00,3,2025,Q1,2025-06-20 17:15:00+00:00,29.33
8,A002,2025-04-20 13:00:00+00:00,NET30,950,2025-04-20,2025-04-20 06:00:00-07:00,2025-04-20 09:00:00-04:00,2025-04-20 17:00:00+04:00,4,2025,Q1,2025-05-30 13:00:00+00:00,25.33
9,A002,2025-06-15 07:00:00+00:00,NET30,800,2025-06-15,2025-06-15 00:00:00-07:00,2025-06-15 03:00:00-04:00,2025-06-15 11:00:00+04:00,6,2025,Q2,2025-07-25 07:00:00+00:00,21.33


## 7.8

In [18]:
import openai  #A
import os  #B
from dotenv import load_dotenv  #C
from pydantic import BaseModel  #D
from datetime import datetime  #E
import pandas as pd  #F

load_dotenv()  #G
openai.api_key = os.getenv("OPENAI_API_KEY")  #H

# Define response schema using Pydantic  #I
class TransformedTransaction(BaseModel):  #J
    account: str  #K
    transaction_date: str  #L
    date_only: str  #M
    timestamp_pst: str  #N
    timestamp_est: str  #O
    timestamp_gst: str  #P
    month: int  #Q
    year: int  #R
    fiscal_quarter: str  #S
    due_date: str  #T
    contribution_pct: float  #U

# Compute account totals for contribution_pct  #W
df = pd.DataFrame(transactions)  #X
account_totals = df.groupby("account")["amount_due"].sum().to_dict()  #Y

# Define system prompt template  #Z
system_prompt = f"""
You are a data transformation assistant. You will receive one transaction at a time and must return a single object matching this schema:
{TransformedTransaction.schema_json(indent=2)}

The transaction will contain:
- 'transaction_date': an ISO-8601 timestamp in UTC
- 'terms': in format like 'NET30', meaning due in 30 **business days**
- 'account': account name
- 'amount_due': numeric value in USD
- 'account_total': total balance due for the account (used for calculating percentage contribution)

For each transaction:
- Extract the date without time
- Convert the timestamp to PST, EST, and GST
- Extract the month and year
- Determine fiscal quarter using a custom calendar: Q1 = Feb-Apr, Q2 = May-Jul, Q3 = Aug-Oct, Q4 = Nov-Jan
- Calculate due date by adding the NET days as business days to the transaction date
- Calculate contribution percentage = (amount_due / account_total) * 100

Return a JSON object that matches the schema exactly.
""".strip()  #AA

# Collect results  #AB
results = []  #AC

for tx in transactions:  #AD
    payload = tx.copy()  #AE
    payload["account_total"] = account_totals[tx["account"]]  #AF

    try:  #AG
        completion = openai.beta.chat.completions.parse(  #AH
            model="gpt-4o",  #AI
            messages=[  #AJ
                {"role": "system", "content": system_prompt},  #AK
                {"role": "user", "content": f"{payload}"}  #AL
            ],
            response_format=TransformedTransaction  #AM
        )
        results.append(completion.choices[0].message.parsed.dict())  #AN
    except Exception as e:  #AO
        print(f"Error: {e}")  #AP

# Final result as DataFrame  #AQ
final_df = pd.DataFrame(results)  #AR
display(final_df)  #AS


Unnamed: 0,account,transaction_date,date_only,timestamp_pst,timestamp_est,timestamp_gst,month,year,fiscal_quarter,due_date,contribution_pct
0,A001,2025-01-31T16:00:00Z,2025-01-31,2025-01-31T08:00:00-08:00,2025-01-31T11:00:00-05:00,2025-01-31T20:00:00+04:00,1,2025,Q4,2025-03-14,23.3
1,A001,2025-02-28T12:45:00Z,2025-02-28,2025-02-28T04:45:00-08:00,2025-02-28T07:45:00-05:00,2025-02-28T16:45:00+04:00,2,2025,1,2025-05-05,15.534951
2,A001,2025-03-15T09:30:00Z,2025-03-15,2025-03-15T02:30:00-07:00,2025-03-15T05:30:00-04:00,2025-03-15T13:30:00+04:00,3,2025,Q1,2025-04-25,29.126214
3,A001,2025-06-01T11:00:00Z,2025-06-01,2025-06-01T04:00:00-07:00,2025-06-01T07:00:00-04:00,2025-06-01T15:00:00+04:00,6,2025,Q2,2025-06-20,18.446602
4,A001,2025-07-04T10:00:00Z,2025-07-04,2025-07-04T03:00:00-07:00,2025-07-04T06:00:00-04:00,2025-07-04T14:00:00+04:00,7,2025,Q2,2025-08-14,13.592233
5,A002,2025-01-10T14:00:00Z,2025-01-10,2025-01-10T06:00:00-08:00,2025-01-10T09:00:00-05:00,2025-01-10T18:00:00+04:00,1,2025,Q4,2025-03-11,8.0
6,A002,2025-02-12T08:30:00Z,2025-02-12,2025-02-12T00:30:00-08:00,2025-02-12T03:30:00-05:00,2025-02-12T12:30:00+04:00,2,2025,Q1,2025-03-26,16.0
7,A002,2025-03-29T17:15:00Z,2025-03-29,2025-03-29T10:15:00-07:00,2025-03-29T13:15:00-04:00,2025-03-29T21:15:00+04:00,3,2025,Q1,2025-06-20,29.333333
8,A002,2025-04-20T13:00:00Z,2025-04-20,2025-04-20T06:00:00-07:00,2025-04-20T09:00:00-04:00,2025-04-20T17:00:00+04:00,4,2025,Q1,2025-06-02,25.333333
9,A002,2025-06-15T07:00:00Z,2025-06-15,2025-06-15T00:00:00-07:00,2025-06-15T03:00:00-04:00,2025-06-15T11:00:00+04:00,6,2025,Q2,2025-07-28,21.333333
