# Chapter 6 Guide

## 6.1

In [1]:
import pandas as pd

# Create a sample DataFrame with inconsistent data
df = pd.DataFrame({
    'email': ['user1@example.com', 'user2@.com', '555-1234', 'user4@example.com'],  #A
    'age': [25, None, 30, 40],  #B
    'purchase_amount': [100.5, -50.0, None, 200.0]  #C
})

# Detect missing values in each column
missing_values = df.isnull().sum()  #D

# Detect negative values in the 'purchase_amount' column
negative_values = df[df['purchase_amount'] < 0]  #E

# Output the results
print("Missing Values:\n", missing_values)  #F
print("\nNegative Values:\n", negative_values)  #G


Missing Values:
 email              0
age                1
purchase_amount    1
dtype: int64

Negative Values:
         email  age  purchase_amount
1  user2@.com  NaN            -50.0


## 6.2

In [6]:
import openai
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY") 

# Define a function to detect inconsistencies using Open AI's Chat Completions API Endpoint
def detect_inconsistencies(df):
    discrepancies = {}  #A
    
    # Loop through each column in the DataFrame
    for col in df.columns:  #B
        # Create a prompt to ask Open AI's Chat Completions API Endpoint for inconsistencies in the column
        prompt = f"Identify any inconsistencies in the column '{col}' in this data: {df[col].tolist()}. Note that purchase amount should not be negative for any item."  #C
        
        # Send the prompt to Open AI's Chat Completions API Endpoint and store the response
        response = openai.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}]
        )  #D
        
        # Save the response for the column
        discrepancies[col] = response.choices[0].message.content.strip() #E
    
    return discrepancies  #F

# Create a sample DataFrame with inconsistent data
df = pd.DataFrame({
    'email': ['user1@example.com', 'user2@.com', '555-1234', 'user4@example.com'],  #G
    'age': [25, None, 30, 40],  #H
    'purchase_amount': [100.5, -50.0, None, 200.0]  #I
})

# Use the function to detect inconsistencies with AI assistance
discrepancies = detect_inconsistencies(df)  #J

# Output the detected inconsistencies
print("Detected Inconsistencies:", discrepancies)  #K


Detected Inconsistencies: {'email': "In the column 'email', there are a few inconsistencies:\n\n1. 'user2@.com': This email is likely invalid because there is a period immediately after the '@' symbol, which is not standard for email addresses.\n2. '555-1234': This appears to be a phone number rather than an email address, indicating a data entry error.\n\nThe purchase amount information is not included in the data you've provided, so I can't assess inconsistencies related to negative values without that information.", 'age': 'The request mentions checking the column \'age\' for inconsistencies and also references purchase amounts, which seems unrelated to the task at hand. Therefore, I\'ll focus on identifying inconsistencies in the \'age\' data provided: `[25.0, nan, 30.0, 40.0]`. Here\'s a breakdown:\n\n1. **25.0**: This is a valid age value.\n\n2. **nan**: \'nan\' stands for "Not a Number" and indicates missing data. This is an inconsistency if we expect every entry to have a valid

## 6.3

In [8]:
import pandas as pd

# Create a sample DataFrame
df = pd.DataFrame({
    'customer_id': [1, 2, 2, 3, 3],  #A
    'preferred_store_location': ['NY', 'CA', 'CA', 'TX', None],  #B
    'purchase_amount': [200, 300, 300, 400, 400],  #C
    'optional_note': [None, None, None, None, None]  #D
})

# Remove duplicate rows
df = df.drop_duplicates()  #E

# Remove columns that are primarily null (e.g., more than 50% null values)
threshold = 0.5 * len(df)  #F
df = df.loc[:, df.isnull().sum() <= threshold]  #G

# Output the cleaned DataFrame
print(df)  #H


   customer_id preferred_store_location  purchase_amount
0            1                       NY              200
1            2                       CA              300
3            3                       TX              400
4            3                     None              400


## 6.4

In [15]:
import pandas as pd
import openai
import os
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Define the response structure
class CleanedData(BaseModel):
    duplicates: List[int]
    drop_columns: List[str]

# Sample DataFrame
df = pd.DataFrame({
    'customer_id': [1, 2, 2, 3, 3],
    'preferred_store_location': ['NY', 'CA', 'CA', 'TX', None],
    'purchase_amount': [200, 300, 300, 400, 400],
    'optional_note': [None, None, None, None, None]
})

# Format the dataset
records = df.to_dict(orient="records")

# Prompt
prompt = (
    "You are a data cleaning assistant. Review the dataset and return the following:\n"
    "- A list of row indexes that are exact duplicates based on values.\n"
    "- A list of column names that contain more than 50% null values and should be dropped.\n"
    "Respond with only the required data for cleaning."
)

# Completion call with response_format
completion = openai.beta.chat.completions.parse(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": prompt},
        {"role": "user", "content": str(records)}
    ],
    response_format=CleanedData
)

# Extract structured output
cleaning_info = completion.choices[0].message.parsed

# Apply cleaning
df_cleaned = df.drop(index=cleaning_info.duplicates)
df_cleaned = df_cleaned.drop(columns=cleaning_info.drop_columns, errors='ignore')

# Show cleaned output
print(df_cleaned)


   customer_id preferred_store_location  purchase_amount
0            1                       NY              200
1            2                       CA              300
3            3                       TX              400
4            3                     None              400


## 6.5

In [20]:
import pandas as pd
from dateutil import parser  #A

# Create messy dataset from Bob's shop  #B
df = pd.DataFrame({  #C
    'purchase_date': ['12/31/2023', '2023-01-01', '01-15-2023'],  #D
    'sku': ['abc123', 'XYZ789', '123ABC'],  #E
    'product_description': ['red winter jacket - insulated and waterproof', 
                            'bindings - lightweight and durable',
                            'short snowboard - beginner friendly'],  #F
    'first_name': ['Ava', 'Leo', 'Riley'],  #G
    'last_name': ['Smith', 'Nguyen', 'Patel'],  #H
    'product_name': ['winter jacket', 'bindings', 'short snowboard']  #I
})

# Standardize purchase_date format to YYYY-MM-DD  #J
def normalize_date(val):  #K
    try:
        return parser.parse(val).strftime('%Y-%m-%d')  #L
    except Exception:
        return None  #M

df['purchase_date'] = df['purchase_date'].apply(normalize_date)  #N

# Enforce SKU format (3 uppercase letters + 3 digits)  #O
df['sku'] = df['sku'].str.upper().str.extract(r'([A-Z]{3}\d{3})', expand=False)  #P

# Truncate product_description to 20 characters  #Q
df['product_description'] = df['product_description'].str[:20]  #R

# Concatenate first and last names into full_name  #S
df['full_name'] = df['first_name'] + ' ' + df['last_name']  #T

# Map product_name to standardized categories  #U
category_map = {
    'winter jacket': 'outerwear',
    'bindings': 'gear',
    'short snowboard': 'boards'
}
df['product_category'] = df['product_name'].map(category_map)  #V

# Print cleaned DataFrame  #W
display(df)  #X


Unnamed: 0,purchase_date,sku,product_description,first_name,last_name,product_name,full_name,product_category
0,2023-12-31,ABC123,red winter jacket -,Ava,Smith,winter jacket,Ava Smith,outerwear
1,2023-01-01,XYZ789,bindings - lightweig,Leo,Nguyen,bindings,Leo Nguyen,gear
2,2023-01-15,,short snowboard - be,Riley,Patel,short snowboard,Riley Patel,boards


## 6.6

In [25]:
import pandas as pd
import openai
import os
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List, Optional

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Define the structured response format  #A
class StandardizationInstructions(BaseModel):  #B
    normalized_dates: List[str]                #C
    cleaned_skus: List[Optional[str]]          #D
    truncated_descriptions: List[str]          #E
    full_names: List[str]                      #F
    mapped_categories: List[str]               #G

# Create messy dataset from Bob’s shop  #H
df = pd.DataFrame({  #I
    'purchase_date': ['12/31/2023', '2023-01-01', '01-15-2023'],  #J
    'sku': ['abc123', 'XYZ789', '123ABC'],  #K
    'product_description': [
        'red winter jacket - insulated and waterproof', 
        'bindings - lightweight and durable',
        'short snowboard - beginner friendly'
    ],  #L
    'first_name': ['Ava', 'Leo', 'Riley'],  #M
    'last_name': ['Smith', 'Nguyen', 'Patel'],  #N
    'product_name': ['winter jacket', 'bindings', 'short snowboard']  #O
})

# Format the dataset for the prompt  #P
records = df.to_dict(orient="records")  #Q

# Compose prompt to request all cleaning instructions in one shot  #R
prompt = (
    "You are a data cleaning assistant. Given a dataset, return the following lists with values that match the row order exactly:\n"
    "- A list of normalized purchase_date values in YYYY-MM-DD format.\n"
    "- A list of SKUs that match the format: 3 uppercase letters followed by 3 digits. If the SKU is invalid, return null.\n"
    "- A list of product_description values truncated to 20 characters.\n"
    "- A list of full_names by combining first_name and last_name.\n"
    "- A list of standardized product categories mapped from product_name. Use one of: outerwear, gear, boards.\n"
    "Each list must contain exactly one value per row, and values must be in the same order as the input records."
)


# Call OpenAI API with structured response  #S
completion = openai.beta.chat.completions.parse(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": prompt},  #T
        {"role": "user", "content": str(records)}  #U
    ],
    response_format=StandardizationInstructions  #V
)

# Parse structured output  #W
cleaned = completion.choices[0].message.parsed  #X

# Apply cleaned values to the original DataFrame  #Y
df['purchase_date'] = cleaned.normalized_dates
df['sku'] = cleaned.cleaned_skus
df['product_description'] = cleaned.truncated_descriptions
df['full_name'] = cleaned.full_names
df['product_category'] = cleaned.mapped_categories

# Show the cleaned DataFrame  #Z
print(df)  #AA


  purchase_date     sku    product_description first_name last_name  \
0    2023-12-31    null    red winter jacket -        Ava     Smith   
1    2023-01-01  XYZ789  bindings - lightweigh        Leo    Nguyen   
2    2023-01-15    null  short snowboard - beg      Riley     Patel   

      product_name    full_name product_category  
0    winter jacket    Ava Smith        outerwear  
1         bindings   Leo Nguyen             gear  
2  short snowboard  Riley Patel           boards  
