In [2]:
import pandas as pd
import numpy as np
import re

# Load the raw retail sales data (the path points at data/raw)
df = pd.read_csv('../../data/raw/Retail_Sales_Data.csv')

# Quick inspection
print(df.head())
# DataFrame.info() writes its summary straight to stdout and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()


  retailer  store_id  week                   product  \
0  retail1         1     1       MintyFresh Mint 18g   
1  retail1         1     1    ChocoDelight Dark 200g   
2  retail1         1     1   ChocoDelight White 350g   
3  retail1         1     1  NuttyCream Hazelnuts 80g   
4  retail1         1     1        DarkDream Dark 60g   

                                         description  regular_price  \
0                    MintyFresh refresh mint 18grams      10.308645   
1                           200g ChocoDelight smooth       7.947956   
2                        350g ChocoDelight delicious      13.439368   
3                delightful Hazelnuts 80g NuttyCream       5.112777   
4  A decadent DarkDream dark chocolate ice cream ...      14.085425   

   competition_1_regular_price  competition_2_regular_price  \
0                    10.624815                    10.013451   
1                     7.748726                     7.589198   
2                    13.751508                 

In [None]:

# Independent copy of the description / reference product-name columns; a later
# comparison cell builds `comparison_df` from this frame.
product_info_df = df[['description', 'product']].copy()

In [4]:
# Show the description -> product pairs stored under index labels 0 through 9,
# numbered from 1 and each followed by a dashed separator line.
for label in range(10):
    desc_text, product_name = df.loc[label, ['description', 'product']]
    print(
        f"Example {label + 1}:",
        f"Description: {desc_text}",
        f"Product: {product_name}",
        "-----------",
        sep="\n",
    )

Example 1:
Description: MintyFresh refresh mint 18grams
Product: MintyFresh Mint 18g
-----------
Example 2:
Description: 200g ChocoDelight smooth
Product: ChocoDelight Dark 200g
-----------
Example 3:
Description: 350g ChocoDelight delicious
Product: ChocoDelight White 350g
-----------
Example 4:
Description: delightful Hazelnuts 80g NuttyCream
Product: NuttyCream Hazelnuts 80g
-----------
Example 5:
Description: A decadent DarkDream dark chocolate ice cream with a weight of 60 grams.
Product: DarkDream Dark 60g
-----------
Example 6:
Description: MintyFresh refresh mint 18grams
Product: MintyFresh Mint 18g
-----------
Example 7:
Description: A rich and smooth ChocoDelight dark chocolate bar weighing (150+50) 200 grams.
Product: ChocoDelight Dark 200g
-----------
Example 8:
Description: ChocoDelight White 350g creamy
Product: ChocoDelight White 350g
-----------
Example 9:
Description: Hazelnuts NuttyCream 80g  delightful
Product: NuttyCream Hazelnuts 80g
-----------
Example 10:
Descrip

### LLM Implementation with Haiku

In [None]:
import anthropic
import time
import pandas as pd

import os

# Credentials must not be stored in the notebook: read the key from the
# ANTHROPIC_API_KEY environment variable. The original placeholder is kept as
# the fallback so the cell still runs when the variable is unset (requests made
# with the placeholder are expected to be rejected by the API).
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "YOUR_API_KEY")

# Few-shot examples to guide Claude. This text is interpolated at the top of
# every prompt built in claude_extract_product_info. Each example pairs a raw
# description with the expected product name; examples are separated by the
# same "-----------" marker that the request passes as a stop sequence.
few_shot_examples = """
Example 1:
Description: MintyFresh refresh mint 18grams
Product: MintyFresh Mint 18g
-----------

Example 2:
Description: 200g ChocoDelight smooth
Product: ChocoDelight Dark 200g
-----------

Example 3:
Description: 350g ChocoDelight delicious
Product: ChocoDelight White 350g
-----------

Example 4:
Description: delightful Hazelnuts 80g NuttyCream
Product: NuttyCream Hazelnuts 80g
-----------

Example 5:
Description: A decadent DarkDream dark chocolate ice cream with a weight of 60 grams.
Product: DarkDream Dark 60g
-----------

Example 6:
Description: MintyFresh refresh mint 18grams
Product: MintyFresh Mint 18g
-----------

Example 7:
Description: A rich and smooth ChocoDelight dark chocolate bar weighing (150+50) 200 grams.
Product: ChocoDelight Dark 200g
-----------

Example 8:
Description: ChocoDelight White 350g creamy
Product: ChocoDelight White 350g
-----------

Example 9:
Description: Hazelnuts NuttyCream 80g delightful
Product: NuttyCream Hazelnuts 80g
-----------

Example 10:
Description: A decadent DarkDream dark chocolate ice cream with a weight of 60 grams.
Product: DarkDream Dark 60g
"""

def claude_extract_product_info(df, model="claude-3-haiku-20240307", delay=1.3):
    """Extract a product name from each description by prompting Claude.

    One request is sent per row of ``df['description']``. The module-level
    ``few_shot_examples`` text is placed at the top of every prompt, and the
    reply is stored in a new ``Product`` column.

    Args:
        df: DataFrame containing a ``description`` column. Any index is
            accepted; results are written back by index label.
        model: Model identifier passed to ``client.messages.create``.
        delay: Seconds to sleep after each successful request (simple
            client-side throttle). A falsy value disables the pause.

    Returns:
        A copy of ``df[['description']]`` with an added ``Product`` column.
        Rows whose request raised an exception keep ``Product`` as ``None``;
        the error is printed and processing continues with the next row.
    """
    info_df = df[['description']].copy()

    # Add a column for the extracted Product information
    info_df['Product'] = None

    # Initialize Anthropic client
    client = anthropic.Client(api_key=ANTHROPIC_API_KEY)

    # Iterate over (index label, description) pairs. Writing back with the
    # label rather than an enumerate() counter keeps rows aligned when `df` is
    # a slice whose index does not start at 0 (a counter would make `.at`
    # create new rows or overwrite the wrong ones).
    for idx, desc in info_df['description'].items():
        prompt = f"""
                        {few_shot_examples}

                        For the following description, extract only the brand, product type, and weight. Avoid descriptors like "delicious," "tasty," or any adjective unrelated to the flavour or type (such as "mint," "dark," "white," or "hazelnut"). Format the response as shown in the examples without adding "Product:" or any explanation.

                        Description: "{desc}"

                        Return in this exact format:
                        "[Brand] [Type or Flavour] [Weight]"
                        """

        try:
            # Create message request to Claude
            response = client.messages.create(
                model=model,
                messages=[
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": ""}
                ],
                stop_sequences=["-----------"],
                max_tokens=100,
                temperature=0
            )

            # Get response text. Surrounding whitespace is stripped because the
            # downstream evaluation compares names by exact string equality.
            product_text = response.content[0].text.strip()
            print(f"Processed Product for Description {idx}: \n{product_text}\n")

            # Store the product name in the Product column
            info_df.at[idx, 'Product'] = product_text

            # Add delay to avoid rate limit issues
            if delay:
                time.sleep(delay)

        except Exception as e:
            # Best-effort: report the failure and leave this row's Product as None.
            print(f"Error processing description at index {idx}: {e}")
            continue

    return info_df

# Run the extraction on the first 1000 rows only (one API request per row),
# then preview the result.
extracted_info_df = claude_extract_product_info(df.iloc[:1000])
print(extracted_info_df.head())

Processed Product for Description 0: 
MintyFresh Mint 18g

Processed Product for Description 1: 
ChocoDelight Dark 200g

Processed Product for Description 2: 
ChocoDelight Dark 350g

Processed Product for Description 3: 
NuttyCream Hazelnuts 80g

Processed Product for Description 4: 
DarkDream Dark 60g

Processed Product for Description 5: 
MintyFresh Mint 18g

Processed Product for Description 6: 
ChocoDelight Dark 200g

Processed Product for Description 7: 
ChocoDelight White 350g

Processed Product for Description 8: 
NuttyCream Hazelnuts 80g

Processed Product for Description 9: 
DarkDream Dark 60g

Processed Product for Description 10: 
MintyFresh Mint 18g

Processed Product for Description 11: 
ChocoDelight Dark 200g

Processed Product for Description 12: 
ChocoDelight 350g

Processed Product for Description 13: 
NuttyCream Hazelnuts 80g

Processed Product for Description 14: 
DarkDream Dark 60g

Processed Product for Description 15: 
MintyFresh Mint 18g

Processed Product for De

In [6]:
# Preview the first rows of the extraction result (description and Product columns)
print(extracted_info_df.head())

                                         description                   Product
0                    MintyFresh refresh mint 18grams       MintyFresh Mint 18g
1                           200g ChocoDelight smooth    ChocoDelight Dark 200g
2                        350g ChocoDelight delicious    ChocoDelight Dark 350g
3                delightful Hazelnuts 80g NuttyCream  NuttyCream Hazelnuts 80g
4  A decadent DarkDream dark chocolate ice cream ...        DarkDream Dark 60g


In [None]:
import pandas as pd



# Write the extraction result to disk
output_path = '../../data/processed/extracted_product_info.csv'
extracted_info_df.to_csv(output_path, index=False)
print(f"Extracted product information saved to {output_path}")

# Re-read the raw file and keep its first 1000 rows, the same slice that was
# sent through the extractor
original_df = pd.read_csv('../../data/raw/Retail_Sales_Data.csv')
original_df_1000 = original_df.iloc[:1000]

# Put the reference name and the extracted name side by side (aligned on the
# 0-based index) and flag exact string matches
comparison_df = (
    original_df_1000[['product']]
    .reset_index(drop=True)
    .assign(Extracted_Product=extracted_info_df['Product'])
    .assign(Match=lambda frame: frame['product'] == frame['Extracted_Product'])
)

# Share of exact matches, as a percentage
accuracy = comparison_df['Match'].mean() * 100
print(f"Accuracy of extracted product names: {accuracy:.2f}%")

# Show the first few rows that did not match
mismatches = comparison_df.loc[~comparison_df['Match']]
print("Sample mismatches:")
print(mismatches.head(10))


Extracted product information saved to ../../data/processed/extracted_product_info.csv
Accuracy of extracted product names: 88.80%
Sample mismatches:
                    product       Extracted_Product  Match
2   ChocoDelight White 350g  ChocoDelight Dark 350g  False
12  ChocoDelight White 350g       ChocoDelight 350g  False
17  ChocoDelight White 350g  ChocoDelight Dark 350g  False
22  ChocoDelight White 350g  ChocoDelight Dark 350g  False
32  ChocoDelight White 350g       ChocoDelight 350g  False
42  ChocoDelight White 350g  ChocoDelight Dark 350g  False
57  ChocoDelight White 350g  ChocoDelight Dark 350g  False
77  ChocoDelight White 350g       ChocoDelight 350g  False
82  ChocoDelight White 350g       ChocoDelight 350g  False
92  ChocoDelight White 350g       ChocoDelight 350g  False


In [22]:
# Rebuild the comparison table with the description included, restricted to
# positions 10 through 999: `.iloc[10:1000]` includes position 10 and excludes
# position 1000. NOTE(review): presumably the first ten rows are skipped
# because they are the few-shot examples shown to the model - confirm.
comparison_df = (
    product_info_df[['description', 'product']]
    .assign(Extracted_Product=extracted_info_df['Product'])  # aligned on index
    .iloc[10:1000]
    .copy()
)

# Flag exact matches between the reference and the extracted product name
comparison_df['Match'] = comparison_df['product'].eq(comparison_df['Extracted_Product'])

# Keep only the rows that did not match
mismatches_df = comparison_df.loc[~comparison_df['Match']]

# Display mismatches
print("Mismatched entries between index 10 and 1000:")
display(mismatches_df)



Mismatched entries between index 10 and 1000:


Unnamed: 0,description,product,Extracted_Product,Match
12,creamy ChocoDelight 350g,ChocoDelight White 350g,ChocoDelight 350g,False
17,350g ChocoDelight delicious,ChocoDelight White 350g,ChocoDelight Dark 350g,False
22,350g ChocoDelight delicious,ChocoDelight White 350g,ChocoDelight Dark 350g,False
32,creamy ChocoDelight 350g,ChocoDelight White 350g,ChocoDelight 350g,False
42,350g ChocoDelight delicious,ChocoDelight White 350g,ChocoDelight Dark 350g,False
...,...,...,...,...
937,creamy ChocoDelight 350g,ChocoDelight White 350g,ChocoDelight 350g,False
962,creamy ChocoDelight 350g,ChocoDelight White 350g,ChocoDelight 350g,False
972,350g ChocoDelight delicious,ChocoDelight White 350g,ChocoDelight Dark 350g,False
977,350g ChocoDelight delicious,ChocoDelight White 350g,ChocoDelight Dark 350g,False


In [23]:
# Rows of the comparison table whose extracted name differs from the reference
mismatches_df = comparison_df.loc[~comparison_df['Match']]

# Tally how often each wrong extraction occurs
unique_mis_extracted = mismatches_df['Extracted_Product'].value_counts()

# Report the tally
print("Unique mis-extracted products and their counts:", unique_mis_extracted, sep="\n")


Unique mis-extracted products and their counts:
ChocoDelight Dark 350g    56
ChocoDelight 350g         55
Name: Extracted_Product, dtype: int64


In [24]:
# Rows of the comparison table whose extracted name equals the reference
correct_matches = comparison_df.loc[comparison_df['Match']]

# Tally the correct extractions per product name
unique_correct_counts = correct_matches['Extracted_Product'].value_counts()

# Report the tally
print("Unique correctly extracted products and their counts:", unique_correct_counts, sep="\n")


Unique correctly extracted products and their counts:
MintyFresh Mint 18g         198
ChocoDelight Dark 200g      198
NuttyCream Hazelnuts 80g    198
DarkDream Dark 60g          198
ChocoDelight White 350g      87
Name: Extracted_Product, dtype: int64


### spaCy and Regex Example

In [26]:
import spacy

# Load spaCy's pre-trained small English pipeline; later cells call `nlp` on
# each description to read part-of-speech tags and entities.
# NOTE: this needs the `spacy` package in the kernel environment (the recorded
# run of this cell failed with ModuleNotFoundError).
nlp = spacy.load("en_core_web_sm")


ModuleNotFoundError: No module named 'spacy'

In [None]:
from collections import Counter
import spacy

# Load spaCy's pre-trained model
nlp = spacy.load("en_core_web_sm")

# Gather, in lower case, every token that spaCy tags as an adjective across all
# descriptions; these are the candidate colour/flavour words.
potential_colors = [
    token.text.lower()
    for description in df['description']
    for token in nlp(description)
    if token.pos_ == "ADJ"
]

# Rank the adjectives by frequency and show the 20 most common
color_counts = Counter(potential_colors)
print("Most common potential colors:", color_counts.most_common(20))


Most common potential colors: [('decadent', 1346), ('rich', 1333), ('creamy', 1329), ('delightful', 1313), ('dark', 894), ('delicious', 448), ('white', 448), ('smooth', 440)]


In [None]:
import spacy
import re

# Load spaCy's pre-trained model
nlp = spacy.load("en_core_web_sm")

# Known brands list (add more as needed). extract_product_info uses it twice:
# to accept only ORG entities whose text is in this list, and as a plain
# substring fallback when no such entity is found.
known_brands = ["MintyFresh", "ChocoDelight", "NuttyCream", "DarkDream"]

# Amount followed by a unit. Longer unit spellings come first so that
# "kilograms" is not cut short, and the look-behind stops the match from
# starting at the fractional part of a decimal ("1.5 kg" must not read "5 kg").
_WEIGHT_PATTERN = re.compile(
    r'\b(?<!\d\.)(\d+(?:\.\d+)?)\s*(kilograms?|kg|grams?|g|ml)\b',
    re.IGNORECASE,
)
_KILO_UNITS = frozenset({'kg', 'kilogram', 'kilograms'})


def _parse_weight(description):
    """Return the first weight found in `description` (kg converted to grams), or None."""
    match = _WEIGHT_PATTERN.search(description)
    if match is None:
        return None
    amount, unit = match.groups()
    value = float(amount)
    if unit.lower() in _KILO_UNITS:
        value *= 1000  # Convert to grams
    # Round away float noise from the conversion, and keep whole numbers as int
    # so integer inputs yield the same values as before.
    value = round(value, 6)
    return int(value) if value.is_integer() else value


def extract_product_info(description):
    """Extract brand, weight and descriptive adjectives from one description.

    Args:
        description: Free-text product description (a string).

    Returns:
        ``pd.Series([brand, weight, descriptors])`` where
        - ``brand`` is the first ORG entity whose text is in ``known_brands``,
          else the first entry of ``known_brands`` occurring as a substring of
          the description, else ``None``;
        - ``weight`` is the first "<number><unit>" amount found, with kg /
          kilogram(s) multiplied by 1000, or ``None`` when nothing matches
          (``ml`` amounts are returned unconverted);
        - ``descriptors`` is the sorted list of lower-cased tokens tagged as
          adjectives (``pos_ == "ADJ"``) or adjectival modifiers
          (``dep_ == "amod"``).
    """
    doc = nlp(description)

    # Extract brand using NER and known brands list as fallback
    brand = next(
        (ent.text for ent in doc.ents if ent.label_ == "ORG" and ent.text in known_brands),
        None,
    )
    if not brand:
        brand = next((b for b in known_brands if b in description), None)

    weight = _parse_weight(description)

    # Collect the adjectives, sorted for consistent ordering
    descriptors = sorted(
        token.text.lower()
        for token in doc
        if token.pos_ == "ADJ" or token.dep_ == "amod"
    )

    return pd.Series([brand, weight, descriptors])

# Apply the function to extract information. extract_product_info returns a
# three-element Series per row, so `apply` yields a three-column frame that is
# assigned, in order, to the new brand / weight / descriptors columns of `df`.
df[['brand', 'weight', 'descriptors']] = df['description'].apply(extract_product_info)

# Review the results on 10 randomly sampled rows (no seed is set, so the rows
# shown differ between runs)
print(df[['description', 'brand', 'weight', 'descriptors']].sample(10))




                                            description         brand  weight  \
158                     80g NuttyCream flavor Hazelnuts    NuttyCream    80.0   
5142                     ChocoDelight White 350g creamy  ChocoDelight   350.0   
1792  A creamy and delicious ChocoDelight white choc...  ChocoDelight   350.0   
3156                           200g ChocoDelight smooth  ChocoDelight   200.0   
639                         Dark DarkDream 60g decadent     DarkDream    60.0   
1192                     ChocoDelight White 350g creamy  ChocoDelight   350.0   
3530                    MintyFresh refresh mint 18grams    MintyFresh    18.0   
1250  A refreshing MintyFresh candy with a weight of...    MintyFresh     NaN   
5456                             rich ChocoDelight 200g  ChocoDelight   200.0   
2184  A decadent DarkDream dark chocolate ice cream ...     DarkDream    60.0   

                     descriptors  
158                           []  
5142                    [creamy]  
179

In [None]:
# Spot-check another 10 randomly sampled rows of the extracted features
print(df[['description', 'brand', 'weight', 'descriptors']].sample(10))

                                            description         brand  weight  \
3039                                 60g Dark DarkDream     DarkDream    60.0   
7676  A rich and smooth ChocoDelight dark chocolate ...  ChocoDelight   200.0   
787                            creamy ChocoDelight 350g  ChocoDelight   350.0   
4879                        Dark DarkDream 60g decadent     DarkDream    60.0   
4552                           creamy ChocoDelight 350g  ChocoDelight   350.0   
3227                     ChocoDelight White 350g creamy  ChocoDelight   350.0   
2096                        ChocoDelight Dark 200g rich  ChocoDelight   200.0   
2413                delightful Hazelnuts 80g NuttyCream    NuttyCream    80.0   
6950                    MintyFresh refresh mint 18grams    MintyFresh    18.0   
1753                    80g NuttyCream flavor Hazelnuts    NuttyCream    80.0   

               descriptors  
3039                    []  
7676  [dark, rich, smooth]  
787               [cr

                                            description         brand  weight
1807                        350g ChocoDelight delicious  ChocoDelight   350.0
755                              MintyFresh 18g refresh    MintyFresh    18.0
801                            200g ChocoDelight smooth  ChocoDelight   200.0
120                              MintyFresh 18g refresh    MintyFresh    18.0
168                Hazelnuts NuttyCream 80g  delightful    NuttyCream    80.0
1606                           200g ChocoDelight smooth  ChocoDelight   200.0
776                            200g ChocoDelight smooth  ChocoDelight   200.0
4365                             MintyFresh 18g refresh    MintyFresh    18.0
3981                             rich ChocoDelight 200g  ChocoDelight   200.0
6179  A decadent DarkDream dark chocolate ice cream ...     DarkDream    60.0


In [None]:
# List the distinct values extracted for brand and for weight
for label, column in (("Unique brands:", 'brand'), ("Unique weights:", 'weight')):
    print(label, df[column].unique())


Unique brands: ['MintyFresh' 'ChocoDelight' 'NuttyCream' 'DarkDream']
Unique weights: [ 18. 200. 350.  80.  60.  nan]


In [None]:
# Weights below 1 or above 5000 are treated as suspicious (missing weights
# satisfy neither comparison and are therefore not listed)
invalid_weights = df.loc[df['weight'].lt(1) | df['weight'].gt(5000)]
print("Rows with unusual weights:")
print(invalid_weights[['description', 'brand', 'weight']])

# Brands seen fewer than 3 times may be extraction errors
brand_counts = df['brand'].value_counts()
rare_brands = brand_counts.loc[brand_counts.lt(3)].index
print("Rare brands that might be errors:", rare_brands)

# Show the rows carrying one of those rare brands
print(df.loc[df['brand'].isin(rare_brands), ['description', 'brand', 'weight']])


Rows with unusual weights:
Empty DataFrame
Columns: [description, brand, weight]
Index: []
Rare brands that might be errors: Index([], dtype='object', name='brand')
Empty DataFrame
Columns: [description, brand, weight]
Index: []


In [None]:
# Integer-encode the brand
df['brand_category'] = pd.factorize(df['brand'])[0]

# Integer-encode the descriptor list: join each list into one comma-separated
# key (an empty list becomes None, which factorize encodes as -1)
descriptor_keys = df['descriptors'].map(lambda words: ', '.join(words) if words else None)
df['descriptors_category'] = pd.factorize(descriptor_keys)[0]

# Remove the original text columns now that the codes exist.
# NOTE(review): this cell mutates `df` in place, so running it a second time
# fails because `brand` is already gone; a later cell that reloads the exported
# CSV also looks for a `brand` column - confirm which behaviour is intended.
df.drop(columns=['brand', 'descriptors'], inplace=True)

# Review the result
print(df.head(10))

  retailer  store_id  week                   product  regular_price     value  \
0  retail1         1     1       MintyFresh Mint 18g      10.308645  4.458621   
1  retail1         1     1    ChocoDelight Dark 200g       7.947956  4.429920   
2  retail1         1     1   ChocoDelight White 350g      13.439368  3.980200   
3  retail1         1     1  NuttyCream Hazelnuts 80g       5.112777  3.899420   
4  retail1         1     1        DarkDream Dark 60g      14.085425  3.886054   
5  retail1         1     2       MintyFresh Mint 18g      16.415525  4.260473   
6  retail1         1     2    ChocoDelight Dark 200g      13.768042  4.007392   
7  retail1         1     2   ChocoDelight White 350g       9.337017  4.316094   
8  retail1         1     2  NuttyCream Hazelnuts 80g      17.867154  4.089784   
9  retail1         1     2        DarkDream Dark 60g      18.006656  4.408259   

     volume  quantity  competition_1_discount_depth  \
0  3.953882  2.370986                      5.168518  

In [None]:
# Export the feature-augmented DataFrame to CSV (without the index). The
# filename is fixed, so re-running this cell overwrites any file already at
# this path.
df.to_csv('../../data/processed/processed_data_with_features.csv', index=False)


In [25]:
import pandas as pd

# Load the processed data with spaCy features
df_spacy = pd.read_csv('../../data/processed/processed_data_with_features.csv')

# The product name is rebuilt from three components; report exactly which ones
# are absent instead of a generic message.
required_columns = ['brand', 'flavor', 'weight']
missing_columns = [col for col in required_columns if col not in df_spacy.columns]
if missing_columns:
    raise KeyError(
        f"Cannot reconstruct product names: column(s) {missing_columns} are missing "
        f"in the loaded DataFrame (available: {list(df_spacy.columns)})."
    )


def _reconstruct_product(row):
    """Return '<brand> <flavor> <weight>g' for one row, or None if any part is missing."""
    if row.isna().any():
        return None
    # `weight` is numeric (a float such as 18.0 when the column contains NaN),
    # so it cannot be concatenated to strings directly; the reference names
    # spell it as "18g".
    weight = float(row['weight'])
    weight_text = str(int(weight)) if weight.is_integer() else str(weight)
    return f"{row['brand']} {row['flavor']} {weight_text}g"


# Construct the product name by combining 'brand', 'flavor', and 'weight'
df_spacy['Reconstructed_Product'] = df_spacy[required_columns].apply(_reconstruct_product, axis=1)

# Load original dataset with product names for comparison
df_original = pd.read_csv('../../data/raw/Retail_Sales_Data.csv')[['description', 'product']]

# Line both frames up on their shared 0-based index for comparison
# (assumes the processed file kept the raw file's row order - TODO confirm)
df_comparison = df_original[['product']].copy()
df_comparison['Extracted_Product'] = df_spacy['Reconstructed_Product']

# Add a column to indicate if the extracted product matches the original;
# rows that could not be reconstructed (None) count as mismatches
df_comparison['Match'] = df_comparison['product'] == df_comparison['Extracted_Product']

# Calculate the accuracy
accuracy = df_comparison['Match'].mean() * 100

# Display the accuracy and sample mismatches
print(f"Accuracy of spaCy-extracted product names: {accuracy:.2f}%")

# Display sample mismatches if any
mismatches = df_comparison.loc[~df_comparison['Match'], ['product', 'Extracted_Product']]
print("Sample mismatches between original and spaCy-extracted product names:")
print(mismatches.head(10))


KeyError: "Columns 'brand', 'weight', or 'flavor' are missing in the loaded DataFrame."