# Lab

##### Objective: The main purpose of this lab is to get familiar with rule-based NLP and regular expressions (regex).

## Part 1: Rule Based NLP and Regex:

Using regex, write Python code that generates a bill from a text given by the user.
Use case:
“I bought three Samsung smartphones 150 $ each, four kilos of fresh banana for 1,2 dollar a
kilogram and one Hamburger with 4,5 dollar”.

Generated Bill :

Product Quantity Unit Price Total Price 

Samsung smart phone 3 150 450

Banana 4 1,2 4,8

Hamburger 1 4,5 4,5

In [130]:
from nltk.corpus import stopwords

def preprocess_text(text):
    """Strip stop words, filler words and measurement units from *text*.

    The text is split on whitespace and every token that exactly matches
    (case-sensitively) an English stop word from NLTK, one of the filler
    words, or one of the measurement units is discarded. Because tokens are
    compared whole, a word with punctuation attached (e.g. ``"each,"``) is
    kept.

    Args:
        text: The raw sentence to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # One combined exclusion set: a token is dropped if it appears in any of
    # the three word lists.
    excluded = set(stopwords.words('english'))
    excluded.update(['for', 'each', 'a', 'an', 'the', 'with', 'I', 'of', 'dollar'])
    excluded.update(['kilos', 'kilograms', 'kilogram', 'grams', 'pounds', 'liters', 'milliliters', 'ounces', 'gallons', 'units'])

    kept_tokens = filter(lambda token: token not in excluded, text.split())
    return ' '.join(kept_tokens)

### 1- Python Regex

In [131]:
import re
from word2number import w2n

def generate_bill(text):
    """Extract ``quantity product price`` triples from *text* and render a bill.

    The text is first cleaned with ``preprocess_text``. A quantity is either
    a run of digits or one of the words "one" to "nine"; a price is an
    integer optionally followed by a comma and more digits (comma used as
    the decimal separator).

    Args:
        text: Free-form purchase description.

    Returns:
        A multi-line string with a header row, one aligned row per matched
        item and a final ``Total Bill`` line; totals are rounded to two
        decimals.
    """
    row_template = "{:<20} {:<10} {:<10} {}\n"
    item_pattern = r'(\b\d+|\b(?:one|two|three|four|five|six|seven|eight|nine)) ([\w\s]+) (\d+(?:,\d+)?)'

    cleaned = preprocess_text(text)

    pieces = [
        "Generated Bill:\n\n",
        row_template.format("Product", "Quantity", "Unit Price", "Total Price"),
    ]
    running_total = 0

    for qty_token, product, price_token in re.findall(item_pattern, cleaned):
        # Spelled-out quantities go through word2number; digits are parsed directly.
        if qty_token.isalpha():
            quantity = w2n.word_to_num(qty_token)
        else:
            quantity = int(qty_token)

        # Prices use a decimal comma ("1,2"), which float() does not accept.
        unit_price = float(price_token.replace(',', '.'))
        line_total = quantity * unit_price
        running_total += line_total
        pieces.append(row_template.format(product, quantity, unit_price, round(line_total, 2)))

    pieces.append("\nTotal Bill: {}".format(round(running_total, 2)))
    return ''.join(pieces)

# Example usage: build and print the bill for the lab's sample sentence.
text = "I bought three Samsung smartphones 150 $ each, four kilos of fresh banana for 1,2 dollar a kilogram and one Hamburger with 4,5 dollar"
print(generate_bill(text))

Generated Bill:

Product              Quantity   Unit Price Total Price
Samsung smartphones  3          150.0      450.0
fresh banana         4          1.2        4.8
Hamburger            1          4.5        4.5

Total Bill: 459.3


### 2- Similarity using Spacy.Matcher

In [132]:
import spacy
from spacy.matcher import Matcher
from word2number import w2n

def word_to_num(word):
    """Convert a number word to its numeric value, or return *word* unchanged.

    Delegates to ``w2n.word_to_num``. When that call raises ``ValueError``
    the input is handed back as-is, so callers can check the result's type
    to see whether the conversion succeeded.
    """
    try:
        number = w2n.word_to_num(word)
    except ValueError:
        return word
    else:
        return number

# Load the `en_core_web_sm` spaCy pipeline and create a token Matcher bound to its vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Token pattern: a number-like token (quantity), one or more alphabetic tokens
# (product name), an optional whitespace token, then a number-like token (price).
pattern = [{"LIKE_NUM": True}, {"IS_ALPHA": True, "OP": "+"}, {"IS_SPACE": True, "OP": "?"}, {"LIKE_NUM": True}]
matcher.add("PRODUCT_PATTERN", [pattern])

# Run the matcher over the preprocessed sample sentence.
text = "I bought three Samsung smartphones 150 $ each, four kilos of fresh banana for 1,2 dollar a kilogram and one Hamburger with 4,5 dollar"
preprocessed_text = preprocess_text(text)
doc = nlp(preprocessed_text)
matches = matcher(doc)

print("Generated Bill:")
print("Product\t\tQuantity\tUnit Price\tTotal Price")
total_bill = 0
for match_id, start, end in matches:
    span = doc[start:end]
    # Split the matched text on single spaces: first piece is the quantity,
    # last piece is the price, everything in between is the product name.
    # NOTE(review): assumes the span's tokens are separated by exactly one space.
    parts = span.text.split(" ")
    quantity = word_to_num(parts[0])

    # Handle the case where the price is a word (e.g., "one")
    if parts[-1].isalpha():
        price = word_to_num(parts[-1])  # Convert to number if possible
    else:
        # Numeric prices use a decimal comma ("1,2"); convert it for float().
        price = float(parts[-1].replace(",", "."))

    # Ensure quantity is always a number (second pass; word_to_num returns
    # its input unchanged when it cannot convert it)
    quantity = word_to_num(quantity)

    # Only multiply if both quantity and price are numbers
    if isinstance(quantity, (int, float)) and isinstance(price, (int, float)):
        total_price = quantity * price
    else:
        # Fallback: quantity (or price) could not be converted, so use the price as-is.
        total_price = price

    product = " ".join(parts[1:-1])
    
    # Report the match only when the first word of the product name is NOT
    # all-lowercase; matches whose product starts with a lowercase word are skipped.
    # NOTE(review): this also discards genuine products that start with a
    # lowercase word -- the recorded output of this cell has no "fresh banana"
    # row; confirm whether that is intended.
    if not product.split()[0].islower():
        print(f"{product}\t{quantity}\t{price}\t{total_price}")
        total_bill += total_price

print("\nTotal Bill:", total_bill)


Generated Bill:
Product		Quantity	Unit Price	Total Price
Samsung smartphones	3	150.0	450.0
Hamburger	1	4.5	4.5

Total Bill: 454.5


In [133]:
import spacy
from spacy.matcher import Matcher
from word2number import w2n

def word_to_num(word):
    """Return the numeric value of a number word, falling back to the input.

    ``w2n.word_to_num`` does the conversion; a ``ValueError`` from it means
    the input is not a recognisable number, in which case *word* itself is
    returned unchanged.
    """
    result = word
    try:
        result = w2n.word_to_num(word)
    except ValueError:
        # Not convertible: keep the original value.
        pass
    return result

# Load the `en_core_web_sm` spaCy pipeline and create a token Matcher bound to its vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Token pattern: one non-alphabetic token (an anchor preceding the item), a
# number-like token (quantity), one or more alphabetic tokens (product name),
# then a number-like token (price).
pattern = [{"IS_ALPHA": False}, {"LIKE_NUM": True}, {"IS_ALPHA": True, "OP": "+"}, {"LIKE_NUM": True}]
matcher.add("PRODUCT_PATTERN", [pattern])

# Run the matcher over the preprocessed sample sentence.
text = "I bought three Samsung smartphones 150 $ each, four kilos of fresh banana for 1,2 dollar a kilogram and one Hamburger with 4,5 dollar"
preprocessed_text = preprocess_text(text)
doc = nlp(preprocessed_text)
matches = matcher(doc)

print("Generated Bill:")
print("Product\t\tQuantity\tUnit Price\tTotal Price")
total_bill = 0
for match_id, start, end in matches:
    span = doc[start:end]
    # Split the matched text on single spaces. Index 0 is the leading
    # non-alphabetic anchor token, so the quantity sits at index 1, the product
    # name spans indices 2..-2 and the price is the last piece.
    # NOTE(review): assumes the span's tokens are separated by exactly one space.
    parts = span.text.split(" ")
    quantity = word_to_num(parts[1])

    # Handle the case where the price is a word (e.g., "one")
    if parts[-1].isalpha():
        price = word_to_num(parts[-1])  # Convert to number if possible
    else:
        # Numeric prices use a decimal comma ("1,2"); convert it for float().
        price = float(parts[-1].replace(",", "."))

    # Ensure quantity is always a number (second pass; word_to_num returns
    # its input unchanged when it cannot convert it)
    quantity = word_to_num(quantity)

    # Only multiply if both quantity and price are numbers
    if isinstance(quantity, (int, float)) and isinstance(price, (int, float)):
        total_price = quantity * price
    else:
        # Fallback: quantity (or price) could not be converted, so use the price as-is.
        total_price = price

    # NOTE(review): because the pattern requires a non-alphabetic token before
    # the quantity, an item preceded by a word cannot match -- the recorded
    # output of this cell has no "Samsung smartphones" row; confirm whether
    # that is intended.
    product = " ".join(parts[2:-1])
    print(f"{product}\t{quantity}\t{price}\t{total_price}")
    total_bill += total_price

print("\nTotal Bill:", total_bill)


Generated Bill:
Product		Quantity	Unit Price	Total Price
fresh banana	4	1.2	4.8
Hamburger	1	4.5	4.5

Total Bill: 9.3


### 3- Matching Product names

In [134]:
def get_product_names(text):
    """Return the catalogue products that occur in *text*.

    The catalogue is read from ``products.txt`` (resolved against the
    current working directory); each non-blank line is treated as one
    product name. Matching is a case-insensitive substring test against the
    preprocessed text.

    Args:
        text: Free-form purchase description.

    Returns:
        A list of lower-cased product names found in the text, in the order
        they appear in ``products.txt``.

    Raises:
        OSError: If ``products.txt`` cannot be opened (e.g. it is missing).
    """
    with open('products.txt', 'r') as f:
        stripped_lines = (line.strip().lower() for line in f)
        # Skip blank lines: an empty string is a substring of every text, so
        # it would otherwise always be reported as a "found" product.
        product_list = [name for name in stripped_lines if name]

    lower_text = preprocess_text(text).lower()

    return [product for product in product_list if product in lower_text]

# Example usage: list the catalogue products mentioned in the lab's sample sentence.
text = "I bought three Samsung smartphones 150 $ each, four kilos of fresh banana for 1,2 dollar a kilogram and one Hamburger with 4,5 dollar"
print(get_product_names(text))

['fresh banana', 'hamburger', 'samsung smartphones']


In [135]:
def extract_info(text):
    """Build a plain-text bill from *text* using the known product names.

    The text is preprocessed, the product names present in it are obtained
    from ``get_product_names`` and a regex then looks for
    ``<quantity> <product> <price>`` triples. A quantity is either a run of
    digits or one of the words "one" to "nine" (any letter case); a price
    may use a comma as the decimal separator. Triples whose quantity or
    price cannot be interpreted are skipped.

    Args:
        text: Free-form purchase description.

    Returns:
        The bill as a tab-separated, multi-line string that ends with a
        ``Total Bill`` line. When no known product is found, only the header
        and a zero total are returned.
    """
    number_words = ('one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine')

    text = preprocess_text(text)
    # Drop empty names: an empty alternative in the regex would let it match
    # any "<word> <number>" pair as a product line.
    product_names = [name for name in get_product_names(text) if name]

    bill = "Generated Bill:\n\n"
    bill += "Product\t\t\t\tQuantity\tUnit Price\tTotal Price\n"
    total_bill = 0

    matches = []
    if product_names:
        # Quantity token, one of the known product names, then the price.
        pattern = r'(\w+)\s+(' + '|'.join(map(re.escape, product_names)) + r')\s+([\d,]+(?:\.\d+)?)'
        matches = re.findall(pattern, text, re.IGNORECASE)

    for quantity, product, price in matches:
        if quantity.isdecimal():
            quantity = int(quantity)
        elif quantity.lower() in number_words:
            # The regex is case-insensitive, so accept "Three" as well as "three".
            quantity = w2n.word_to_num(quantity)
        else:
            # The word before the product is not a quantity; not a bill line.
            continue

        try:
            # Prices use a decimal comma ("1,2"), which float() does not accept.
            unit_price = float(price.replace(',', '.'))
        except ValueError:
            # e.g. a lone "," or "1,000.5": not a usable price, skip the line.
            continue

        # Round like generate_bill does, to hide binary floating-point noise.
        total_price = round(quantity * unit_price, 2)
        total_bill += total_price
        bill += f"{product}\t\t{quantity}\t\t{price}\t\t{total_price}\n"

    bill += f"Total Bill: {round(total_bill, 2)}"
    return bill


# Example usage: build and print the catalogue-based bill for the lab's sample sentence.
text = "I bought three Samsung smartphones 150 $ each, four kilos of fresh banana for 1,2 dollar a kilogram and one Hamburger with 4,5 dollar"
print(extract_info(text))

Generated Bill:

Product				Quantity	Unit Price	Total Price
Samsung smartphones		3		150		450.0
fresh banana		4		1,2		4.8
Hamburger		1		4,5		4.5
Total Bill: 459.3
