To import `delta`, make sure you have selected the right kernel.

If the below cell doesn't work, you might need to install the kernel first.

For doing so, follow these steps:

1. conda install ipykernel
2. python -m ipykernel install --user --name=bills --display-name "Python (bills)"

Now you should be able to select the "Python (bills)" kernel, which points to the conda environment
containing the libraries for this project.

In [None]:
import os
import re
from pathlib import Path
from pyspark.sql import SparkSession
from operator import itemgetter

from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType

from pyspark.sql.functions import lit, input_file_name, split, col
import uuid
from datetime import datetime
from delta import *

In [None]:
!python --version

In [None]:
!which python

In [None]:
import sys

# Print the full interpreter version string
print(sys.version)

# Print the structured version info (besides major, minor and micro,
# sys.version_info also carries releaselevel and serial)
print(sys.version_info)

The cell below uses the `configure_spark_with_delta_pip` function from the delta library, which adds the necessary
Maven dependencies to the configuration of the underlying PySpark process. The implementation of the function can be found here:

https://github.com/delta-io/delta/blob/da162a097a25524fc97334f47a180257cb487789/python/delta/pip_utils.py#L23

In [None]:
# Settings applied, in this order, to a local two-thread Spark session with
# Hive support and the Delta Lake SQL extension and catalog.
# NOTE(review): the executor/cores.max values are presumably ignored when the
# master is local[2] — confirm before relying on them.
spark_settings = {
    'spark.cores.max': '3',
    'spark.executor.memory': '2g',
    'spark.executor.cores': '2',
    'spark.sql.catalogImplementation': 'hive',
    'spark.driver.memory': '1g',
    'spark.sql.extensions': 'io.delta.sql.DeltaSparkSessionExtension',
    'spark.sql.catalog.spark_catalog': 'org.apache.spark.sql.delta.catalog.DeltaCatalog',
}

builder = SparkSession.builder.master("local[2]")
for setting_name, setting_value in spark_settings.items():
    builder = builder.config(setting_name, setting_value)
builder = builder.enableHiveSupport()

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# One timestamp (and, below, one load id) is shared by every row of this load.
now = datetime.now()

# wholetext=True reads each ticket file as a single row in the "value" column.
raw_tickets_df = spark.read.text(
    "/Users/andreslaurito/repos/household-bills/household-bills/tickets",
    wholetext=True,
)

# Audit columns: load id, load timestamp and the originating file.
raw_tickets_df = raw_tickets_df.withColumn("__LoadID", lit(str(uuid.uuid4())))
raw_tickets_df = raw_tickets_df.withColumn("__DCR", lit(str(now)).cast("timestamp"))
raw_tickets_df = raw_tickets_df.withColumn("_file_name", input_file_name())

# Expose the result to Spark SQL under the name used by the queries below.
raw_tickets_df.createOrReplaceTempView("raw_tickets")


In [None]:
# Names searched (case-insensitively) inside the ticket text.
# Note: a bare "dia" matches a lot of unrelated text because of its meaning,
# which is presumably why the longer 'grupo dia' is listed instead.
supermarkets = [
    'mercadona',
    'aldi',
    'grupo dia',
    'primark',
    'prenatal',
    'zeeman',
    'condis',
    'carrefour',
    'consum',
    'miscota',
    'farmacia',
    'veritas',
    'lidl',
]


In [None]:
# Load every row of the raw_tickets temp view
all_tickets = spark.sql("select * from raw_tickets")

# Display the rows without truncating the (long) ticket text
all_tickets.show(truncate=False)

In [None]:
# Select the tickets whose text mentions none of the known supermarkets.
# Each name becomes a case-insensitive substring test; the tests are OR-ed
# together and the whole disjunction is negated.
supermarket_filters = " or ".join(
    f"value ilike '%{supermarket}%'" for supermarket in supermarkets
)

query = "SELECT * FROM raw_tickets"
if supermarket_filters:
    query += f" where not ({supermarket_filters})"
# With an empty supermarket list there is nothing to exclude, so the query
# stays unfiltered instead of degenerating into invalid SQL.

tickets_not_belonging_to_supermarkets = spark.sql(query)

In [None]:
tickets_not_belonging_to_supermarkets.show(truncate=False)

In [None]:
# Map each supermarket name to the tickets whose text contains that name
# (case-insensitive substring match).
df_per_supermarkets = {}
for supermarket in supermarkets:
    df_per_supermarkets[supermarket] = spark.sql(
        f"SELECT * FROM raw_tickets where value ilike '%{supermarket}%'"
    )

def total_spent(df):
    """Sum the amounts found on the 'Import' lines of every ticket in *df*.

    Args:
        df: DataFrame with a ``splitted`` column holding, for each ticket,
            the list of its text lines.

    Returns:
        The sum of the first ``<digits>,<digits>`` amount (decimal comma)
        found on each line containing ``'Import'``; ``0`` when there is none.
    """
    # Raw string: "\d" is not a valid escape sequence in a plain literal.
    # Compiled once instead of once per line.
    amount_pattern = re.compile(r"\d+,\d+")

    amounts = []
    # collect() brings every selected row to the driver.
    for row in df.select('splitted').collect():
        # Tolerate a null ``splitted`` value instead of failing on iteration.
        for line in row.splitted or ():
            if 'Import' not in line:
                continue
            found = amount_pattern.search(line)
            if found:
                # Only the first amount on the line is taken into account.
                amounts.append(float(found.group(0).replace(',', '.')))
    return sum(amounts)


# Per-supermarket summary: the matching tickets, how many there are, the
# tickets with their text split into lines, and the summed 'Import' amounts.
info_per_supermarket = {}
for supermarket, tickets_df in df_per_supermarkets.items():
    lines_df = tickets_df.withColumn('splitted', split(col('value'), '\r\n'))
    info_per_supermarket[supermarket] = {
        'df': tickets_df,
        'count': tickets_df.count(),
        'df_splitted': lines_df,
        'total_spent': total_spent(lines_df),
    }

In [None]:
info_per_supermarket

In [None]:
info_per_supermarket['mercadona']['df'].show(truncate=False)

In [None]:
# Rank the supermarkets by number of tickets, highest first.
ticket_counts = [
    (name, info['count']) for name, info in info_per_supermarket.items()
]
supermarkets_most_tickets = sorted(ticket_counts, key=lambda pair: pair[1], reverse=True)

# Rank the supermarkets by total amount spent, highest first.
amounts_spent = [
    (name, info['total_spent']) for name, info in info_per_supermarket.items()
]
supermarkets_most_spent = sorted(amounts_spent, key=lambda pair: pair[1], reverse=True)



In [None]:
supermarkets_most_tickets

In [None]:
supermarkets_most_spent

In [None]:
# `df_mercadona` is not assigned anywhere else in this notebook, so bind it to
# the Mercadona entry of the per-supermarket mapping before using it.
df_mercadona = df_per_supermarkets['mercadona']
df_mercadona.show(truncate=False)

In [None]:
import re
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, DateType

# Patterns used by extract_ticket_data, compiled once instead of per line.
_DATE_PATTERN = re.compile(r"\d+\/\d+\/\d+")
# A product line starts with one digit, one space and the product name.
# NOTE(review): quantities of two or more digits are not recognised, exactly
# as before — confirm whether that is intended.
_PRODUCT_PATTERN = re.compile(r"(\d) (\w+(?:\s\w+)*)")
_PRICE_PATTERN = re.compile(r"\d+,\d+")


def _price_line(ticket_lines, index):
    """Return the line at *index* if it exists and starts with a price, else ''."""
    if index < len(ticket_lines) and _PRICE_PATTERN.match(ticket_lines[index]):
        return ticket_lines[index]
    return ""


def extract_ticket_data(text):
    """Extract the product rows from the raw text of one ticket.

    The text is split on ``"\\r\\n"``. Every line that starts with a single
    digit, a space and a name produces one row; the two lines that follow it
    are taken as unit price and total price when they start with a
    ``<digits>,<digits>`` amount.

    Args:
        text: Full text of a ticket. ``None`` or an empty string gives no rows.

    Returns:
        A list of ``(product_quantity, product_name, unit_price, total_price,
        recipe_date)`` tuples of strings. Prices are the whole matching line,
        or ``""`` when missing. ``recipe_date`` is the most recent
        ``d/m/y``-looking text seen up to and including the product line, or
        ``""`` when none has been seen yet.
    """
    if not text:
        return []

    rows = []
    ticket_lines = text.split("\r\n")
    recipe_date = ""
    for index, line in enumerate(ticket_lines):
        # Remember the latest date seen; it is attached to the rows that follow.
        date_found = _DATE_PATTERN.search(line)
        if date_found:
            recipe_date = date_found[0]

        product_found = _PRODUCT_PATTERN.match(line)
        if not product_found:
            continue

        product_quantity, product_name = product_found.groups()
        # Unit and total price are expected on the next two lines.
        unit_price = _price_line(ticket_lines, index + 1)
        total_price = _price_line(ticket_lines, index + 2)
        rows.append((product_quantity, product_name, unit_price, total_price, recipe_date))

    return rows

# Schema of one extracted product row: five nullable string fields, in the
# same order as the tuples built by extract_ticket_data.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "product_quantity",
        "product_name",
        "unit_price",
        "total_price",
        "recipe_date",
    )
])

# Parse every Mercadona ticket on the driver and gather all product rows
# first, so that a single DataFrame is created instead of one DataFrame per
# ticket chained together with union().
# The tickets are taken from df_per_supermarkets because `df_mercadona` is not
# assigned anywhere in this notebook.
mercadona_rows = []
for row in df_per_supermarkets['mercadona'].collect():
    extracted_data = extract_ticket_data(row['value'])
    print(extracted_data)
    mercadona_rows.extend(extracted_data)

# The explicit schema lets createDataFrame handle an empty row list as well.
mercadona_products_df = spark.createDataFrame(mercadona_rows, schema)


mercadona_products_df.write.format("delta").mode("overwrite").saveAsTable("mercadona_products")

In [None]:
# Manual reset cell: removes the saved table. IF EXISTS keeps it from failing
# when the table is already gone. The cells that query mercadona_products
# need the table to be written again after this has run.
spark.sql("DROP TABLE IF EXISTS mercadona_products")

In [None]:
spark.sql("SELECT * FROM mercadona_products").show(300)

In [None]:
from pyspark.sql import functions as F

# Products whose name contains "banana" (case-insensitive).
bananas_df = spark.sql("SELECT * FROM mercadona_products where product_name ilike '%banana%'")

# Replace the textual date with a proper date column parsed as dd/MM/yyyy.
parsed_date = F.to_date("recipe_date", "dd/MM/yyyy")
bananas_with_dates = bananas_df.withColumn("recipe_date_asdate", parsed_date)
bananas_with_dates = bananas_with_dates.drop("recipe_date")

# Show the purchases in chronological order.
bananas_sorted = bananas_with_dates.sort("recipe_date_asdate")
bananas_sorted.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql import functions as F

# Strip everything except digits and the ',' / '.' separators from the price text.
bananas_with_dates_cleaned = bananas_with_dates.withColumn(
    "total_price_cleaned",
    F.regexp_replace("total_price", r"[^\d,\.]", ""),
)

# Turn the cleaned text into a double (decimal comma -> dot); blank strings
# become null. The column keeps its "_int" name even though it holds doubles.
bananas_with_dates_cleaned = bananas_with_dates_cleaned.withColumn(
    "total_price_int",
    F.when(
        F.trim(F.col("total_price_cleaned")) != "",
        F.regexp_replace(F.col("total_price_cleaned"), ",", ".").cast("double"),
    ).otherwise(None),
)

# Also keep the purchase date as a Unix timestamp column (not used by the plot).
bananas_with_dates = bananas_with_dates_cleaned.withColumn(
    "recipe_date_timestamp",
    F.unix_timestamp("recipe_date_asdate", "yyyy-MM-dd"),
)

bananas_with_dates.show()

# Convert to a pandas DataFrame for plotting.
pandas_df = bananas_with_dates.select("recipe_date_asdate", "total_price_int").toPandas()

# Coerce the price to a numeric dtype; invalid values become NaN.
pandas_df["total_price"] = pd.to_numeric(pandas_df["total_price_int"], errors='coerce')

print(pandas_df)

# Scatter plot of the coerced price against the purchase date.
plt.figure(figsize=(10, 6))
plt.scatter(pandas_df["recipe_date_asdate"], pandas_df["total_price"], color='blue', label='Total Price')
plt.title("Scatter Plot of Total Price vs Recipe Date")
# The x axis carries calendar dates, not Unix timestamps.
plt.xlabel("Recipe Date")
plt.ylabel("Total Price")
plt.grid(True)
plt.show()

In [None]:
spark.sql("SELECT * FROM mercadona_products where product_name ilike '%salmo%'").show(300)

In [None]:
# NOTE(review): the previous pattern '%ANARCARDS%' looks like a misspelling of
# "anacards" (cashews); the stem 'anacard' is used so singular and plural
# product names both match — confirm against the actual product names.
spark.sql("SELECT * FROM mercadona_products where product_name ilike '%anacard%'").show(300)