<a href="https://colab.research.google.com/github/earo12/ETL_PepsiCo/blob/dev_barnch/ETL_PepsiCo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# First we need to import some necessary libraries
# Since the PDF doesn't mention what kind of source we're going to need, I'm going to create them with PySpark
# It can be done with other data tools such as Pandas, or a SQL database and even a NoSQL database such as Mongo
# However, I think that the standard way to manipulate data is with Spark so I'm going to use it in this project in particular
# I'm writing this code in Colab, but actually, it can be run in practically every enviroment that incorporates Spark
# If your environment doesn't have installed Spark, please make sure to run this line first:
!pip install pyspark



In [17]:
# Now we can import the necessary libraries:

from pyspark.sql import SparkSession, Row # Spark session; it ends when we close this notebook
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType # Field types required for the three tables needed in the challenge

import random # Libraries used to generate the data
from datetime import datetime, timedelta

In [4]:
# Build the Spark session (or reuse an already running one) and keep it as `sp`.
# The master is set to "local[*]" and the application is named "PepsiCoCase":

sp = (
    SparkSession.builder
    .master("local[*]")
    .appName("PepsiCoCase")
    .getOrCreate()
)


In [9]:
# Schema definitions for the three tables start here.
# The inventory schema has four fields: location, product, date and the inventory quantity.
# A Spark schema is a StructType wrapping a list of StructField objects (both imported above).
# Each StructField takes the field name, the data type and whether the field may be null;
# every field of this table is declared as non-nullable.
# NOTE(review): the generated rows name the quantity column `inventory_quantity`, and the
# DataFrames are created without passing this schema - confirm which name is intended.

inventory_table = StructType([
    StructField(field_name, field_type, nullable=False)
    for field_name, field_type in [
        ('location', StringType()),
        ('product', StringType()),
        ('date', DateType()),
        ('quantity_inventory', IntegerType()),
    ]
])

In [10]:
# Demand schema, built the same way as the inventory schema.
# It has five fields: location, product, date, the demand quantity and the snapshot.
# The snapshot is the date required for reporting.
# Every field is declared as non-nullable.
# NOTE(review): the generated rows use the names `demand_quantity` and `snapshot_date`, and the
# DataFrames are created without passing this schema - confirm which names are intended.

demand_table = StructType([
    StructField(field_name, field_type, nullable=False)
    for field_name, field_type in [
        ('location', StringType()),
        ('product', StringType()),
        ('date', DateType()),
        ('quantity_demand', IntegerType()),
        ('snapshot', DateType()),
    ]
])

In [11]:
# Replenishment schema, built the same way as the previous two.
# It has four fields: location, product, date and the replenishment quantity.
# Every field is declared as non-nullable.
# NOTE(review): the generated rows name the quantity column `replenishment_quantity`, and the
# DataFrames are created without passing this schema - confirm which name is intended.

replenishment_table = StructType([
    StructField(field_name, field_type, nullable=False)
    for field_name, field_type in [
        ('location', StringType()),
        ('product', StringType()),
        ('date', DateType()),
        ('quantity_replenishment', IntegerType()),
    ]
])

In [15]:
# The PDF does not say how the tables should be populated, so the data is generated synthetically.
# I assume three different locations and three different products;
# both lists can be extended with as many locations and products as needed:

locations = ["Location1", "Location2", "Location3"]
products = ["ProductA", "ProductB", "ProductC"]
today = datetime.today()  # reference point: every generated date is computed relative to it


# The quantities are drawn with the `random` module, so the generator is seeded here.
# With a fixed seed, a fresh top-to-bottom run draws the same sequence of quantities every time:

RANDOM_SEED = 42
random.seed(RANDOM_SEED)


# Lists that hold the generated rows of each table:

inventory_data = []
demand_data = []
replenishment_data = []

In [18]:
# The inventory table holds daily data from one year ago up to today.
# One row is generated per (day, location, product) combination.
# The list is reset first, so re-running this cell rebuilds the data instead of appending a second copy.

INVENTORY_HISTORY_DAYS = 365  # number of daily dates generated, counting back from today (today included)

inventory_data = []
for days_in_year in range(INVENTORY_HISTORY_DAYS):
  today_date = today - timedelta(days = days_in_year)
  date_text = today_date.strftime('%Y-%m-%d')  # formatted once per day instead of once per row
  for location in locations:
    for product in products:
      inventory_data.append(Row(location = location,
                                product = product,
                                date = date_text,
                                inventory_quantity = random.randint(0, 1000))) # randint includes both ends: 0 to 1000


In [23]:
# Demand table: one snapshot per day for the last 60 days (required for the dashboard).
# Each snapshot carries a 14-day projection that starts on the snapshot date itself (offsets 0 to 13).
# One row is generated per (snapshot, projected day, location, product) combination.
# The list is reset first, so re-running this cell rebuilds the data instead of appending a second copy.

SNAPSHOT_HISTORY_DAYS = 60  # number of snapshot dates, counting back from today (today included)
PROJECTION_DAYS = 14        # number of projected dates per snapshot

demand_data = []
for snap_date in range(SNAPSHOT_HISTORY_DAYS):
    snap_dt = today - timedelta(days=snap_date)
    snap_text = snap_dt.strftime("%Y-%m-%d")  # formatted once per snapshot instead of once per row
    for days_ahead in range(PROJECTION_DAYS):
        project_dt = snap_dt + timedelta(days = days_ahead)
        project_text = project_dt.strftime("%Y-%m-%d")  # formatted once per projected day
        for location in locations:
            for product in products:
                demand_data.append(Row(
                    location = location,
                    product = product,
                    date = project_text,
                    demand_quantity = random.randint(0, 1000),  # randint includes both ends: 0 to 1000
                    snapshot_date = snap_text  # date on which this projection was taken
                ))

In [29]:
# Replenishment table: daily data covering the last three years up to today.
# One row is generated per (day, location, product) combination.
# The list is reset first, so re-running this cell rebuilds the data instead of appending a second copy.

REPLENISHMENT_HISTORY_DAYS = 3 * 365  # three years counted as 365 days each, today included

replenishment_data = []
for days_three_years in range(REPLENISHMENT_HISTORY_DAYS):
    dt = today - timedelta(days = days_three_years)
    date_text = dt.strftime("%Y-%m-%d")  # formatted once per day instead of once per row
    for location in locations:
        for product in products:
            replenishment_data.append(Row(
                location = location,
                product = product,
                date = date_text,
                replenishment_quantity=random.randint(0, 1000)  # randint includes both ends: 0 to 1000
            ))


In [30]:
# Create the Spark DataFrames from the generated rows.
# The rows carry their dates as 'YYYY-MM-DD' strings, and Spark infers Python ints as bigint,
# so without a cast the date columns would be plain strings. The columns are therefore cast to
# the types the schemas declared earlier ask for (DateType and IntegerType).
# Column names are left exactly as they were generated.

def _cast_columns(df, date_columns, int_columns):
    """Return `df` with `date_columns` cast to date and `int_columns` cast to int.

    Columns that are not listed are left untouched; names and column order are preserved.
    """
    for column_name in date_columns:
        df = df.withColumn(column_name, df[column_name].cast("date"))
    for column_name in int_columns:
        df = df.withColumn(column_name, df[column_name].cast("int"))
    return df

inventory_df = _cast_columns(sp.createDataFrame(inventory_data),
                             date_columns=["date"],
                             int_columns=["inventory_quantity"])
demand_df = _cast_columns(sp.createDataFrame(demand_data),
                          date_columns=["date", "snapshot_date"],
                          int_columns=["demand_quantity"])
replenishment_df = _cast_columns(sp.createDataFrame(replenishment_data),
                                 date_columns=["date"],
                                 int_columns=["replenishment_quantity"])

In [33]:
# To validate that the data was generated as expected, we can use the show method
# IMPORTANT: remember that Spark evaluates lazily, and show() is an action that triggers the computation
# So please call this method only when necessary, to keep Spark as lazy as possible

# inventory_df.show()
# demand_df.show()
# replenishment_df.show()

+---------+--------+----------+----------------------+
| location| product|      date|replenishment_quantity|
+---------+--------+----------+----------------------+
|Location1|ProductA|2024-12-19|                   183|
|Location1|ProductB|2024-12-19|                    92|
|Location1|ProductC|2024-12-19|                   347|
|Location2|ProductA|2024-12-19|                   571|
|Location2|ProductB|2024-12-19|                   345|
|Location2|ProductC|2024-12-19|                   929|
|Location3|ProductA|2024-12-19|                   229|
|Location3|ProductB|2024-12-19|                   867|
|Location3|ProductC|2024-12-19|                   585|
|Location1|ProductA|2024-12-18|                     7|
|Location1|ProductB|2024-12-18|                   853|
|Location1|ProductC|2024-12-18|                   935|
|Location2|ProductA|2024-12-18|                   409|
|Location2|ProductB|2024-12-18|                   493|
|Location2|ProductC|2024-12-18|                   314|
|Location3