<a href="https://colab.research.google.com/github/fmejias/CienciasDeLosDatosTEC/blob/master/BigData/Tareas/Tarea2/TP2_BigData_FelipeMejias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Big Data
# Trabajo práctico 2

- Professor: Luis Chavarría.

- Student:  
    - Felipe Alberto Mejías Loría, Instituto Tecnológico de Costa Rica. 

- December 05th, 2019

# **1-) Instalación de PySpark y Optimus**

In [0]:
# Install necessary libraries
!pip3 install pyspark
!pip install -q findspark
!pip install optimuspyspark

# Needed to install Spark in Google Colab
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz


# **2-) Actualizar variables de ambiente necesarias para correr Spark en Google Colab**

In [0]:
# Environment variables required to use Apache Spark in Google Colab:
# the Java 8 JDK location and the folder of the unpacked Spark distribution.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.4-bin-hadoop2.7",
})

# **3-) Importar bibliotecas necesarias para la ejecución de la TP2**

In [0]:
# Necessary imports for the execution of TP2
import pandas as pd
import numpy as np
import findspark
import json
from datetime import datetime
from pyspark.sql import SparkSession, Row, dataframe
from pyspark.sql.functions import col, date_format, udf, array
from pyspark.sql.types import DateType
from pyspark.sql.types import IntegerType, StringType, StructField, StructType
from optimus import Optimus
from urllib.error import HTTPError
from google.colab import files

# Set SPARK_HOME. Needed to initialize Apache Spark.
findspark.init("spark-2.4.4-bin-hadoop2.7")

In [0]:
# Locations of the purchase JSON files, one per caja. All of them live in the
# same folder of the course repository, so only the file name changes.
_TAREA2_RAW_URL = "https://raw.githubusercontent.com/fmejias/CienciasDeLosDatosTEC/master/BigData/Tareas/Tarea2/"
CAJA1_JSON_PATH = _TAREA2_RAW_URL + "compras_caja1.json"
CAJA2_JSON_PATH = _TAREA2_RAW_URL + "compras_caja2.json"
CAJA3_JSON_PATH = _TAREA2_RAW_URL + "compras_caja3.json"
CAJA4_JSON_PATH = _TAREA2_RAW_URL + "compras_caja4.json"
CAJA5_JSON_PATH = _TAREA2_RAW_URL + "compras_caja5.json"

def create_spark_session():
  """
  This function builds the Spark Session used by the notebook
  (an already existing session is reused instead of creating a new one)
  return the Spark Session, configured with a local master
  """
  session_builder = SparkSession.builder.master("local[*]")
  session_builder = session_builder.appName("Basic JDBC pipeline")
  return session_builder.getOrCreate()

def show_complete_spark_data_frame(spark_data_frame):
  """
  This function prints every row of a Spark Data Frame without truncating
  the values of its cells
  spark_data_frame: Spark DataFrame to display
  """
  # Ask for as many rows as the Data Frame has, so nothing is left out
  total_rows = spark_data_frame.count()
  spark_data_frame.show(total_rows, truncate=False)

def get_column_values_to_list(data_frame, column_name):
  """
  This function collects the values of one column into a Python list
  data_frame: Spark DataFrame
  column_name: Name of the column to read the values from
  return the list with the values of the column
  """
  single_column_df = data_frame.select(column_name)
  # Every row holds a single value, so flattening the rows yields the values
  flattened_values = single_column_df.rdd.flatMap(lambda row: row)
  return flattened_values.collect()

def print_formatted_dictionary(dictionary):
  """
  This function prints a Python Dictionary as indented JSON text
  dictionary: Python Dictionary (any JSON serializable object)
  """
  pretty_json = json.dumps(dictionary, indent=4)
  # The extra "\n" argument leaves a blank line after the printed JSON
  print(pretty_json, "\n")

def load_json_files_to_dict():
  """
  This function loads the five caja JSON Files into Python structures
  return list with one entry per caja (caja 1 to caja 5); each entry is the
  list of record dictionaries that Pandas builds from the JSON File
  """
  json_paths = (CAJA1_JSON_PATH, CAJA2_JSON_PATH, CAJA3_JSON_PATH,
                CAJA4_JSON_PATH, CAJA5_JSON_PATH)

  # Each JSON File is loaded in a Pandas DataFrame and then translated to records
  return [pd.read_json(json_path, orient='columns').to_dict('records')
          for json_path in json_paths]

def create_spark_data_frame_sales_per_caja_row(total_sold, caja_id):
  """
  This function creates the Spark Row with the total sold by a caja
  total_sold: total amount sold by the caja
  caja_id: caja identifier
  return the Row with the fields "caja" and "total_vendido"
  """
  return Row("caja", "total_vendido")(caja_id, total_sold)

def create_spark_data_frame_metric_row(metric_type, metric_value):
  """
  This function creates the Spark Row that describes one sales metric
  metric_type: name of the metric
  metric_value: value of the metric
  return the Row with the fields "tipo_metrica" and "valor"
  """
  return Row("tipo_metrica", "valor")(metric_type, metric_value)

def create_spark_data_frame_product_row(product_dict, caja_id):
  """
  This function creates the Spark Row of a product sold in a caja
  product_dict: Python Dictionary with the keys "nombre", "cantidad" and
                "precio_unitario" of the product
  caja_id: identifier of the caja where the product was sold
  return the Row with the fields "caja", "nombre", "cantidad" and
  "precio_unitario"
  """
  product_fields = ("nombre", "cantidad", "precio_unitario")
  row_factory = Row("caja", *product_fields)
  return row_factory(caja_id, *[product_dict[field] for field in product_fields])

def create_spark_data_frame_from_rows_list(rows_list, columns_name_list):
  """
  This function builds a Spark Data Frame out of a list of rows
  rows_list: List with all the rows of the new Data Frame
  columns_name_list: List with the names of the columns
  return the Spark Data Frame
  """
  return create_spark_session().createDataFrame(rows_list, columns_name_list)

def set_total_sold_per_caja_spark_data_frame_row(sales_dict):
  """
  This function adds up cantidad * precio_unitario of every product sold
  by a caja
  sales_dict: sales of a caja; each sale has "numero_caja" and "compras"
  return the Row with the caja and its total sold. The caja is taken from the
  last sale that has products, and it is 0 when no sale has products
  """
  caja_id = 0
  total_sold_amount = 0
  for sale in sales_dict:
    # Sales without products do not contribute to the total
    if "compras" not in sale or not sale["compras"]:
      continue
    caja_id = sale["numero_caja"]
    total_sold_amount += sum(int(product["cantidad"]) * int(product["precio_unitario"])
                             for product in sale["compras"])
  return create_spark_data_frame_sales_per_caja_row(total_sold_amount, caja_id)

def create_list_with_product_spark_data_frame_rows(sales_dict):
  """
  This function creates one Product Row per product sold by a caja
  sales_dict: sales of a caja; each sale has "numero_caja" and "compras"
  return the List with the Product Rows, in the order the products appear
  """
  # Sales without products are skipped
  return [create_spark_data_frame_product_row(product, sale["numero_caja"])
          for sale in sales_dict
          if "compras" in sale and sale["compras"]
          for product in sale["compras"]]

def create_spark_data_frame_from_dict(sales_dict):
  """
  This function creates the Product Spark Data Frame of a caja
  sales_dict: sales of a caja
  return the Spark Data Frame with the columns caja, nombre, cantidad and
  precio_unitario, where cantidad is cast to integer
  """
  column_names = ['caja', 'nombre', 'cantidad', 'precio_unitario']
  raw_products_df = create_spark_data_frame_from_rows_list(
      create_list_with_product_spark_data_frame_rows(sales_dict),
      column_names)

  # Replace the cantidad column with its integer version
  return raw_products_df.withColumn(
      "cantidad", raw_products_df["cantidad"].cast(IntegerType()))
  

def create_spark_data_frame_list_from_sales_list(sales_list):
  """
  This function creates one Product Spark Data Frame per caja
  sales_list: List with the sales of every caja
  return the List with the Product Spark Data Frames, in the same order
  """
  return [create_spark_data_frame_from_dict(sales_dict)
          for sales_dict in sales_list]

def create_spark_data_frame_from_total_sold_rows_list(sales_list):
  """
  This function creates the Spark Data Frame with the total sold per caja
  sales_list: List with the sales of every caja
  return the Spark Data Frame with the columns caja and total_vendido,
  one row per entry of sales_list
  """
  total_sold_rows = [set_total_sold_per_caja_spark_data_frame_row(sales_dict)
                     for sales_dict in sales_list]
  return create_spark_data_frame_from_rows_list(total_sold_rows,
                                                ['caja', 'total_vendido'])

def create_union_products_data_frame(spark_products_df_list):
  """
  This function calculates the union of products Data Frames
  spark_products_df_list: Spark Data Frames List (it is not modified)
  return the union of all the Data Frames, in list order; when the list has a
  single Data Frame that same Data Frame is returned
  raise ValueError when the list is empty
  """
  if not spark_products_df_list:
    raise ValueError("spark_products_df_list must contain at least one Data Frame")

  # Walk the list instead of popping from it, so the caller's list stays intact
  union_product_df = spark_products_df_list[0]
  for product_df in spark_products_df_list[1:]:
    union_product_df = union_product_df.union(product_df)
  return union_product_df


def set_total_products_data_frame(union_products_df):
  """
  This function calculates the units sold of every product
  union_products_df: Spark Data Frame with all products
  return the Data Frame with the columns nombre and cantidad_vendida,
  one row per product name
  """
  summed_by_name_df = union_products_df.groupBy("nombre").sum()
  # Keep only the summed quantity, under a friendlier column name
  return summed_by_name_df.select(
      col('nombre'),
      col('sum(cantidad)').alias('cantidad_vendida'))


def calculate_total_of_products(sales_list):
  """
  This function calculates the units sold per product over all cajas,
  shows the result and generates the total_productos CSV file
  sales_list: List with the sales of every caja
  return the Spark Data Frame with the total of products
  """
  per_caja_products_dfs = create_spark_data_frame_list_from_sales_list(sales_list)
  total_products_df = set_total_products_data_frame(
      create_union_products_data_frame(per_caja_products_dfs))

  # Show the Data Frame first and then generate the CSV File
  show_complete_spark_data_frame(total_products_df)
  return_output_csv_files(total_products_df, 'total_productos')

  return total_products_df

def calculate_sold_products_per_caja(sales_list):
  """
  This function calculates the total sold per caja, shows the result and
  generates the total_cajas CSV file
  sales_list: List with the sales of every caja
  return the Spark Data Frame with the total sold per caja
  """
  total_per_caja_df = create_spark_data_frame_from_total_sold_rows_list(sales_list)

  # Show the Data Frame first and then generate the CSV File
  show_complete_spark_data_frame(total_per_caja_df)
  return_output_csv_files(total_per_caja_df, 'total_cajas')

  return total_per_caja_df

def get_product_with_more_sales(total_products_df):
  """
  This function finds the product with the most units sold
  total_products_df: DataFrame with the columns nombre and cantidad_vendida
  return the metric Row "producto_mas_vendido_por_unidad" with the product name
  """
  # The first row after sorting by units sold (descending) is the best seller
  units_sold_desc = total_products_df["cantidad_vendida"].desc()
  best_seller_row = total_products_df.orderBy(units_sold_desc).head()

  return create_spark_data_frame_metric_row("producto_mas_vendido_por_unidad",
                                            str(best_seller_row.nombre))

def get_caja_with_more_sales(total_sold_products_df):
  """
  This function finds the caja with the highest total sold
  total_sold_products_df: DataFrame with the columns caja and total_vendido
  return the metric Row "caja_con_mas_ventas" with the caja as a string
  """
  # The first row after sorting by total sold (descending) is the top caja
  total_sold_desc = total_sold_products_df["total_vendido"].desc()
  top_caja_row = total_sold_products_df.orderBy(total_sold_desc).head()

  return create_spark_data_frame_metric_row("caja_con_mas_ventas",
                                            str(top_caja_row.caja))

def get_caja_with_less_sales(total_sold_products_df):
  """
  This function finds the caja with the lowest total sold
  total_sold_products_df: DataFrame with the columns caja and total_vendido
  return the metric Row "caja_con_menos_ventas" with the caja as a string
  """
  # The first row after sorting by total sold (ascending) is the bottom caja
  total_sold_asc = total_sold_products_df["total_vendido"].asc()
  bottom_caja_row = total_sold_products_df.orderBy(total_sold_asc).head()

  return create_spark_data_frame_metric_row("caja_con_menos_ventas",
                                            str(bottom_caja_row.caja))

def get_percentile(total_sold_products_df, percentile_number=25):
  """
  This function calculates a percentile of the total sold per caja
  total_sold_products_df: DataFrame with the column total_vendido
  percentile_number: Percentile to calculate (25 by default)
  return the metric Row "percentil_<percentile_number>_por_caja" whose value is
  the percentile truncated to an integer and converted to string
  """
  # Totals sold, sorted in ascending order
  ascending_totals_df = total_sold_products_df.orderBy(
      total_sold_products_df["total_vendido"].asc())
  totals_sold = np.array(get_column_values_to_list(ascending_totals_df,
                                                   'total_vendido'))

  # int() drops the decimal part of the percentile
  truncated_percentile = int(np.percentile(totals_sold, percentile_number))
  metric_name = "percentil_{percentile_number}_por_caja".format(
      percentile_number=percentile_number)
  return create_spark_data_frame_metric_row(metric_name,
                                            str(truncated_percentile))

def get_higher_income_product(sales_list):
  """
  This function finds the product that generated the highest income, where
  the income of a sold product is cantidad * precio_unitario
  sales_list: List with the sales of every caja
  return the metric Row "producto_de_mayor_ingreso" with the product name
  """
  union_products_df = create_union_products_data_frame(
      create_spark_data_frame_list_from_sales_list(sales_list))

  # Income of every sold product, keeping only the name and the income
  sorted_products_df = union_products_df.orderBy("nombre")
  income_per_row = sorted_products_df['cantidad'] * sorted_products_df['precio_unitario']
  income_df = (sorted_products_df
               .withColumn('CantidadxPrecio', income_per_row)
               .select("nombre", "CantidadxPrecio"))

  # Total income per product name
  income_by_product_df = income_df.groupBy("nombre").sum().select(
      col('nombre'),
      col('sum(CantidadxPrecio)').alias('ingresos_generados'))

  # The first row after sorting by income (descending) is the top product
  ranked_products_df = income_by_product_df.orderBy(
      income_by_product_df.ingresos_generados.desc())
  top_income_product = ranked_products_df.head().nombre
  return create_spark_data_frame_metric_row("producto_de_mayor_ingreso",
                                            str(top_income_product))

  

def calculate_sales_metrics(sales_dicts_list, total_products_df,
                            total_sold_products_df):
  """
  This function calculates the sales metrics, shows them and generates the
  metricas CSV file
  sales_dicts_list: List with the sales of every caja
  total_products_df: DataFrame with total products
  total_sold_products_df: DataFrame with total sold products
  """
  # Caja metrics
  metric_rows = [get_caja_with_more_sales(total_sold_products_df),
                 get_caja_with_less_sales(total_sold_products_df)]

  # Percentiles of the total sold per caja
  metric_rows.extend(get_percentile(total_sold_products_df,
                                    percentile_number=percentile)
                     for percentile in (25, 50, 75))

  # Product metrics
  metric_rows.append(get_product_with_more_sales(total_products_df))
  metric_rows.append(get_higher_income_product(sales_dicts_list))

  metrics_df = create_spark_data_frame_from_rows_list(metric_rows,
                                                      ['tipo_metrica', 'valor'])

  # Show the Data Frame first and then generate the CSV File
  show_complete_spark_data_frame(metrics_df)
  return_output_csv_files(metrics_df, 'metricas')

def return_output_csv_files(spark_data_frame, filename):
  """
  This function writes a Spark Data Frame to <filename>.csv (without the
  index column) and asks Google Colab to download the file
  spark_data_frame: Spark Data Frame to export
  filename: name of the CSV file, without the extension
  """
  from google.colab import files
  csv_name = '{filename}.csv'.format(filename=filename)
  spark_data_frame.select("*").toPandas().to_csv(csv_name, index=False)
  files.download(csv_name)


# **5-) Funciones principales del programa y función main() para ejecutar el programa principal**

In [13]:
def main():
  """
  This function runs the complete report of the Supermarket sales: total of
  products, total sold per caja and the sales metrics
  """
  # Sales of every caja, loaded from the JSON Files
  sales_by_caja = load_json_files_to_dict()

  # Units sold per product over all cajas
  print("\nEl total de productos vendidos en todas las cajas es: \n")
  units_per_product_df = calculate_total_of_products(sales_by_caja)

  # Total sold per caja
  print("\nEl total de productos vendidos por caja es: \n")
  total_per_caja_df = calculate_sold_products_per_caja(sales_by_caja)

  # Sales metrics
  print("\nLas estadísticas de las ventas son las siguientes: \n")
  calculate_sales_metrics(sales_by_caja, units_per_product_df,
                          total_per_caja_df)

# Execute main program
main()


El total de productos vendidos en todas las cajas es: 

+-------------------+----------------+
|nombre             |cantidad_vendida|
+-------------------+----------------+
|Helado Cero Grados |15              |
|FrescoLeche        |7               |
|Pepino             |7               |
|durazno            |16              |
|Chocoleta          |18              |
|brocoli            |10              |
|Cebolla            |8               |
|Leche              |13              |
|Cremoleta          |18              |
|Pollo adobado      |5               |
|Azucar             |3               |
|Salchichon         |3               |
|Prestobarba        |4               |
|Pringles           |10              |
|Papel higienico    |2               |
|Chocolate          |12              |
|Mantequilla de Maní|4               |
|Mejitos            |10              |
|fresas             |37              |
|Pan Bimbo          |4               |
|papas              |42              |
|Cepill

# **6-) Pruebas Unitarias con Pytest**

**6.1) Instalar Pytest en Google Colab**

In [0]:
!pip install ipytest
!pip install pytest

**6.2) Importar Pytest y los comandos llamados magics para lograr correr Pytest en Google Colab**

In [0]:
import ipytest.magics
import pytest
import sys
from pytest import fixture,mark

# This is needed in order to fix the __file__ issue that Google Colab throws
__file__ = sys.argv[0]

**6.3) Datos utilitarios para las pruebas unitarias**

In [0]:
import pandas as pd

# Create Spark Session to convert Pandas Dataframe to Spark Dataframe
spark = create_spark_session()

# Expected Products per Sale
expected_products_per_sale = [
    {'caja': "1",
     'producto': {"nombre": "Mejitos", "cantidad": "1", "precio_unitario": "100"}},
    {'caja': "2",
     'producto': {"nombre": "Chocoleta", "cantidad": "2", "precio_unitario": "150"}}
]

# Expected Products Sales Per Caja
expected_products_sales_per_caja = [
{"total_productos": 5,
 "compras": [
{
    "numero_caja": "1",
    "compras": [
                {
                    "nombre": "manzana",
                    "cantidad": "3",
                    "precio_unitario": "22"
                },
                {
                    "nombre": "brocoli",
                    "cantidad": "2",
                    "precio_unitario": "33"
                }
    ]
},
{
    "numero_caja": "1",
    "compras": [
                {
                    "nombre": "Chocoleta",
                    "cantidad": "3",
                    "precio_unitario": "22"
                },
                {
                    "nombre": "Cremoleta",
                    "cantidad": "2",
                    "precio_unitario": "33"
                },
                {
                    "nombre": "Chocolate",
                    "cantidad": "4",
                    "precio_unitario": "3300"
                }
    ]
}]},
{"total_productos": 5,
 "compras": [
 {
    "numero_caja": "2",
    "compras": [
                {
                    "nombre": "manzana",
                    "cantidad": "3",
                    "precio_unitario": "22"
                },
                {
                    "nombre": "brocoli",
                    "cantidad": "2",
                    "precio_unitario": "33"
                }
    ]
},
{
    "numero_caja": "2",
    "compras": [
                {
                    "nombre": "manzana",
                    "cantidad": "3",
                    "precio_unitario": "22"
                },
                {
                    "nombre": "brocoli",
                    "cantidad": "2",
                    "precio_unitario": "33"
                },
                {
                    "nombre": "pavo",
                    "cantidad": "3",
                    "precio_unitario": "3300"
                }
    ]
}
]}]

# Expected Products DataFrame
expected_products_df_dict = {
    'caja': ["1", "1", "1", "1", "1"],
    'nombre': ["manzana", "brocoli",
               "Chocoleta", "Cremoleta",
               "Chocolate"],
    'cantidad': ["3", "2", "3", "2", "4"],
    'precio_unitario': ["22", "33", "22", "33", "3300"]
}

# Expected Final Total Products DataFrame
expected_total_products_df_dict = {
    'nombre': ["manzana", "brocoli",
               "Chocoleta", "Cremoleta",
               "Chocolate"],
    'cantidad_vendida': ["3", "2", "3", "2", "4"]
}

# Expected Final Total Sold Products DataFrame
expected_total_sold_products_df_dict = {
    'caja': ['1'],
    'total_vendido': ["13464"]
}

# Expected Total Sold Products
expected_total_sold_products = [
    {'caja': 1,
     'total_vendido': 15000},
    {'caja': 2,
     'total_vendido': 17000}
]

# Expected Sales Metrics
expected_sales_metrics = [
    {'tipo_metrica': 'caja_con_mas_ventas',
     'valor': '1'},
    {'tipo_metrica': 'caja_con_menos_ventas',
     'valor': '5'},
    {'tipo_metrica': 'percentil_25_por_caja',
     'valor': '1500'},
    {'tipo_metrica': 'percentil_50_por_caja',
     'valor': '2500'},
    {'tipo_metrica': 'percentil_75_por_caja',
     'valor': '3500'},
    {'tipo_metrica': 'producto_mas_vendido_por_unidad',
     'valor': 'pavo'},
    {'tipo_metrica': 'producto_de_mayor_ingreso',
     'valor': 'apio'}
]

def convert_from_dict_to_spark(expected_dict):
  """
  This function converts a dictionary of columns into a Spark Data Frame,
  going through a Pandas DataFrame and the module-level spark session
  expected_dict: Dictionary that maps each column name to its list of values
  return the Spark Data Frame
  """
  return spark.createDataFrame(pd.DataFrame.from_dict(expected_dict))

**6.4) Pruebas unitarias para obtener el total de productos**

In [17]:
# This command is needed to run the UTs in Google Colab
%%run_pytest[clean] -s

@fixture(scope="module")
def total_products_functionality_fixture():
    """
    Publish, as module globals, the expected Spark DataFrames used by the
    total of products tests.
    """
    global expected_products_spark_df, expected_total_products_spark_df

    # Expected product rows
    expected_products_spark_df = convert_from_dict_to_spark(expected_products_df_dict)
    assert expected_products_spark_df is not None, 'Error when created the expected DataFrame'

    # Expected units sold per product
    expected_total_products_spark_df = convert_from_dict_to_spark(expected_total_products_df_dict)
    assert expected_total_products_spark_df is not None, 'Error when created the expected DataFrame'

def test_create_succesful_spark_session():
    """A Spark Session must be returned."""
    spark_session = create_spark_session()
    assert spark_session is not None

@mark.parametrize('product_per_sale', expected_products_per_sale)
def test_create_spark_data_frame_product_row(product_per_sale):
    """The Product Row must carry the caja and every field of the product."""
    expected_product = product_per_sale["producto"]
    expected_caja = product_per_sale["caja"]

    product_spark_row = create_spark_data_frame_product_row(expected_product,
                                                            expected_caja)

    assert product_spark_row is not None
    assert isinstance(product_spark_row, Row)
    assert list(product_spark_row.asDict()) == ["caja", "nombre", "cantidad", "precio_unitario"]
    assert product_spark_row["caja"] == expected_caja
    for field in ("nombre", "cantidad", "precio_unitario"):
        assert product_spark_row[field] == expected_product[field]

@mark.parametrize('sale_per_caja', expected_products_sales_per_caja)
def test_create_list_with_product_spark_data_frame_rows(sale_per_caja):
  """One Product Row must be created per product sold by the caja."""
  expected_amount_of_rows = sale_per_caja["total_productos"]
  products_rows_list = create_list_with_product_spark_data_frame_rows(sale_per_caja["compras"])
  assert expected_amount_of_rows == len(products_rows_list)

def test_create_spark_data_frame_from_rows_list(total_products_functionality_fixture):
  """The Data Frame built from the Product Rows must match the expected one."""
  column_names = ['caja', 'nombre', 'cantidad', 'precio_unitario']
  caja_sales = expected_products_sales_per_caja[0]["compras"]
  products_df = create_spark_data_frame_from_rows_list(
      create_list_with_product_spark_data_frame_rows(caja_sales),
      column_names)

  # No row of the result may be missing from the expected DataFrame
  unexpected_rows_df = products_df.exceptAll(expected_products_spark_df)
  assert unexpected_rows_df.count() == 0

def test_create_spark_data_frame_from_dict(total_products_functionality_fixture):
  """The Product Data Frame of a caja must match the expected one."""
  caja_sales = expected_products_sales_per_caja[0]["compras"]
  products_df = create_spark_data_frame_from_dict(caja_sales)

  # No row of the result may be missing from the expected DataFrame
  unexpected_rows_df = products_df.exceptAll(expected_products_spark_df)
  assert unexpected_rows_df.count() == 0


def test_create_spark_data_frame_list_from_sales_list(total_products_functionality_fixture):
  """A single caja must produce a single Product Data Frame with the expected rows."""
  single_caja_sales = [list(expected_products_sales_per_caja[0]["compras"])]
  products_df_list = create_spark_data_frame_list_from_sales_list(single_caja_sales)
  assert 1 == len(products_df_list)

  # No row of the only Data Frame may be missing from the expected DataFrame
  unexpected_rows_df = products_df_list[0].exceptAll(expected_products_spark_df)
  assert unexpected_rows_df.count() == 0

def test_calculate_total_of_products(total_products_functionality_fixture):
  """The total of products of a single caja must match the expected totals."""
  single_caja_sales = [list(expected_products_sales_per_caja[0]["compras"])]
  total_products_df = calculate_total_of_products(single_caja_sales)

  # No row of the result may be missing from the expected DataFrame
  unexpected_rows_df = total_products_df.exceptAll(expected_total_products_spark_df)
  assert unexpected_rows_df.count() == 0


platform linux -- Python 3.6.8, pytest-3.6.4, py-1.8.0, pluggy-0.7.1
rootdir: /usr/local, inifile: setup.cfg
collected 9 items

../usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py ........+---------+----------------+
|nombre   |cantidad_vendida|
+---------+----------------+
|Chocoleta|3               |
|brocoli  |2               |
|Cremoleta|2               |
|Chocolate|4               |
|manzana  |3               |
+---------+----------------+

.



**6.5) Pruebas unitarias para obtener el total de las cajas**



In [18]:
# This command is needed to run the UTs in Google Colab
%%run_pytest[clean] -s

@fixture(scope="module")
def total_sold_products_functionality_fixture():
    """
    Publish, as a module global, the expected Spark DataFrame used by the
    total sold per caja tests.
    """
    global expected_total_sold_products_spark_df

    expected_total_sold_products_spark_df = convert_from_dict_to_spark(
        expected_total_sold_products_df_dict)
    assert expected_total_sold_products_spark_df is not None, 'Error when created the expected DataFrame'

@mark.parametrize('total_sold_per_caja', expected_total_sold_products)
def test_create_spark_data_frame_sales_per_caja_row(total_sold_per_caja):
    """The Row must carry the caja and the total sold it was created with."""
    expected_caja = total_sold_per_caja["caja"]
    expected_total_sold = total_sold_per_caja["total_vendido"]

    total_sold_spark_row = create_spark_data_frame_sales_per_caja_row(expected_total_sold,
                                                                      expected_caja)

    assert total_sold_spark_row is not None
    assert isinstance(total_sold_spark_row, Row)
    assert list(total_sold_spark_row.asDict()) == ["caja", "total_vendido"]
    assert total_sold_spark_row["caja"] == expected_caja
    assert total_sold_spark_row["total_vendido"] == expected_total_sold

def test_set_total_sold_per_caja_spark_data_frame_row():
  """
  The Row of a caja must carry its identifier and the sum of
  cantidad * precio_unitario over all of its products.
  """
  sales_to_test = expected_products_sales_per_caja[0]["compras"]
  total_sold_spark_row = set_total_sold_per_caja_spark_data_frame_row(sales_to_test)
  expected_caja = 1
  # 3*22 + 2*33 + 3*22 + 2*33 + 4*3300
  expected_total_sold = 13464

  assert total_sold_spark_row is not None
  assert isinstance(total_sold_spark_row, Row)
  assert ["caja", "total_vendido"] == list(total_sold_spark_row.asDict())
  assert str(expected_caja) == total_sold_spark_row["caja"]
  # The computed amount is the whole point of the function under test
  assert expected_total_sold == total_sold_spark_row["total_vendido"]

def test_create_spark_data_frame_from_total_sold_rows_list(total_sold_products_functionality_fixture):
  """The total sold Data Frame of a single caja must match the expected one."""
  single_caja_sales = [list(expected_products_sales_per_caja[0]["compras"])]
  total_sold_df = create_spark_data_frame_from_total_sold_rows_list(single_caja_sales)

  # No row of the result may be missing from the expected DataFrame
  unexpected_rows_df = total_sold_df.exceptAll(expected_total_sold_products_spark_df)
  assert unexpected_rows_df.count() == 0

def test_calculate_sold_products_per_caja(total_sold_products_functionality_fixture):
  """The total sold per caja of a single caja must match the expected one."""
  single_caja_sales = [list(expected_products_sales_per_caja[0]["compras"])]
  total_sold_df = calculate_sold_products_per_caja(single_caja_sales)

  # No row of the result may be missing from the expected DataFrame
  unexpected_rows_df = total_sold_df.exceptAll(expected_total_sold_products_spark_df)
  assert unexpected_rows_df.count() == 0


platform linux -- Python 3.6.8, pytest-3.6.4, py-1.8.0, pluggy-0.7.1
rootdir: /usr/local, inifile: setup.cfg
collected 5 items

../usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py ....+----+-------------+
|caja|total_vendido|
+----+-------------+
|1   |13464        |
+----+-------------+

.



**6.6) Pruebas unitarias para obtener las métricas**

In [19]:
# This command is needed to run the UTs in Google Colab
%%run_pytest[clean] -s

@fixture(scope="module")
def metrics_functionality_fixture():
    """
    Publish, as module globals, the Spark DataFrames that the metrics tests
    use as input.
    """
    global total_products_spark_df, total_sold_products_spark_df

    # Units sold per product
    total_products_spark_df = convert_from_dict_to_spark(expected_total_products_df_dict)
    assert total_products_spark_df is not None, 'Error when created the expected DataFrame'

    # Total sold per caja
    total_sold_products_spark_df = convert_from_dict_to_spark(expected_total_sold_products_df_dict)
    assert total_sold_products_spark_df is not None, 'Error when created the expected DataFrame'

@mark.parametrize('sales_metric', expected_sales_metrics)
def test_create_spark_data_frame_metric_row(sales_metric):
    """The metric Row must carry the metric type and value it was created with."""
    expected_type = sales_metric["tipo_metrica"]
    expected_value = sales_metric["valor"]

    sales_metric_spark_row = create_spark_data_frame_metric_row(expected_type,
                                                                expected_value)

    assert sales_metric_spark_row is not None
    assert isinstance(sales_metric_spark_row, Row)
    assert list(sales_metric_spark_row.asDict()) == ["tipo_metrica", "valor"]
    assert sales_metric_spark_row["tipo_metrica"] == expected_type
    assert sales_metric_spark_row["valor"] == expected_value

def test_get_caja_with_more_sales(metrics_functionality_fixture):
  """With a single caja, that caja is the one with more sales."""
  metric_row = get_caja_with_more_sales(total_sold_products_spark_df)

  assert metric_row is not None
  assert isinstance(metric_row, Row)
  assert list(metric_row.asDict()) == ["tipo_metrica", "valor"]
  assert metric_row["tipo_metrica"] == "caja_con_mas_ventas"
  assert metric_row["valor"] == "1"

def test_get_caja_with_less_sales(metrics_functionality_fixture):
  """With a single caja, that caja is also the one with less sales."""
  metric_row = get_caja_with_less_sales(total_sold_products_spark_df)

  assert metric_row is not None
  assert isinstance(metric_row, Row)
  assert list(metric_row.asDict()) == ["tipo_metrica", "valor"]
  assert metric_row["tipo_metrica"] == "caja_con_menos_ventas"
  assert metric_row["valor"] == "1"

def test_get_product_with_more_sales(metrics_functionality_fixture):
  """The product with the highest cantidad_vendida must be reported."""
  metric_row = get_product_with_more_sales(total_products_spark_df)

  assert metric_row is not None
  assert isinstance(metric_row, Row)
  assert list(metric_row.asDict()) == ["tipo_metrica", "valor"]
  assert metric_row["tipo_metrica"] == "producto_mas_vendido_por_unidad"
  assert metric_row["valor"] == "Chocolate"

def test_get_higher_income_product():
  """
  The product with the highest cantidad * precio_unitario must be reported.
  In the sales under test Chocolate generates 4 * 3300, far above the rest.
  """
  sales_list_to_test = [list(expected_products_sales_per_caja[0]["compras"])]
  higher_income_product_metric_row = get_higher_income_product(sales_list_to_test)
  assert higher_income_product_metric_row is not None
  assert isinstance(higher_income_product_metric_row, Row)
  assert ["tipo_metrica", "valor"] == list(higher_income_product_metric_row.asDict())
  assert "producto_de_mayor_ingreso" == higher_income_product_metric_row["tipo_metrica"]
  # Without this check the test passes no matter which product is selected
  assert "Chocolate" == higher_income_product_metric_row["valor"]

platform linux -- Python 3.6.8, pytest-3.6.4, py-1.8.0, pluggy-0.7.1
rootdir: /usr/local, inifile: setup.cfg
collected 11 items

../usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py ...........

