In [0]:
# Install necessary libraries

!pip3 install pyspark
!pip3 install findspark

!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"



In [0]:
import pandas as pd
import findspark
findspark.init("spark-2.4.4-bin-hadoop2.7")# SPARK_HOME

from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, udf 
from pyspark.sql.types import DateType

# Build the fixed sample set of customer transactions
def generate_transactions():
  """Return the hard-coded sample transactions as a pandas DataFrame.

  The data is deterministic: ten purchases, five for customer 1 and five for
  customer 2. Columns are ``customer_id``, ``amount`` and ``purchased_at``
  (the timestamp is kept as a plain string).
  """
  customer_ids = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2]
  amounts = [55, 125, 32, 64, 128, 333, 334, 333, 11, 44]
  purchase_times = [
      '2017-03-01 09:00:00',
      '2017-03-01 10:00:00',
      '2017-03-02 13:00:00',
      '2017-03-02 15:00:00',
      '2017-03-03 10:00:00',
      '2017-03-01 09:00:00',
      '2017-03-01 09:01:00',
      '2017-03-01 09:02:00',
      '2017-03-03 20:00:00',
      '2017-03-03 20:15:00',
  ]

  # Column order follows the insertion order of the dictionary keys.
  return pd.DataFrame({
      'customer_id': customer_ids,
      'amount': amounts,
      'purchased_at': purchase_times,
  })

# Convert a pandas DataFrame into a Spark DataFrame
def create_spark_data_frame(pandas_data_frame):
  """Create a Spark DataFrame from ``pandas_data_frame``, show it and return it.

  A local SparkSession (master ``local[*]``) is created, or reused if one
  already exists, and is used to build the Spark DataFrame.
  """
  session_builder = SparkSession.builder
  session_builder = session_builder.master("local[*]")
  session_builder = session_builder.appName("Basic JDBC pipeline")
  session = session_builder.getOrCreate()

  # Build the Spark DataFrame directly from the in-memory pandas frame.
  converted = session.createDataFrame(pandas_data_frame)
  converted.show()
  return converted

def create_formatted_spark_data_frame(spark_data_frame):
  """Add a ``date_string`` column holding ``purchased_at`` rendered as MM/dd/yyyy.

  The resulting DataFrame is shown and then returned.
  """
  purchase_day_text = date_format(col("purchased_at"), 'MM/dd/yyyy')
  with_date_string = spark_data_frame.withColumn("date_string", purchase_day_text)
  with_date_string.show()
  return with_date_string

def create_typed_spark_data_frame(formatted_spark_data_frame):
  """Add a ``date`` column of DateType parsed from ``date_string`` via a UDF.

  ``date_string`` is expected in ``MM/dd/yyyy`` form (Python ``%m/%d/%Y``).
  Rows whose ``date_string`` is NULL get a NULL ``date`` instead of making
  the UDF fail. The resulting DataFrame and its schema are printed, and the
  DataFrame is returned.
  """
  def parse_us_date(text_date):
    # Spark hands SQL NULL to Python UDFs as None; strptime(None, ...) would
    # raise TypeError and fail the whole job.
    if text_date is None:
      return None
    # DateType models a calendar date, so return a date rather than a datetime.
    return datetime.strptime(text_date, '%m/%d/%Y').date()

  string_to_date = udf(parse_us_date, DateType())

  typed_df = formatted_spark_data_frame.withColumn(
      "date", string_to_date(formatted_spark_data_frame.date_string))
  typed_df.show()
  typed_df.printSchema()
  return typed_df

def create_sums_spark_data_frame(typed_spark_data_frame):
  """Group by ``customer_id`` and ``date`` and sum the remaining numeric columns.

  The aggregated DataFrame is shown and then returned.
  """
  per_customer_day = typed_spark_data_frame.groupBy("customer_id", "date")
  # sum() with no arguments aggregates every numeric non-grouping column.
  totals = per_customer_day.sum()
  totals.show()
  return totals

def create_stats_spark_data_frame(sum_spark_data_frame):
  """Project the summed frame to ``customer_id``, ``date`` and ``amount``.

  The aggregate column ``sum(amount)`` is renamed to ``amount``. The schema
  and contents of the result are printed, and the result is returned.
  """
  selected_columns = [
      col('customer_id'),
      col('date'),
      col('sum(amount)').alias('amount'),
  ]
  stats_df = sum_spark_data_frame.select(*selected_columns)

  stats_df.printSchema()
  stats_df.show()
  return stats_df

def create_names_spark_data_frame():
  """Load ``names.csv`` into a Spark DataFrame with an explicit schema.

  The file is read with a header row and the columns ``id`` (integer),
  ``name`` (string) and ``currency`` (string). The schema and contents are
  printed, and the DataFrame is returned.
  """
  from pyspark.sql.types import IntegerType, StringType, StructField, StructType

  # No module-level `spark` variable is defined by the code above (the session
  # is local to create_spark_data_frame), so fetch the active session here;
  # getOrCreate() reuses an existing session instead of starting a second one.
  spark = SparkSession.builder.getOrCreate()

  names_schema = StructType([
      StructField("id", IntegerType()),
      StructField("name", StringType()),
      StructField("currency", StringType())])

  names_df = spark \
      .read \
      .format("csv") \
      .option("path", "names.csv") \
      .option("header", True) \
      .schema(names_schema) \
      .load()

  names_df.printSchema()
  names_df.show()
  return names_df

def create_joint_spark_data_frame(names_spark_data_frame, stats_spark_data_frame):
  """Join the stats frame with the names frame on ``customer_id == id``.

  The join is called on ``stats_spark_data_frame`` with the join's default
  join type. The schema and contents of the result are printed, and the
  joined DataFrame is returned.
  """
  # Use the parameter, not a global: the previous code referenced an undefined
  # name `stats_df` here and raised NameError.
  join_condition = stats_spark_data_frame.customer_id == names_spark_data_frame.id
  joint_df = stats_spark_data_frame.join(names_spark_data_frame, join_condition)
  joint_df.printSchema()
  joint_df.show()
  return joint_df

# Driver: run the pipeline stage by stage. Each helper prints its own result
# (via show()/printSchema()), so the print() calls below only emit headings.

# Stage 1: build the fixed sample transactions as a pandas DataFrame.
data_frame = generate_transactions()
print("Pandas Data Frame: \n", data_frame)

# Stage 2: convert the pandas DataFrame into a Spark DataFrame.
print("\nSpark Data Frame: \n")
spark_data_frame = create_spark_data_frame(data_frame)

# Stage 3: add `date_string`, i.e. purchased_at formatted as MM/dd/yyyy.
print("\nFormatted Spark Data Frame: \n")
formatted_spark_data_frame = create_formatted_spark_data_frame(spark_data_frame)

# Stage 4: add a DateType `date` column parsed from `date_string` by a Python UDF.
print("\nTyped Spark Data Frame: \n")
typed_spark_data_frame = create_typed_spark_data_frame(formatted_spark_data_frame)

# Stage 5: sum purchases per (customer_id, date).
print("\nSum Client Purchases: \n")
sum_spark_data_frame = create_sums_spark_data_frame(typed_spark_data_frame)

# Stage 6: keep customer_id/date and rename `sum(amount)` to `amount`.
print("\nStats Spark Data Frame: \n")
stats_spark_data_frame = create_stats_spark_data_frame(sum_spark_data_frame)

Pandas Data Frame: 
    customer_id  amount         purchased_at
0            1      55  2017-03-01 09:00:00
1            1     125  2017-03-01 10:00:00
2            1      32  2017-03-02 13:00:00
3            1      64  2017-03-02 15:00:00
4            1     128  2017-03-03 10:00:00
5            2     333  2017-03-01 09:00:00
6            2     334  2017-03-01 09:01:00
7            2     333  2017-03-01 09:02:00
8            2      11  2017-03-03 20:00:00
9            2      44  2017-03-03 20:15:00

Spark Data Frame: 

+-----------+------+-------------------+
|customer_id|amount|       purchased_at|
+-----------+------+-------------------+
|          1|    55|2017-03-01 09:00:00|
|          1|   125|2017-03-01 10:00:00|
|          1|    32|2017-03-02 13:00:00|
|          1|    64|2017-03-02 15:00:00|
|          1|   128|2017-03-03 10:00:00|
|          2|   333|2017-03-01 09:00:00|
|          2|   334|2017-03-01 09:01:00|
|          2|   333|2017-03-01 09:02:00|
|          2|    11|201