## Using RDD

In [None]:
from pyspark import SparkConf, SparkContext
# Local-mode Spark context for the RDD examples.
# getOrCreate() returns the already-running context when one exists, so this
# cell can be re-run in the same kernel without failing with
# "Cannot run multiple SparkContexts at once". Note that the conf is only
# applied when a new context is actually created.
sc = SparkContext.getOrCreate(conf=SparkConf().setMaster('local').setAppName('Customer'))

In [None]:
# NOTE(review): absolute, machine-specific location of the orders CSV; it must
# be adjusted before running this notebook on another machine.
orders_csv_uri = 'file:////Users/giovanna/Documents/GitHub/pyspark/SparkCourse/customer-orders.csv'
# One RDD element per line of the text file.
lines = sc.textFile(orders_csv_uri)

In [None]:
# Peek at the first two raw text lines to check how the file was read.
lines.take(2)

In [None]:
def parse(line):
    """Convert one comma-separated text line into a ``(key, amount)`` pair.

    The first column is returned unchanged as a string and the third column
    is converted to ``float``; the second column is not used.
    (Presumably the columns are id, id_prod and total, matching the schema
    used in the DataFrame section of this notebook.)

    Raises ``IndexError`` when the line has fewer than three columns and
    ``ValueError`` when the third column is not a valid float.
    """
    columns = line.split(',')
    key = columns[0]
    amount = float(columns[2])
    return key, amount
# Apply parse to every text line, producing an RDD of (key, float) pairs.
rdd = lines.map(parse)

In [None]:
# Peek at the first two parsed pairs.
rdd.take(2)

In [None]:
# Sum the amounts per key.
# reduceByKey calls the function with two *values* that share a key (here the
# floats produced by parse), never with (key, value) tuples, so the values are
# added directly. Indexing them with [1] would raise
# "TypeError: 'float' object is not subscriptable" as soon as an action runs.
total_spent = rdd.reduceByKey(lambda x, y: x + y)

In [None]:
# Peek at two of the per-key totals.
total_spent.take(2)

In [None]:
# Bring every (key, total) pair back to the driver and print one
# "<key> R$<total>" line per pair.
for key, amount in total_spent.collect():
    print(''.join([str(key), ' R$', str(amount)]))

In [None]:
# NOTE(review): repeats the earlier two-element preview of total_spent;
# this cell looks redundant and could probably be removed.
total_spent.take(2)

----

## Using DataFrames (DF)

In [2]:
from pyspark.sql import SparkSession
# Session used by the DataFrame examples; getOrCreate() reuses an existing
# session when one is already active in this kernel.
spark = (
    SparkSession.builder
    .appName('CustomerDF')
    .getOrCreate()
)

In [6]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
# Explicit schema for the orders CSV: three nullable columns.
# NOTE(review): FloatType is 32-bit single precision, which is where the
# trailing digits in un-rounded sums of `total` come from; DoubleType would
# avoid that noise. Confirm before changing, since results would shift.
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField('id_prod', IntegerType(), True),
    StructField('total', FloatType(), True),
])
# NOTE(review): absolute, machine-specific path; adjust for other machines.
sdf = (
    spark.read
    .schema(schema)
    .csv('file:////Users/giovanna/Documents/GitHub/pyspark/SparkCourse/customer-orders.csv')
)

In [7]:
# Print the first two rows to confirm the schema was applied as expected.
sdf.show(2)

+---+-------+-----+
| id|id_prod|total|
+---+-------+-----+
| 44|   8602|37.19|
| 35|   5368|65.89|
+---+-------+-----+
only showing top 2 rows



In [10]:
# Sum of `total` per `id`. With the plain .sum() shortcut Spark names the
# result column "sum(total)" and leaves the value un-rounded.
total_spent = (
    sdf
    .select('id', 'total')
    .groupby('id')
    .sum('total')
)
total_spent.show(2)

+---+-----------------+
| id|       sum(total)|
+---+-----------------+
| 31|4765.050008416176|
| 85| 5503.42998456955|
+---+-----------------+
only showing top 2 rows



In [15]:
import pyspark.sql.functions as F
# Aggregate expression: sum of `total`, rounded to 2 decimals, exposed under
# the readable column name `total_spent`.
rounded_total = F.round(F.sum('total'), 2).alias('total_spent')

# Per-id totals, largest first.
total_spent = (
    sdf.select('id', 'total')
    .groupby('id')
    .agg(rounded_total)
    .sort("total_spent", ascending=False)
)
total_spent.show(2)

+---+-----------+
| id|total_spent|
+---+-----------+
| 68|    6375.45|
| 73|     6206.2|
+---+-----------+
only showing top 2 rows



In [16]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()