# Experimental simple performance testing notebook for Spark
- testing and comparing simple dataframe / SQL operations for common data (pre-)processing tasks
- various available single-machine Python solutions are to be tested: Pandas, PySpark, Turi Create and Dask.
- execution times, CPU load and maximal memory use should be tracked


## Kiva dataset 
- [Kiva](https://www.kaggle.com/gaborfodor/additional-kiva-snapshot): crowdfunding data with lenders and loans, with additional geographic data
- download the related CSV files and move them to a folder where the kernel can read them


## init spark session

In [1]:
import findspark
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split
import timeit 

# Let findspark set up the environment for the local Spark installation.
# NOTE(review): findspark.init() is normally called before `import pyspark`;
# confirm that the import order in this cell is intentional.
findspark.init()

# Use a SparkSession instead of a bare SparkContext.
# The session config parameters below are necessary to make use of the
# available memory. CPU usage could also be limited, e.g. with
# .config('spark.default.parallelism', 5), but all CPUs are used by default.
session_settings = {
    "spark.core.connection.ack.wait.timeout": "12000",
    "spark.driver.maxResultSize": "4g",
    'spark.executor.memory': '4G',
    'spark.driver.memory': '5G',
}

session_builder = SparkSession.builder.appName('PySpark local test')
for setting_name, setting_value in session_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

## read files to dataframes: loans and lenders

In [2]:
# Start the wall-clock timer for the whole benchmark run.
full_start = timeit.default_timer()

lenders_csv = "../../kiva/lenders.csv"  # 130 MB file, 2.349.174 lines
loans_csv = "../../kiva/loans.csv"      # 2.1 GB file, 1.419.607 lines

# Read both files, treating the first line of each as the column header.
lenders_df = spark.read.csv(lenders_csv, header=True)
loans_df = spark.read.csv(loans_csv, header=True)

# Expose the dataframes to Spark SQL under the names used by later queries.
lenders_df.createOrReplaceTempView("lenders")
loans_df.createOrReplaceTempView("loans")

## read, transform and count loan_lenders 
Convert the delimited lender strings to rows: split each string into an array, then explode the array so that every lender gets its own row.

In [3]:
# Raw loan -> lenders mapping; the `lenders` column holds a delimited string.
lldf = spark.read.csv("../../kiva/loans_lenders.csv", header=True)

# Split on a comma followed by an optional space, then emit one row per
# array element under the column name `lender`.
lender_col = explode(split(lldf["lenders"], ', ?')).alias('lender')

# Keep each (loan_id, lender) pair only once.
loans_lenders_df = lldf.select(lldf["loan_id"], lender_col).distinct()

loans_lenders_df.createOrReplaceTempView("loans_lenders")

## join, filter and sort loan and lender data
get distinct joined lines with renamed columns, then write to an output file (for fully materialized results)
- filtering on lenders.country_code: 
  - 'US': 25% of lenders
  - 'CA': 3% of lenders --> 3.5 GB joined, 1.971.548 lines

In [4]:
# Join and filter with Spark SQL: the exploded loan/lender pairs are joined to
# the loans and lenders views, lender columns get a `lender_` prefix, only
# lenders with country_code 'CA' are kept, and the distinct rows are ordered
# by lender name and loan id.
joined_query = """
select distinct 
  le.permanent_name as lender_permanent_name, le.display_name as lender_display_name, 
  le.city as lender_city, le.state as lender_state, le.country_code as lender_country_code, 
  le.member_since as lender_member_since, le.occupation as lender_occupation, 
  le.loan_because as lender_loan_because, le.loan_purchase_num as lender_loan_purchase_num, 
  le.invited_by as lender_invited_by, le.num_invited as lender_num_invited, 
  lo.loan_id, lo.loan_name, lo.original_language, lo.description, lo.description_translated, 
  lo.funded_amount, lo.loan_amount, lo.status, lo.activity_name, lo.sector_name, 
  lo.loan_use, lo.country_code, lo.country_name, lo.town_name, lo.currency_policy, 
  lo.currency_exchange_coverage_rate, lo.currency, lo.partner_id, lo.posted_time, 
  lo.planned_expiration_time, lo.disburse_time, lo.raised_time, lo.lender_term, 
  lo.num_lenders_total, lo.num_journal_entries, lo.num_bulk_entries, lo.tags, 
  lo.borrower_genders, lo.borrower_pictured, lo.repayment_interval, lo.distribution_model
from   loans_lenders as ll
         inner join loans lo ON lo.loan_id = ll.loan_id
         inner join lenders le ON le.permanent_name = ll.lender
where  le.country_code = 'CA'
order by lender_permanent_name, loan_id
"""

joined_df = spark.sql(joined_query)

# Register the result so that later SQL cells can query it as `joined`.
joined_df.createOrReplaceTempView("joined")

## group and sort joined data

* group by on the exploded loans_lenders table (6 GB): count distinct loan_id by lender

In [5]:
# Remove the output of a previous run, if there is one, before writing the
# result again below.
import shutil

groupby_result_path = '../../kiva/pyspark-result-groupby.csv'
try:
    shutil.rmtree(groupby_result_path)
except FileNotFoundError:
    # No earlier output to clean up.
    pass

# Count the distinct loans per lender in the joined view.
groupby_query = """
select lender_permanent_name, count(distinct loan_id) as loan_ct
from   joined 
group by lender_permanent_name
-- order by count(distinct loan_id) desc
"""
lender_loan_count_df = spark.sql(groupby_query)
lender_loan_count_df.createOrReplaceTempView("lender_loan_count")

# Collapse to a single partition before writing the CSV output with a header.
single_partition_df = lender_loan_count_df.coalesce(1)
single_partition_df.write.csv(groupby_result_path, header=True)

# Report the wall-clock time elapsed since `full_start` was taken.
print('full ellapsed time: ', timeit.default_timer() - full_start)

full ellapsed time:  359.76395748999494
