In [1]:
from pyspark.sql.types import *
from handyspark import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
import numpy as np

In [2]:
#####################
# Create SparkSession
#####################
# `sc` holds the SparkSession (despite the name, it is not a SparkContext) and
# `sql_sc` the legacy SQLContext used by the SQL cells further down.
sc, sql_sc = None, None
try:
    sc = (
        SparkSession.builder
        .appName("spark-flo")
        .config("spark.executor.memory", "30g")
        .config("spark.driver.memory", "30g")
        .config("spark.driver.allowMultipleContexts", "false")
        .enableHiveSupport()
        .getOrCreate()
    )

    # SQLContext takes a SparkContext as its first argument; pass the session
    # explicitly as the second so both objects share the same catalog.
    sql_sc = SQLContext(sc.sparkContext, sc)
except Exception:
    # Catch broadly: no dedicated "session error" class is imported in this
    # notebook. Re-raise after reporting so later cells never run with
    # `sc` / `sql_sc` still set to None.
    print("Spark Session Failed to initialize.")
    raise

# Create Spark DataFrames from pandas DataFrames

In [3]:
# Two small pandas frames with identical columns; 'Doraemon' is the only name
# present in both, which is what the join examples below match on.
_columns = ['name', 'score', 'kind']
pdf1 = pd.DataFrame(
    [
        ('Doraemon', 24, 'Cat'),
        ('Hachi', 68, 'Bee'),
        ('Shinchan', 21, 'Human'),
        ('Felix', 49, 'Cat'),
    ],
    columns=_columns,
)
pdf2 = pd.DataFrame(
    [
        ('Nobita', 60, 'Human'),
        ('Doraemon', 21, 'Cat'),
        ('Dorami', 45, 'Cat'),
        ('Ratatouile', 30, 'Mouse'),
    ],
    columns=_columns,
)

In [4]:
# Show the first pandas frame (a bare last expression is rendered by the notebook).
pdf1

Unnamed: 0,name,score,kind
0,Doraemon,24,Cat
1,Hachi,68,Bee
2,Shinchan,21,Human
3,Felix,49,Cat


In [5]:
# Show the second pandas frame (a bare last expression is rendered by the notebook).
pdf2

Unnamed: 0,name,score,kind
0,Nobita,60,Human
1,Doraemon,21,Cat
2,Dorami,45,Cat
3,Ratatouile,30,Mouse


In [6]:
# Convert each pandas frame into a Spark DataFrame through the session held in `sc`.
sdf1, sdf2 = (sc.createDataFrame(pandas_frame) for pandas_frame in (pdf1, pdf2))

In [7]:
# Print the contents of the first Spark DataFrame as a text table.
sdf1.show()

+--------+-----+-----+
|    name|score| kind|
+--------+-----+-----+
|Doraemon|   24|  Cat|
|   Hachi|   68|  Bee|
|Shinchan|   21|Human|
|   Felix|   49|  Cat|
+--------+-----+-----+



In [8]:
# Print the contents of the second Spark DataFrame as a text table.
sdf2.show()

+----------+-----+-----+
|      name|score| kind|
+----------+-----+-----+
|    Nobita|   60|Human|
|  Doraemon|   21|  Cat|
|    Dorami|   45|  Cat|
|Ratatouile|   30|Mouse|
+----------+-----+-----+



# Joins using SQL

In [9]:
# Make both Spark DataFrames queryable from SQL as 'table1' and 'table2'.
for _frame, _table_name in ((sdf1, 'table1'), (sdf2, 'table2')):
    _frame.createOrReplaceTempView(_table_name)

In [11]:
# Left join on name: every row of table1 is kept, and rows with no match in
# table2 come back with None in the table2 columns. Collected as Row objects.
left_join_sql = """    
select *
from table1
left join table2 on table1.name = table2.name
"""
sql_sc.sql(left_join_sql).collect()

[Row(name='Felix', score=49, kind='Cat', name=None, score=None, kind=None),
 Row(name='Shinchan', score=21, kind='Human', name=None, score=None, kind=None),
 Row(name='Doraemon', score=24, kind='Cat', name='Doraemon', score=21, kind='Cat'),
 Row(name='Hachi', score=68, kind='Bee', name=None, score=None, kind=None)]

In [13]:
# Same left join on name, printed as a text table instead of collected.
left_join_sql = """    
select *
from table1
left join table2 on table1.name = table2.name
"""
sql_sc.sql(left_join_sql).show()

+--------+-----+-----+--------+-----+----+
|    name|score| kind|    name|score|kind|
+--------+-----+-----+--------+-----+----+
|   Felix|   49|  Cat|    null| null|null|
|Shinchan|   21|Human|    null| null|null|
|Doraemon|   24|  Cat|Doraemon|   21| Cat|
|   Hachi|   68|  Bee|    null| null|null|
+--------+-----+-----+--------+-----+----+



# Joins using PySpark

In [14]:
# Aliased handles on the two Spark DataFrames, used by the join cells below.
t1, t2 = sdf1.alias('t1_alias'), sdf2.alias('t2_alias')

In [19]:
# Left join of the aliased frames on equal names, collected as Row objects.
names_match = t1.name == t2.name
t1.join(t2, on=names_match, how='left').collect()

[Row(name='Felix', score=49, kind='Cat', name=None, score=None, kind=None),
 Row(name='Shinchan', score=21, kind='Human', name=None, score=None, kind=None),
 Row(name='Doraemon', score=24, kind='Cat', name='Doraemon', score=21, kind='Cat'),
 Row(name='Hachi', score=68, kind='Bee', name=None, score=None, kind=None)]

In [18]:
# Left join of the aliased frames on equal names, printed as a text table.
names_match = t1.name == t2.name
t1.join(t2, on=names_match, how='left').show()

+--------+-----+-----+--------+-----+----+
|    name|score| kind|    name|score|kind|
+--------+-----+-----+--------+-----+----+
|   Felix|   49|  Cat|    null| null|null|
|Shinchan|   21|Human|    null| null|null|
|Doraemon|   24|  Cat|Doraemon|   21| Cat|
|   Hachi|   68|  Bee|    null| null|null|
+--------+-----+-----+--------+-----+----+



In [22]:
# The same left join written directly against the un-aliased DataFrames.
names_match = sdf1.name == sdf2.name
sdf1.join(sdf2, on=names_match, how='left').show()

+--------+-----+-----+--------+-----+----+
|    name|score| kind|    name|score|kind|
+--------+-----+-----+--------+-----+----+
|   Felix|   49|  Cat|    null| null|null|
|Shinchan|   21|Human|    null| null|null|
|Doraemon|   24|  Cat|Doraemon|   21| Cat|
|   Hachi|   68|  Bee|    null| null|null|
+--------+-----+-----+--------+-----+----+



# Group By, aggregation, ordering and filtering using SQL

### Count

In [21]:
# Number of rows in table1 for each kind.
count_by_kind_sql = """    
select kind, count(*) as count_kind
from table1
group by kind
"""
sql_sc.sql(count_by_kind_sql).show()

+-----+----------+
| kind|count_kind|
+-----+----------+
|  Cat|         2|
|  Bee|         1|
|Human|         1|
+-----+----------+



### Average

In [28]:
# Average score over all rows of table1.
score_avg_sql = """    
select avg(score) as score_avg
from table1
"""
sql_sc.sql(score_avg_sql).show()

+---------+
|score_avg|
+---------+
|     40.5|
+---------+



### Order by

In [34]:
# All rows of table1 sorted by score (ascending is the SQL default).
order_by_score_sql = """
select *
from table1
order by score
"""
sql_sc.sql(order_by_score_sql).show()

+--------+-----+-----+
|    name|score| kind|
+--------+-----+-----+
|Shinchan|   21|Human|
|Doraemon|   24|  Cat|
|   Felix|   49|  Cat|
|   Hachi|   68|  Bee|
+--------+-----+-----+



### Filter using where 

In [36]:
# Keep only the rows of table1 whose score is below 30.
low_score_sql = """
select *
from table1
where score < 30
"""
sql_sc.sql(low_score_sql).show()

+--------+-----+-----+
|    name|score| kind|
+--------+-----+-----+
|Doraemon|   24|  Cat|
|Shinchan|   21|Human|
+--------+-----+-----+



### Filter using like

In [39]:
# Case-insensitive prefix match: names are lower-cased before comparing to 'dora%'.
name_prefix_sql = """
select *
from table1
where lower(name) like 'dora%'
"""
sql_sc.sql(name_prefix_sql).show()

+--------+-----+----+
|    name|score|kind|
+--------+-----+----+
|Doraemon|   24| Cat|
+--------+-----+----+



# Group By, aggregation, ordering and filtering using PySpark

In [51]:
# Row count per kind using the grouped-data shortcut (result column is named 'count').
rows_by_kind = sdf1.groupBy('kind')
rows_by_kind.count().show()

+-----+-----+
| kind|count|
+-----+-----+
|  Cat|    2|
|  Bee|    1|
|Human|    1|
+-----+-----+



In [50]:
# Same count per kind, written as an explicit aggregate so the result column
# can be named 'count_kind'.
kind_counter = count('kind').alias('count_kind')
sdf1.groupBy('kind').agg(kind_counter).show()

+-----+----------+
| kind|count_kind|
+-----+----------+
|  Cat|         2|
|  Bee|         1|
|Human|         1|
+-----+----------+



### Average

In [33]:
# Average of the score column over the whole DataFrame, labelled 'score_avg'.
score_average = mean(col('score')).alias('score_avg')
sdf1.select(score_average).show()

+---------+
|score_avg|
+---------+
|     40.5|
+---------+



### Order by

In [35]:
# All rows sorted by score; the output below shows ascending order.
sdf1.orderBy('score').show()

+--------+-----+-----+
|    name|score| kind|
+--------+-----+-----+
|Shinchan|   21|Human|
|Doraemon|   24|  Cat|
|   Felix|   49|  Cat|
|   Hachi|   68|  Bee|
+--------+-----+-----+



### Filter using where

In [38]:
# Keep only the rows whose score is below 30.
is_low_score = sdf1.score < 30
sdf1.where(is_low_score).show()

+--------+-----+-----+
|    name|score| kind|
+--------+-----+-----+
|Doraemon|   24|  Cat|
|Shinchan|   21|Human|
+--------+-----+-----+



### Filter using like

In [40]:
# Case-insensitive prefix match: lower-case the name, then compare to 'dora%'.
lowered_name = lower(col('name'))
sdf1.where(lowered_name.like("dora%")).show()

+--------+-----+----+
|    name|score|kind|
+--------+-----+----+
|Doraemon|   24| Cat|
+--------+-----+----+

