## Simple experiments using SQL and dataframes

In [23]:
import findspark
findspark.init('/home/rich/spark/spark-2.4.3-bin-hadoop2.7')
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
import pandas as pd

In [8]:
spark = SparkSession.builder.appName('SparkSQL').getOrCreate()

In [56]:
#look at the data with pandas
columns = ['index','name','age','numFriends']
data = pd.read_csv('./data/fakefriends.csv',header=None,names=columns)
data.head()

Unnamed: 0,index,name,age,numFriends
0,0,Will,33,385
1,1,Jean-Luc,26,2
2,2,Hugh,55,221
3,3,Deanna,40,465
4,4,Quark,68,21


In [70]:
#make a Row with the named columns
def mapper(line):
    fields = line.split(',')
    return Row(ID=int(fields[0]), name=str(fields[1].encode("utf-8")), age=int(fields[2]), numFriends=int(fields[3]))

In [71]:
#get RDD
lines = spark.sparkContext.textFile('./data/fakefriends.csv')
#make a 'dataframe'
people = lines.map(mapper)

In [74]:
#infer the schema and register the dataframe as a table
schemaPeople = spark.createDataFrame(people).cache()
#create temp sql table in memory caleed people
schemaPeople.createOrReplaceTempView('people')

In [78]:
teenagers = spark.sql("SELECT * FROM people WHERE age >=13 and age <= 19")
#result is RDD, can use normal RDD operations

for teen in teenagers.collect():
    print(teen)

Row(ID=21, age=19, name="b'Miles'", numFriends=268)
Row(ID=52, age=19, name="b'Beverly'", numFriends=269)
Row(ID=54, age=19, name="b'Brunt'", numFriends=5)
Row(ID=106, age=18, name="b'Beverly'", numFriends=499)
Row(ID=115, age=18, name="b'Dukat'", numFriends=397)
Row(ID=133, age=19, name="b'Quark'", numFriends=265)
Row(ID=136, age=19, name="b'Will'", numFriends=335)
Row(ID=225, age=19, name="b'Elim'", numFriends=106)
Row(ID=304, age=19, name="b'Will'", numFriends=404)
Row(ID=341, age=18, name="b'Data'", numFriends=326)
Row(ID=366, age=19, name="b'Keiko'", numFriends=119)
Row(ID=373, age=19, name="b'Quark'", numFriends=272)
Row(ID=377, age=18, name="b'Beverly'", numFriends=418)
Row(ID=404, age=18, name="b'Kasidy'", numFriends=24)
Row(ID=409, age=19, name="b'Nog'", numFriends=267)
Row(ID=439, age=18, name="b'Data'", numFriends=417)
Row(ID=444, age=18, name="b'Keiko'", numFriends=472)
Row(ID=492, age=19, name="b'Dukat'", numFriends=36)
Row(ID=494, age=18, name="b'Kasidy'", numFriends=194)

In [81]:
#use functions on dataframe instead of SQL queries 
schemaPeople.groupBy('age').count().orderBy('age').show()

+---+-----+
|age|count|
+---+-----+
| 18|    8|
| 19|   11|
| 20|    5|
| 21|    8|
| 22|    7|
| 23|   10|
| 24|    5|
| 25|   11|
| 26|   17|
| 27|    8|
| 28|   10|
| 29|   12|
| 30|   11|
| 31|    8|
| 32|   11|
| 33|   12|
| 34|    6|
| 35|    8|
| 36|   10|
| 37|    9|
+---+-----+
only showing top 20 rows

