In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row


# Reuse the active Spark session if one exists; otherwise start one named 'SparkSQL'.
spark = (
    SparkSession.builder
    .appName('SparkSQL')
    .getOrCreate()
)

def mapper(line):
    """Convert one comma-separated text record into a Row.

    The first four comma-separated fields are read positionally as
    ID (int), name (str), age (int) and numFriends (int); any further
    fields are ignored.

    Raises:
        ValueError: if the ID, age or numFriends field is not an integer.
        IndexError: if the line has fewer than four fields.
    """
    parts = line.split(',')
    # Convert in field order so a malformed record fails on its first bad field.
    person_id = int(parts[0])
    person_name = str(parts[1])
    person_age = int(parts[2])
    friend_count = int(parts[3])
    return Row(
        ID=person_id,
        name=person_name,
        age=person_age,
        numFriends=friend_count,
    )
    
# Load the raw text file as an RDD of lines.
lines = spark.sparkContext.textFile("../resources/sources/fakefriends.csv")

# The DataFrame cells in this notebook read this same file with header='true',
# so its first line is a column header that mapper() cannot parse (int() would
# raise ValueError). Keep only rows whose first field is an integer ID; this
# also drops blank lines and is a no-op on a headerless file.
people = (
    lines
    .filter(lambda line: line.split(',')[0].strip().isdigit())
    .map(mapper)
)

# Infer the schema from the Row objects and expose the data to Spark SQL.
schemaPeople = spark.createDataFrame(people).cache()
schemaPeople.createOrReplaceTempView('people')

# Ages 13 through 19 inclusive.
teenagers = spark.sql('SELECT * FROM people WHERE age >= 13 and age <= 19')
teenagers.show(5)

spark.stop()

+---+--------+---+----------+
| ID|    name|age|numFriends|
+---+--------+---+----------+
|  0|    Will| 33|       385|
|  1|Jean-Luc| 26|         2|
|  2|    Hugh| 55|       221|
|  3|  Deanna| 40|       465|
|  4|   Quark| 68|        21|
+---+--------+---+----------+
only showing top 5 rows



In [6]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start one named 'SparkSQL'.
spark = (
    SparkSession.builder
    .appName('SparkSQL')
    .getOrCreate()
)

# Read the CSV, taking column names from the first line and letting Spark
# infer each column's type from the data.
people = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv("../resources/sources/fakefriends.csv")
)

print('Here is our inferred schema:')
people.printSchema()

print('Lets display the name column:')
people.select('name').show()

# NOTE(review): the message says "over 21" but the predicate keeps age < 21,
# so people aged exactly 21 are dropped as well; confirm which is intended.
print('Filter out anyone over 21:')
people.filter(people['age'] < 21).show()

print('Group by age')
people.groupBy('age').count().show()

print('Make everyone 10 years older:')
people.select(people['name'], people['age'] + 10).show()

spark.stop()

Group by age
+---+-----+
|age|count|
+---+-----+
| 31|    8|
| 65|    5|
| 53|    7|
| 34|    6|
| 28|   10|
| 26|   17|
| 27|    8|
| 44|   12|
| 22|    7|
| 47|    9|
| 52|   11|
| 40|   17|
| 20|    5|
| 57|   12|
| 54|   13|
| 48|   10|
| 19|   11|
| 64|   12|
| 41|    9|
| 43|    7|
+---+-----+
only showing top 20 rows



In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import try_avg,round

# Reuse the active Spark session if one exists; otherwise start a new one.
spark = SparkSession.builder.appName('numberOfFriendsByAge').getOrCreate()

# Read the CSV, taking column names from the header line and inferring types.
people = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv("../resources/sources/fakefriends.csv")
)

people.createOrReplaceTempView('people')

# Keep only the two columns needed for the aggregation.
filteredPeople = spark.sql('select age,number_of_friends from people')

# Average number of friends per age, rounded to two decimals, ordered by age.
# `round` here is pyspark.sql.functions.round (imported above), not the builtin.
finalDataframe = (
    filteredPeople
    .groupBy('age')
    .agg(round(try_avg('number_of_friends'), 2).alias('avg_friends'))
    .sort('age')
)

# DataFrame.show() returns None, so it must not be part of the assignment
# above; otherwise finalDataframe would be None instead of the result.
finalDataframe.show()

spark.stop()

+---+-----------+
|age|avg_friends|
+---+-----------+
| 18|     343.38|
| 19|     213.27|
| 20|      165.0|
| 21|     350.88|
| 22|     206.43|
| 23|      246.3|
| 24|      233.8|
| 25|     197.45|
| 26|     242.06|
| 27|     228.13|
| 28|      209.1|
| 29|     215.92|
| 30|     235.82|
| 31|     267.25|
| 32|     207.91|
| 33|     325.33|
| 34|      245.5|
| 35|     211.63|
| 36|      246.6|
| 37|     249.33|
+---+-----------+
only showing top 20 rows



In [24]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as fun 
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,FloatType

# Reuse the active Spark session if one exists; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('minimumTemperatures')
    .getOrCreate()
)

# Explicit schema for the CSV read below; every column is nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('stationID', StringType()),
        ('date', IntegerType()),
        ('measureType', StringType()),
        ('temperature', FloatType()),
    )
])

df = spark.read.csv('../resources/sources/1800.csv', schema=schema)

# Keep only the minimum-temperature ('TMIN') readings.
minTemps = df.filter(df['measureType'] == 'TMIN')

# Only the station and its reading are needed from here on.
stationTemps = minTemps.select('stationID', 'temperature')

# Lowest reading per station; Spark names the result column 'min(temperature)'.
minTempsByStation = stationTemps.groupBy('stationID').min('temperature')

# Scale the raw minimum by 0.1 and round to two decimals.
# NOTE(review): presumably the raw values are tenths of a degree; confirm
# against the data source.
scaledMinimum = fun.round(fun.col('min(temperature)') * 0.1, 2)
minTempsByStationF = minTempsByStation.withColumn('temperature', scaledMinimum)

minTempsByStationF.show()

spark.stop()

+-----------+----------------+-----------+
|  stationID|min(temperature)|temperature|
+-----------+----------------+-----------+
|ITE00100554|          -148.0|      -14.8|
|EZE00100082|          -135.0|      -13.5|
+-----------+----------------+-----------+



In [53]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as fun 
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

# Reuse the active Spark session if one exists; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('aggregateByCustomer')
    .getOrCreate()
)

# Explicit schema for the headerless orders file; every column is nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("userID", IntegerType()),
        ("itemID", IntegerType()),
        ("amount", DoubleType()),
    )
])

orders_reader = spark.read.schema(schema).option("header", "false")
df = orders_reader.csv("../resources/sources/customer-orders.csv")

# Total spend per customer, rounded to two decimals, biggest spenders first.
total_spent = fun.round(fun.sum("amount"), 2).alias("totalAmount")
transformedDf = (
    df.groupBy("userID")
      .agg(total_spent)
      .orderBy('totalAmount', ascending=False)
)

transformedDf.show()

spark.stop()


+------+-----------+
|userID|totalAmount|
+------+-----------+
|    68|    6375.45|
|    73|     6206.2|
|    39|    6193.11|
|    54|    6065.39|
|    71|    5995.66|
|     2|    5994.59|
|    97|    5977.19|
|    46|    5963.11|
|    42|    5696.84|
|    59|    5642.89|
|    41|    5637.62|
|     0|    5524.95|
|     8|    5517.24|
|    85|    5503.43|
|    61|    5497.48|
|    32|    5496.05|
|    58|    5437.73|
|    63|    5415.15|
|    15|    5413.51|
|     6|    5397.88|
+------+-----------+
only showing top 20 rows



In [58]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as fun 
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

# Local-mode session; master "local[*]" lets Spark use all local cores.
# NOTE(review): the saved output of this cell is
# "ConnectionRefusedError: [Errno 111] Connection refused"; the cause is not
# visible from this cell, so re-run on a fresh kernel to confirm it reproduces.
spark = SparkSession.builder.master("local[*]").appName("popularMovieDf").getOrCreate()

# TODO: the data load below is still disabled.
# df = (spark.read
#            .text("../resources/sources/u.data")
#      )
#
# df.show()

spark.stop()

ConnectionRefusedError: [Errno 111] Connection refused