In [1]:
# Two independent DataFrames built from the same three rows, columns "A" and "B".
df = spark.createDataFrame(
    data=[(1, 4), (2, 5), (3, 6)],
    schema=["A", "B"],
)
df2 = spark.createDataFrame(
    data=[(1, 4), (2, 5), (3, 6)],
    schema=["A", "B"],
)

In [2]:
# Render df as a table (columns A and B at this point).
display(df)

A,B
1,4
2,5
3,6


In [3]:
# Count the rows of df.
df.count()

In [4]:
# Add column C holding the row-wise sum of A and B.
df = df.withColumn('C', df['A'] + df['B'])

In [5]:
# Render df again, now including the derived column C.
display(df)

A,B,C
1,4,5
2,5,7
3,6,9


In [6]:
# Render only column A.
display(df.select('A'))

A
1
2
3


In [7]:
from pyspark.sql.functions import lit
# Show df with column C set to the literal 5; the result is not assigned back to df.
df.withColumn('C',lit(5)).show()

In [8]:
# Show every existing column plus three derived ones:
#   State  - whether A > 2
#   StateB - the literal 2
#   StateC - A + B
df.select(
    '*',
    (df['A'] > 2).alias("State"),
    lit(2).alias("StateB"),
    (df['A'] + df['B']).alias("StateC"),
).show()

In [9]:
from pyspark.sql import functions as F

In [10]:
# N is 20 when B > 5 and C > 9, else 10 when A > 2, else 1.
df.withColumn(
    'N',
    F.when((df['B'] > 5) & (df['C'] > 9), 20)
     .when(df['A'] > 2, 10)
     .otherwise(1),
).show()

In [11]:
# Rebind df to a small (names, age) DataFrame used for the group-by examples.
df = spark.createDataFrame(
    data=[('a', 33), ('b', 11), ('a', 22)],
    schema=['names', 'age'],
)

In [12]:
# Group the rows by the names column; aggregations are applied in later cells.
gb = df.groupBy('names')

In [13]:
# Render the average age per name.
display(gb.avg('age'))

names,avg(age)
b,11.0
a,27.5


In [14]:
# Render the number of rows per name.
display(gb.count())

names,count
b,1
a,2


In [15]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

In [16]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

In [17]:
def func(x):
  """Convert ``x`` to an integer and add 5.

  Args:
    x: A value accepted by ``int()`` (e.g. a numeric string or a number),
      or ``None``.

  Returns:
    ``int(x) + 5``, or ``None`` when ``x`` is ``None``.

  Raises:
    ValueError: If ``x`` is a string that ``int()`` cannot parse.
  """
  # This function is wrapped as a Spark UDF below; a null column value reaches
  # it as None, and int(None) would raise TypeError, so pass nulls through.
  if x is None:
    return None
  return int(x)+5

# Wrap func as a Spark UDF declared to return IntegerType.
strToInt = udf( func, IntegerType())

In [18]:
# Rebind df: column a holds numeric strings, column b holds integers.
df = spark.createDataFrame(
    data=[('1', 1), ('2', 2)],
    schema=['a', 'b'],
)

In [19]:
# Print the (a, b) DataFrame.
df.show()

In [20]:
# Show df with an extra column X computed by applying the strToInt UDF to column a.
df.withColumn('X', strToInt(df['a'])).show()

In [21]:
# Load the Titanic training CSV, taking column names from the header row and
# letting Spark infer the column types.
titanic_data = spark.read.csv(
    path='/FileStore/tables/titanic_train-ac800.csv',
    header=True,
    inferSchema=True,
)

In [22]:
def getStrLen(s):
  """Return the length of ``s``.

  Args:
    s: Any object supporting ``len()`` (a string in this notebook), or ``None``.

  Returns:
    ``len(s)``, or ``None`` when ``s`` is ``None``.
  """
  # This function is wrapped as a Spark UDF below; a null column value reaches
  # it as None, and len(None) would raise TypeError, so pass nulls through.
  if s is None:
    return None
  return len(s)

# Wrap getStrLen as a Spark UDF declared to return IntegerType.
strLenUDF = udf( getStrLen, IntegerType())

In [23]:
# Add a NameLen column via the row-at-a-time Python UDF, then count rows.
# NOTE(review): count() does not read NameLen, so Spark may skip evaluating the
# UDF entirely - confirm if this cell is meant as a timing comparison.
titanic_data.withColumn('NameLen',strLenUDF(titanic_data.Name)).count()

In [24]:
@pandas_udf('int', PandasUDFType.SCALAR)
def strLenPUDF(c):
  """Vectorised string length.

  Args:
    c: pandas Series of strings.

  Returns:
    pandas Series with the length of each element of ``c``.
  """
  lengths = c.str.len()
  return lengths

# Add column a via the pandas UDF, then count rows.
# NOTE(review): count() does not read column a, so Spark may skip evaluating the
# UDF entirely - confirm if this cell is meant as a timing comparison.
titanic_data.withColumn('a',strLenPUDF(titanic_data.Name)).count()

In [25]:
# Build ids 0..99, square each id with a Python UDF, and derive a grouping key
# (square modulo 3) used by the grouped pandas UDF cells below.
df = spark.range(100)
sqr = udf(lambda x:x*x, IntegerType())
# Apply the UDF. Previously `sqr` was defined but never called, so the 'sqr'
# column was just a copy of `id`.
df = df.withColumn('sqr',sqr(df.id))
df = df.withColumn('key',df.sqr%3)
df.show()

In [26]:
import numpy as np
import pandas as pd

# The mean is a float, so the output column is declared double; declaring it
# as int does not match the float values this function returns.
@pandas_udf("key int, sqr double", PandasUDFType.GROUPED_MAP)
def func(pdf):
  """Replace every ``sqr`` value in one key group with that group's mean.

  Args:
    pdf: pandas DataFrame holding the rows of a single group; must have a
      ``sqr`` column.

  Returns:
    A copy of ``pdf`` whose ``sqr`` column is the mean of the group's
    original ``sqr`` values, repeated on every row.
  """
  return pdf.assign(sqr=np.mean(pdf.sqr))

# Apply func to each key group and show the distinct result rows.
df.select('key', 'sqr').groupBy("key").apply(func).distinct().show()

In [27]:
import numpy as np

@pandas_udf("key int, sqr int", PandasUDFType.GROUPED_MAP)
def func(pdf):
  """Replace every ``sqr`` value in one key group with that group's sum.

  Args:
    pdf: pandas DataFrame holding the rows of a single group; must have a
      ``sqr`` column.

  Returns:
    A copy of ``pdf`` whose ``sqr`` column is the sum of the group's
    original ``sqr`` values, repeated on every row. ``pdf`` is not modified.
  """
  # The unused local `sqr` and the debugging print of the group were removed.
  return pdf.assign(sqr=np.sum(pdf.sqr))

# Apply func to each key group and show the distinct result rows.
df.select('key', 'sqr').groupBy("key").apply(func).distinct().show()

In [28]:
# Bare expression: applies the pandas UDF to the Name column without attaching
# the result to a DataFrame or triggering an action.
strLenPUDF(titanic_data.Name)

In [29]:
@pandas_udf('string', PandasUDFType.SCALAR)
def strLenPUDF(c):
  """Guess a gender label from each name based on the "Mr." title.

  Note: despite its name, this redefinition does not compute string lengths.

  Args:
    c: pandas Series of name strings; entries may be null.

  Returns:
    pandas Series with "May be Male" where the name contains "Mr.",
    "May be Female" otherwise, and None for non-string (null) entries.
  """
  def guess(n):
    # Nulls arrive as None/NaN; `"Mr." in n` would raise TypeError on them.
    if not isinstance(n, str):
      return None
    # Label typo fixed: "May b Male" -> "May be Male".
    return "May be Male" if "Mr." in n else "May be Female"

  return c.map(guess)

# Show the Titanic data with a FindGender column produced by the pandas UDF on Name.
titanic_data.withColumn('FindGender',strLenPUDF(titanic_data.Name)).show()

In [30]:
# RDD of three JSON documents, one per element; each has id, name, age and eyeColor.
stringJSONRDD = sc.parallelize((""" 
  { "id": "123",
    "name": "Katie",
    "age": 19,
    "eyeColor": "brown"
  }""",
   """{
    "id": "234",
    "name": "Michael",
    "age": 22,
    "eyeColor": "green"
  }""", 
  """{
    "id": "345",
    "name": "Simone",
    "age": 23,
    "eyeColor": "blue"
  }""")
)

In [31]:
# Parse the RDD of JSON strings into a DataFrame.
swimmer = spark.read.json(stringJSONRDD)

In [32]:
# Register swimmer as temp view "s" so it can be queried with spark.sql.
swimmer.createOrReplaceTempView("s")

In [33]:
# Show at most two rows from view s.
spark.sql("select * from s limit 2").show()

In [34]:
# Rebind swimmer to a DataFrame whose age column is the literal 10.
# NOTE(review): view "s" was registered before this reassignment and is not
# re-registered here - presumably it still reflects the original ages; confirm.
swimmer = swimmer.withColumn('age',lit(10))

In [35]:
# Print the reassigned swimmer DataFrame.
swimmer.show()

In [36]:
# Show the rows of view s whose age is greater than 20.
spark.sql("select * from s where age > 20").show()

In [37]:
# List the tables and views known to the session catalog.
spark.catalog.listTables()

In [38]:
from pyspark.sql.types import *

# Each tuple is (id, name, age, eyeColor). eyeColor must be a string because the
# schema applied to this RDD declares it as StringType; the last row previously
# held the integer 11 there, which fails schema verification when the DataFrame
# is evaluated. 'blue' matches the Simone record in the JSON example above.
stringCSVRDD = sc.parallelize([(123, 'Katie', 19, 'brown'), (234, 'Michael', 22, 'green'), (345, 'Simone', 22, 'blue')])

In [39]:
from pyspark.sql.types import StringType, StructField, LongType, StructType

In [40]:
# Space-separated column names.
schemaString = " ".join(("id", "name", "age", "eyeColor"))

In [41]:
# Explicit schema: id and age are longs, name and eyeColor are strings; every
# field is nullable.
schema = StructType(list(
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("id", LongType()),
        ("name", StringType()),
        ("age", LongType()),
        ("eyeColor", StringType()),
    )
))

In [42]:
# Build the swimmers DataFrame from the tuple RDD using the explicit schema.
swimmers = spark.createDataFrame(data=stringCSVRDD, schema=schema)

In [43]:
# Write swimmers as CSV under "a.csv". mode="overwrite" makes the cell
# re-runnable: with the default mode the write fails once the path exists.
swimmers.write.csv("a.csv", mode="overwrite")

In [44]:
from pyspark.sql.types import Row

In [45]:
# Create the departments: four Row objects, each with an id and a name field.
department1 = Row(id='123456', name='Computer Science')
department2 = Row(id='789012', name='Mechanical Engineering')
department3 = Row(id='345678', name='Theater and Drama')
department4 = Row(id='901234', name='Indoor Recreation')

In [46]:
# Row template with four named fields; calling it with positional values (next cell) builds an employee row.
Employee = Row("firstName", "lastName", "email", "salary")

In [47]:
# Employee rows. Some carry None for firstName or lastName; employee11 and
# employee31 repeat employee1 and employee3 with a different salary.
employee1 = Employee('michael', 'armbrust', 'no-reply@berkeley.edu', 100000)
employee11 = Employee('michael', 'armbrust', 'no-reply@berkeley.edu', 200000)
employee2 = Employee('xiangrui', 'meng', 'no-reply@stanford.edu', 120000)
employee3 = Employee('matei', None, 'no-reply@waterloo.edu', 140000)
employee31 = Employee('matei', None, 'no-reply@waterloo.edu', 180000)
employee4 = Employee(None, 'wendell', 'no-reply@berkeley.edu', 160000)

In [48]:
# Pair each department row with a list of employee rows (nested struct + array of structs).
departmentWithEmployees1 = Row(department=department1, employees=[employee1, employee2])
departmentWithEmployees2 = Row(department=department2, employees=[employee3, employee4, employee11])
departmentWithEmployees3 = Row(department=department3, employees=[employee1, employee4, employee31])
departmentWithEmployees4 = Row(department=department4, employees=[employee2, employee3])

In [49]:
# Rebind df to a DataFrame with one row per department, schema inferred from the Rows.
df = spark.createDataFrame(
    [
        departmentWithEmployees1,
        departmentWithEmployees2,
        departmentWithEmployees3,
        departmentWithEmployees4,
    ]
)

In [50]:
# Render the nested department/employees DataFrame.
display(df)

department,employees
"List(123456, Computer Science)","List(List(michael, armbrust, no-reply@berkeley.edu, 100000), List(xiangrui, meng, no-reply@stanford.edu, 120000))"
"List(789012, Mechanical Engineering)","List(List(matei, null, no-reply@waterloo.edu, 140000), List(null, wendell, no-reply@berkeley.edu, 160000), List(michael, armbrust, no-reply@berkeley.edu, 200000))"
"List(345678, Theater and Drama)","List(List(michael, armbrust, no-reply@berkeley.edu, 100000), List(null, wendell, no-reply@berkeley.edu, 160000), List(matei, null, no-reply@waterloo.edu, 180000))"
"List(901234, Indoor Recreation)","List(List(xiangrui, meng, no-reply@stanford.edu, 120000), List(matei, null, no-reply@waterloo.edu, 140000))"


In [51]:
from pyspark.sql.functions import explode
# Keep all columns and add e: one output row per element of the employees array.
df = df.select("*", explode(df["employees"]).alias("e"))

In [52]:
# Render the exploded DataFrame (one row per department/employee pair).
display(df)

department,employees,e
"List(123456, Computer Science)","List(List(michael, armbrust, no-reply@berkeley.edu, 100000), List(xiangrui, meng, no-reply@stanford.edu, 120000))","List(michael, armbrust, no-reply@berkeley.edu, 100000)"
"List(123456, Computer Science)","List(List(michael, armbrust, no-reply@berkeley.edu, 100000), List(xiangrui, meng, no-reply@stanford.edu, 120000))","List(xiangrui, meng, no-reply@stanford.edu, 120000)"
"List(789012, Mechanical Engineering)","List(List(matei, null, no-reply@waterloo.edu, 140000), List(null, wendell, no-reply@berkeley.edu, 160000), List(michael, armbrust, no-reply@berkeley.edu, 200000))","List(matei, null, no-reply@waterloo.edu, 140000)"
"List(789012, Mechanical Engineering)","List(List(matei, null, no-reply@waterloo.edu, 140000), List(null, wendell, no-reply@berkeley.edu, 160000), List(michael, armbrust, no-reply@berkeley.edu, 200000))","List(null, wendell, no-reply@berkeley.edu, 160000)"
"List(789012, Mechanical Engineering)","List(List(matei, null, no-reply@waterloo.edu, 140000), List(null, wendell, no-reply@berkeley.edu, 160000), List(michael, armbrust, no-reply@berkeley.edu, 200000))","List(michael, armbrust, no-reply@berkeley.edu, 200000)"
"List(345678, Theater and Drama)","List(List(michael, armbrust, no-reply@berkeley.edu, 100000), List(null, wendell, no-reply@berkeley.edu, 160000), List(matei, null, no-reply@waterloo.edu, 180000))","List(michael, armbrust, no-reply@berkeley.edu, 100000)"
"List(345678, Theater and Drama)","List(List(michael, armbrust, no-reply@berkeley.edu, 100000), List(null, wendell, no-reply@berkeley.edu, 160000), List(matei, null, no-reply@waterloo.edu, 180000))","List(null, wendell, no-reply@berkeley.edu, 160000)"
"List(345678, Theater and Drama)","List(List(michael, armbrust, no-reply@berkeley.edu, 100000), List(null, wendell, no-reply@berkeley.edu, 160000), List(matei, null, no-reply@waterloo.edu, 180000))","List(matei, null, no-reply@waterloo.edu, 180000)"
"List(901234, Indoor Recreation)","List(List(xiangrui, meng, no-reply@stanford.edu, 120000), List(matei, null, no-reply@waterloo.edu, 140000))","List(xiangrui, meng, no-reply@stanford.edu, 120000)"
"List(901234, Indoor Recreation)","List(List(xiangrui, meng, no-reply@stanford.edu, 120000), List(matei, null, no-reply@waterloo.edu, 140000))","List(matei, null, no-reply@waterloo.edu, 140000)"


In [53]:
# Flatten selected nested fields of the department and exploded employee structs.
df.selectExpr(
    "department.id",
    "department.name",
    "e.firstname",
    "e.email",
    "e.salary",
).show()

In [54]:
# RDD of three JSON-like documents with an id and an eyeColor array.
# NOTE(review): the first two documents do not look like well-formed JSON
# (unescaped nested quotes; a stray "" with no comma before the next key) -
# confirm whether they are deliberately malformed.
sjr = sc.parallelize((""" 
  { "id": "{'h':"234"}",
    "eyeColor": ["brown","yellow"]
  }""",
   """{
    "id": {'h':"234"}""
    "eyeColor": ["green","purple"]
  }""", 
  """{
    "id": {'g':"234"},
    "eyeColor": ["blue"]
  }""")
)

In [55]:
# Rebind df to the DataFrame parsed from the sjr RDD of JSON strings.
df = spark.read.json(sjr)

In [56]:
# Show all columns plus c: one row per element of the eyeColor array.
df.select("*", explode("eyeColor").alias("c")).show()

In [57]:
# Load mixed.json into a DataFrame with an inferred schema.
data = spark.read.json('/FileStore/tables/mixed.json')

In [58]:
# Show the zip field nested under the address struct.
data.selectExpr("address.zip").show()

In [59]:
# Render the full data DataFrame.
display(data)

a,address,id,orders,since
1.0,,101,"List(List(null, 100001))",2001
2.0,List(98016),102,,1999
3.1,,103,"List(List(B001, 300001), List(B005, 300002), List(A007, 300003))",foo


In [60]:
# Keep all columns and add o: one row per element of the orders array.
# DataFrame.show() prints and returns None, so it must not be chained into the
# assignment - doing so left data_ex bound to None instead of the DataFrame.
data_ex = data.select("*",explode("orders").alias("o"))
data_ex.show()

In [61]:
# Render data again; the explode in the previous cell did not reassign it.
display(data)

a,address,id,orders,since
1.0,,101,"List(List(null, 100001))",2001
2.0,List(98016),102,,1999
3.1,,103,"List(List(B001, 300001), List(B005, 300002), List(A007, 300003))",foo


In [62]:
# Show id next to each exploded element of orders (no alias is given to the exploded column).
data.select("id",explode("orders")).show()

In [63]:
# Read the movie JSON text file as an RDD of lines.
rdd = sc.textFile('/FileStore/tables/movie_json-7ce1e.txt')

In [64]:
# Drop empty lines, strip surrounding whitespace from the rest, and pull the
# result back to the driver.
(
    rdd
    .filter(lambda line: len(line) != 0)
    .map(lambda line: line.strip())
    .collect()
)

In [65]:
# Re-read the same file and collect every line, unfiltered.
sc.textFile('/FileStore/tables/movie_json-7ce1e.txt').collect()

In [66]:
# Rebind df to the DataFrame parsed from the movie JSON text file.
df = spark.read.json('/FileStore/tables/movie_json-7ce1e.txt')

In [68]:
# Print the movie DataFrame.
df.show()

In [69]:
# Read the same file again and print it directly, without binding the result to a name.
spark.read.json('/FileStore/tables/movie_json-7ce1e.txt').show()