In [1]:
import findspark
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructField,StructType,IntegerType,StringType,TimestampType,ArrayType,FloatType,DoubleType
from pyspark.sql.functions import to_date,col
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark import SparkConf
import os
import psycopg2
import pandas as pd

In [2]:
findspark.init()
# findspark.add_packages('file:///C:/Program Files (x86)/MySQL/Connector J 8.0/mysql-connector-java-8.0.23.jar')

In [3]:
try: spark.stop()
except: pass

In [4]:
spark=SparkSession.builder.appName("SparkDF").config(
    "spark.driver.extraClassPath", "mysql-connector-java-8.0.23.jar"
).enableHiveSupport().getOrCreate()
sc=spark.sparkContext

In [5]:
sc.getConf().getAll()

[('spark.driver.host', '169.254.181.150'),
 ('spark.sql.catalogImplementation', 'hive'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.extraClassPath', 'mysql-connector-java-8.0.23.jar'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.driver.port', '11834'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.app.name', 'SparkDF'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1632120412187'),
 ('spark.ui.showConsoleProgress', 'true')]

In [6]:
sc.getConf().getAll()

[('spark.driver.host', '169.254.181.150'),
 ('spark.sql.catalogImplementation', 'hive'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.extraClassPath', 'mysql-connector-java-8.0.23.jar'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.driver.port', '11834'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.app.name', 'SparkDF'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1632120412187'),
 ('spark.ui.showConsoleProgress', 'true')]

# Creating DF (From RDDs)

In [7]:
rdd1=sc.textFile("data/ml-100k/u.item")
rdd1.take(3)

['1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0',
 '2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0',
 '3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0']

In [8]:
# df1.coalesce(1).write.save("item.parquet",format="parquet")

In [9]:
rdd1_split=rdd1.map(lambda x: x.split("|"))

In [10]:
rdd1_data=rdd1_split.map(lambda x: Row(id=int(x[0]), title=x[1],date=x[2],link=x[4],rating=[int(i) for i in x[5:]]))
print(rdd1_data.take(3))

[Row(date='01-Jan-1995', id=1, link='http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', rating=[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], title='Toy Story (1995)'), Row(date='01-Jan-1995', id=2, link='http://us.imdb.com/M/title-exact?GoldenEye%20(1995)', rating=[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], title='GoldenEye (1995)'), Row(date='01-Jan-1995', id=3, link='http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)', rating=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], title='Four Rooms (1995)')]


In [61]:
# way-1
df1=spark.createDataFrame(rdd1_data)
print(df1.printSchema())
print(df1.show(15))

root
 |-- date: string (nullable = true)
 |-- id: long (nullable = true)
 |-- link: string (nullable = true)
 |-- rating: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- title: string (nullable = true)

None
+-----------+---+--------------------+--------------------+--------------------+
|       date| id|                link|              rating|               title|
+-----------+---+--------------------+--------------------+--------------------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|    Toy Story (1995)|
|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|    GoldenEye (1995)|
|01-Jan-1995|  3|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|   Four Rooms (1995)|
|01-Jan-1995|  4|http://us.imdb.co...|[0, 1, 0, 0, 0, 1...|   Get Shorty (1995)|
|01-Jan-1995|  5|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|      Copycat (1995)|
|01-Jan-1995|  6|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|Shanghai Triad (Y...|
|01-Jan-1995|  7|http://us.imdb.co..

In [12]:
# way-2
schema2=StructType([
    StructField("id",IntegerType(),True),
    StructField("title",StringType(),True),
    StructField("date",StringType(),True),
    StructField("rating",ArrayType(elementType=IntegerType()),True),
])
df2=spark.createDataFrame(rdd1_data,schema2) \
    .withColumn("date",to_date(col("date"),"dd-MMM-yyyy"))
df2.show(5)

+---+-----------------+----------+--------------------+
| id|            title|      date|              rating|
+---+-----------------+----------+--------------------+
|  1| Toy Story (1995)|1995-01-01|[0, 0, 0, 1, 1, 1...|
|  2| GoldenEye (1995)|1995-01-01|[0, 1, 1, 0, 0, 0...|
|  3|Four Rooms (1995)|1995-01-01|[0, 0, 0, 0, 0, 0...|
|  4|Get Shorty (1995)|1995-01-01|[0, 1, 0, 0, 0, 1...|
|  5|   Copycat (1995)|1995-01-01|[0, 0, 0, 0, 0, 0...|
+---+-----------------+----------+--------------------+
only showing top 5 rows



In [13]:
# way-3
def mapping(x):
    return (int(x[0]), (x[1][0:-7],x[1][-5:-1]),x[2],[int(i) for i in x[5:]])

rdd1_data2=rdd1_split.map(mapping)
print(rdd1_data2.take(3))

schema2=StructType([
    StructField("id",IntegerType(),True),
    StructField("title_year", StructType([
                StructField("title",StringType(), True),
                StructField("year",StringType(), True),
            ])),
    StructField("date",StringType(),True),
    StructField("rating",ArrayType(elementType=IntegerType()),True),
    
])
df3=spark.createDataFrame(rdd1_data2,schema2) \
    .withColumn("date",to_date(col("date"),"dd-MMM-yyyy"))
df3.show()

[(1, ('Toy Story', '1995'), '01-Jan-1995', [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), (2, ('GoldenEye', '1995'), '01-Jan-1995', [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]), (3, ('Four Rooms', '1995'), '01-Jan-1995', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])]
+---+--------------------+----------+--------------------+
| id|          title_year|      date|              rating|
+---+--------------------+----------+--------------------+
|  1|   [Toy Story, 1995]|1995-01-01|[0, 0, 0, 1, 1, 1...|
|  2|   [GoldenEye, 1995]|1995-01-01|[0, 1, 1, 0, 0, 0...|
|  3|  [Four Rooms, 1995]|1995-01-01|[0, 0, 0, 0, 0, 0...|
|  4|  [Get Shorty, 1995]|1995-01-01|[0, 1, 0, 0, 0, 1...|
|  5|     [Copycat, 1995]|1995-01-01|[0, 0, 0, 0, 0, 0...|
|  6|[Shanghai Triad (...|1995-01-01|[0, 0, 0, 0, 0, 0...|
|  7|[Twelve Monkeys, ...|1995-01-01|[0, 0, 0, 0, 0, 0...|
|  8|        [Babe, 1995]|1995-01-01|[0, 0, 0, 0, 1, 1...|
|  9|[Dead Man Walking...|1995-01-01|[0, 0,

In [14]:
# way-4
#  use list or tuple
df4 = spark.createDataFrame(
    [
        (1, None, "Brown"), 
        (None, "Tom", None), 
        (None, "Jerry", "Fox"), 
        [None, "Jerry", "Fox"], # list
        (3, "Joshua", "Peterson"), # tuple
        (3, "Joshua", "Peterson"),
        [None, None, None] 
    ], 
    ('id', 'firstName', 'lastName')
)
print(df4.show())
print(df4.printSchema())

+----+---------+--------+
|  id|firstName|lastName|
+----+---------+--------+
|   1|     null|   Brown|
|null|      Tom|    null|
|null|    Jerry|     Fox|
|null|    Jerry|     Fox|
|   3|   Joshua|Peterson|
|   3|   Joshua|Peterson|
|null|     null|    null|
+----+---------+--------+

None
root
 |-- id: long (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)

None


In [15]:
pdf = pd.DataFrame({
        'x1': ['a','a','b','b', 'c', 'd'],
        'x2': ['apple', 'orange', 'orange','orange', 'peach', 'peach'],
        'x3': [1, 1, 2, 2, 2, 4],
        'x4': [2.4, 2.5, 3.5, 1.4, 2.1,1.5],
        'y1': [1, 0, 1, 0, 0, 1],
        'y2': ['yes', 'no', 'no', 'yes', 'yes', 'yes']
    })
df14 = spark.createDataFrame(pdf)
df14.show()

+---+------+---+---+---+---+
| x1|    x2| x3| x4| y1| y2|
+---+------+---+---+---+---+
|  a| apple|  1|2.4|  1|yes|
|  a|orange|  1|2.5|  0| no|
|  b|orange|  2|3.5|  1| no|
|  b|orange|  2|1.4|  0|yes|
|  c| peach|  2|2.1|  0|yes|
|  d| peach|  4|1.5|  1|yes|
+---+------+---+---+---+---+



# DF to RDD

In [16]:
df14.rdd.collect()

[Row(x1='a', x2='apple', x3=1, x4=2.4, y1=1, y2='yes'),
 Row(x1='a', x2='orange', x3=1, x4=2.5, y1=0, y2='no'),
 Row(x1='b', x2='orange', x3=2, x4=3.5, y1=1, y2='no'),
 Row(x1='b', x2='orange', x3=2, x4=1.4, y1=0, y2='yes'),
 Row(x1='c', x2='peach', x3=2, x4=2.1, y1=0, y2='yes'),
 Row(x1='d', x2='peach', x3=4, x4=1.5, y1=1, y2='yes')]

In [94]:
df14.rdd.filter(lambda x: x["x2"]=="orange").collect()

[Row(x1='a', x2='orange', x3=1, x4=2.5, y1=0, y2='no'),
 Row(x1='b', x2='orange', x3=2, x4=3.5, y1=1, y2='no'),
 Row(x1='b', x2='orange', x3=2, x4=1.4, y1=0, y2='yes')]

In [95]:
df14.rdd.filter(lambda x: x[1]=="orange").collect()

[Row(x1='a', x2='orange', x3=1, x4=2.5, y1=0, y2='no'),
 Row(x1='b', x2='orange', x3=2, x4=3.5, y1=1, y2='no'),
 Row(x1='b', x2='orange', x3=2, x4=1.4, y1=0, y2='yes')]

# Creating DF (From Spark Data Sources)
- path = path to file
- comment = specify char or int, and lines starting from those will be skipped from DF
- sep = define how row data is separated
- nullValue = define those values as NULL
- inferSchema = define data types and auto detect blank values as null
- header = define 1st row as header

by default an elem under quotes like "dms" is treated as " " "dms" " " 
- escape='' sets it to " "dms" "
- escape=' " ' sets it to "dms"
- escape=" ' " sets it to " " "dms" " " 

In [74]:
df_csv1=spark.read.csv("data/withnull1.csv",sep=",",nullValue="NONE",inferSchema=True)
df_csv2=spark.read.csv("data/withnull2.csv",comment="N",escape="'",sep=",",nullValue="NONE",header=True,inferSchema=True,encoding="UTF-8")
df_csv3=spark.read.csv("data/withnull1.csv",sep=",",nullValue="NONE")
df_csv4=spark.read.csv("data/withnull2.csv",comment=1,sep=",",nullValue="NONE",header=True,encoding="UTF-8")
print(df_csv1.show(3))
df_csv1.printSchema()
df_csv2.show(3)
df_csv2.printSchema()
df_csv3.show(3)
df_csv3.printSchema()
df_csv4.show(3)
df_csv4.printSchema()

+----+----+-----+
| _c0| _c1|  _c2|
+----+----+-----+
|   1|    |Brown|
|null|null| null|
|null|null| null|
+----+----+-----+
only showing top 3 rows

None
root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)

+---+---------+--------+
| ID|    Fname|   Lname|
+---+---------+--------+
|  1|"""dms"""|   Brown|
|  3|   Joshua|Peterson|
|  3|   Joshua|Peterson|
+---+---------+--------+

root
 |-- ID: integer (nullable = true)
 |-- Fname: string (nullable = true)
 |-- Lname: string (nullable = true)

+---+-----+-----+
|_c0|  _c1|  _c2|
+---+-----+-----+
|  1|     |Brown|
|   |  Tom|     |
|   |Jerry|  Fox|
+---+-----+-----+
only showing top 3 rows

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)

+----+-----+-----+
|  ID|Fname|Lname|
+----+-----+-----+
|null|  Tom| null|
|null|Jerry|  Fox|
|null|Jerry|  Fox|
+----+-----+-----+
only showing top 3 rows

root
 |-- ID: string 

In [75]:
spark.read.text("data/textFiles/sample1.txt").show(3)

+--------------------+
|               value|
+--------------------+
|Utilitatis causa ...|
|Lorem ipsum dolor...|
|                    |
+--------------------+
only showing top 3 rows



In [76]:
# from database
# restart jupyter if it doesn't work
user='root';pw='24081999'
table_name='users'
# postgresql://[user[:password]@][netloc][:port][/dbname][?param1=value1&...]
url="jdbc:mysql://localhost:3306/oneqshopv1"
properties = {
    "user": user,
    "password": pw
}
df_db1=spark.read.option("driver", "com.mysql.jdbc.Driver").jdbc(url=url,table=table_name,properties=properties)
df_db1.show()

+---+--------------------+--------------------+----------+---------+--------------------+------------------+-------------+--------------------+------+-----+-----+----------+----------+
| id|            password|          last_login|first_name|last_name|               email|          username| phone_number|             picture|active|staff|admin|all_logout|is_deleted|
+---+--------------------+--------------------+----------+---------+--------------------+------------------+-------------+--------------------+------+-----+-----+----------+----------+
|  1|pbkdf2_sha256$180...|2021-05-27 16:14:...|          | Silveira|dms24081999@gmail...|             admin|+919594183245|media/public/prof...|  true| true| true|          |     false|
|  2|pbkdf2_sha256$180...|                null|          |         |       user1@dms.com|             user1|         null|                    |  true|false|false|          |     false|
|  3|pbkdf2_sha256$180...|                null|          |         |       

### Array data types must be same

In [77]:
df_datatype = spark.createDataFrame( [('a',[1,"a"]), ('a',[1,"a"]), ('a',[1,"a"]),
                              ('a',[1,"a"]), ('b',[1,"a"]), ('b',[1,"a"])], ('key', 'val') )
print(df_datatype.show(2))
df_datatype = spark.createDataFrame( [('a',[1,2]), ('a',[1,2]), ('a',[1,5]),
                              ('a',[1,2]), ('b',[1,3]), ('b',[1,4])], ('key', 'val') )
print(df_datatype.show(2))

+---+----+
|key| val|
+---+----+
|  a|[1,]|
|  a|[1,]|
+---+----+
only showing top 2 rows

None
+---+------+
|key|   val|
+---+------+
|  a|[1, 2]|
|  a|[1, 2]|
+---+------+
only showing top 2 rows

None


### duplicate cols get auto created

In [78]:
pdf=[(1,'a',4,'a',4.1,'d'),(2,'b',3,'b',3.2,'c'),(3,'c',2,'c',2.3,'b'),(1,'d',1,'d',1.4,'a')]
df15 = spark.createDataFrame(pdf, ('x','y','z','a','b','a') )
df15.show(2)

try: df15.select(df15.a).show(2)
except: print("failed")
    
df15.columns

try: df15.select(df15.columns[3]).show(2)
except: print("failed")
    
df15.withColumnRenamed('a', 'b_id').show(2)
df15.drop('a').show(2)

new_cols = ['x','y','z','a','b','a_dup']
df15.toDF(*new_cols).show(2)

+---+---+---+---+---+---+
|  x|  y|  z|  a|  b|  a|
+---+---+---+---+---+---+
|  1|  a|  4|  a|4.1|  d|
|  2|  b|  3|  b|3.2|  c|
+---+---+---+---+---+---+
only showing top 2 rows

failed
failed
+---+---+---+----+---+----+
|  x|  y|  z|b_id|  b|b_id|
+---+---+---+----+---+----+
|  1|  a|  4|   a|4.1|   d|
|  2|  b|  3|   b|3.2|   c|
+---+---+---+----+---+----+
only showing top 2 rows

+---+---+---+---+
|  x|  y|  z|  b|
+---+---+---+---+
|  1|  a|  4|4.1|
|  2|  b|  3|3.2|
+---+---+---+---+
only showing top 2 rows

+---+---+---+---+---+-----+
|  x|  y|  z|  a|  b|a_dup|
+---+---+---+---+---+-----+
|  1|  a|  4|  a|4.1|    d|
|  2|  b|  3|  b|3.2|    c|
+---+---+---+---+---+-----+
only showing top 2 rows



# Merge and split columns

In [117]:
df1rdd_merge=df1.rdd.map(lambda x: Row(model=x[1], values=list([x[0]])+list(x[2:])))
df1rdd_merge.take(3)

[Row(model=1, values=['01-Jan-1995', 'http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'Toy Story (1995)']),
 Row(model=2, values=['01-Jan-1995', 'http://us.imdb.com/M/title-exact?GoldenEye%20(1995)', [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], 'GoldenEye (1995)']),
 Row(model=3, values=['01-Jan-1995', 'http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], 'Four Rooms (1995)'])]

In [120]:
df1rdd_split=df1rdd_merge.map(lambda x: Row(model=x[0], values1=x[1][1:3],values2=x[1][3:]))
df1rdd_split.take(3)

[Row(model=1, values1=['http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], values2=['Toy Story (1995)']),
 Row(model=2, values1=['http://us.imdb.com/M/title-exact?GoldenEye%20(1995)', [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]], values2=['GoldenEye (1995)']),
 Row(model=3, values1=['http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]], values2=['Four Rooms (1995)'])]

# Inspect Data

In [21]:
df1.dtypes # Return df column names and data types

[('date', 'string'),
 ('id', 'bigint'),
 ('link', 'string'),
 ('rating', 'array<bigint>'),
 ('title', 'string')]

In [24]:
# default: vertical=False, Truncate=True
df1.show(2,vertical=True,truncate=True)
df1.show(2,vertical=True,truncate=False)

-RECORD 0----------------------
 date   | 01-Jan-1995          
 id     | 1                    
 link   | http://us.imdb.co... 
 rating | [0, 0, 0, 1, 1, 1... 
 title  | Toy Story (1995)     
-RECORD 1----------------------
 date   | 01-Jan-1995          
 id     | 2                    
 link   | http://us.imdb.co... 
 rating | [0, 1, 1, 0, 0, 0... 
 title  | GoldenEye (1995)     
only showing top 2 rows

-RECORD 0-----------------------------------------------------------
 date   | 01-Jan-1995                                               
 id     | 1                                                         
 link   | http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)     
 rating | [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 
 title  | Toy Story (1995)                                          
-RECORD 1-----------------------------------------------------------
 date   | 01-Jan-1995                                               
 id     | 2                            

In [47]:
df1.show(df1.count(),truncate=False) # show all with help of count

+-----------+----+--------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------------------------------+
|date       |id  |link                                                                                                                                  |rating                                                   |title                                                                            |
+-----------+----+--------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------+---------------------------------------------------------------------------------+
|01-Jan-1995|1   |http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)                                               

In [27]:
print(df1.head(2),end="\n\n") # Return first n rows
print(df1.head()) # Return first row

[Row(date='01-Jan-1995', id=1, link='http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', rating=[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], title='Toy Story (1995)'), Row(date='01-Jan-1995', id=2, link='http://us.imdb.com/M/title-exact?GoldenEye%20(1995)', rating=[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], title='GoldenEye (1995)')]

Row(date='01-Jan-1995', id=1, link='http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', rating=[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], title='Toy Story (1995)')


In [63]:
df1.first() # Return first row

Row(date='01-Jan-1995', id=1, link='http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', rating=[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], title='Toy Story (1995)')

In [64]:
df1.take(2) # Return the first n rows

[Row(date='01-Jan-1995', id=1, link='http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', rating=[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], title='Toy Story (1995)'),
 Row(date='01-Jan-1995', id=2, link='http://us.imdb.com/M/title-exact?GoldenEye%20(1995)', rating=[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], title='GoldenEye (1995)')]

In [65]:
df1.schema # Return the schema of df

StructType(List(StructField(date,StringType,true),StructField(id,LongType,true),StructField(link,StringType,true),StructField(rating,ArrayType(LongType,true),true),StructField(title,StringType,true)))

In [66]:
df1.printSchema() # Return the schema of df

root
 |-- date: string (nullable = true)
 |-- id: long (nullable = true)
 |-- link: string (nullable = true)
 |-- rating: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- title: string (nullable = true)



In [28]:
df1.describe().show(vertical=True,truncate=False) # Compute summary statistics

-RECORD 0---------------------------------------------------------------
 summary | count                                                        
 date    | 1682                                                         
 id      | 1682                                                         
 link    | 1682                                                         
 title   | 1682                                                         
-RECORD 1---------------------------------------------------------------
 summary | mean                                                         
 date    | null                                                         
 id      | 841.5                                                        
 link    | null                                                         
 title   | null                                                         
-RECORD 2---------------------------------------------------------------
 summary | stddev                                  

In [99]:
df1.describe("id").show()

+-------+------------------+
|summary|                id|
+-------+------------------+
|  count|              1682|
|   mean|             841.5|
| stddev|485.69589250888254|
|    min|                 1|
|    max|              1682|
+-------+------------------+



In [68]:
df1.columns # Return the columns of df

['date', 'id', 'link', 'rating', 'title']

In [69]:
df1.count() # Count the number of rows in df

1682

In [70]:
df1.distinct().count() # Count the number of distinct rows in df

1682

In [42]:
df1.filter(df1.id>2).explain() # Print the (logical and physical) plans

== Physical Plan ==
*(1) Filter (isnotnull(id#1L) && (id#1L > 2))
+- Scan ExistingRDD[date#0,id#1L,link#2,rating#3,title#4]


# Duplicate Values

In [96]:
print(df4.count())
print(df4.dropDuplicates().count())
print(df4.dropDuplicates(["id"]).count())
print(df4.dropDuplicates(["lastName","firstName"]).count())
print(df4.dropDuplicates(["firstName","lastName"]).count())

7
5
3
5
5


In [32]:
df4.show()

+----+---------+--------+
|  id|firstName|lastName|
+----+---------+--------+
|   1|     null|   Brown|
|null|      Tom|    null|
|null|    Jerry|     Fox|
|null|    Jerry|     Fox|
|   3|   Joshua|Peterson|
|   3|   Joshua|Peterson|
|null|     null|    null|
+----+---------+--------+



# Queries (Select and filter)

In [47]:
df1.select("id").show(2)

+---+
| id|
+---+
|  1|
|  2|
+---+
only showing top 2 rows



In [68]:
df1.select("id","title").show(2)

+---+----------------+
| id|           title|
+---+----------------+
|  1|Toy Story (1995)|
|  2|GoldenEye (1995)|
+---+----------------+
only showing top 2 rows



### selecting repeating col cause duplicate cols

In [74]:
df1_filter_col = df1.id
print(df1_filter_col)
df1_all_col= [eval('df1.' + c) for c in df1.columns]
print(df1_all_col)
df1_col = [df1_filter_col] + df1_all_col
df1_index=df1.select(df1_col)
df1_index.show(2)

Column<b'id'>
[Column<b'date'>, Column<b'id'>, Column<b'link'>, Column<b'rating'>, Column<b'title'>]
+---+-----------+---+--------------------+--------------------+----------------+
| id|       date| id|                link|              rating|           title|
+---+-----------+---+--------------------+--------------------+----------------+
|  1|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|
|  2|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|GoldenEye (1995)|
+---+-----------+---+--------------------+--------------------+----------------+
only showing top 2 rows



# Queries (cast)

In [27]:
df1.select(col("id").cast(DoubleType()),"title",col("id").cast("double")).show(2)

+---+----------------+---+
| id|           title| id|
+---+----------------+---+
|1.0|Toy Story (1995)|1.0|
|2.0|GoldenEye (1995)|2.0|
+---+----------------+---+
only showing top 2 rows



# Queries (selectExpr)

In [26]:
df1.selectExpr("cast(id as float)","title","rating[0] as rating0",
              "rating[1] rating1").show(2)

+---+----------------+-------+-------+
| id|           title|rating0|rating1|
+---+----------------+-------+-------+
|1.0|Toy Story (1995)|      0|      0|
|2.0|GoldenEye (1995)|      0|      1|
+---+----------------+-------+-------+
only showing top 2 rows



### array (must have same data type)

In [78]:
df1_all_col= [eval('df1.' + c) for c in ['title','link']]
df1_index.select(F.array(df1_all_col)).limit(2).toPandas()

Unnamed: 0,"array(title, link)"
0,"[Toy Story (1995), http://us.imdb.com/M/title-..."
1,"[GoldenEye (1995), http://us.imdb.com/M/title-..."


In [170]:
df1.select('title', F.array_contains(df1.rating, 2).alias('new_features')).show(2)
df1.select('title', F.array_contains(df1.rating, 1).alias('new_features')).show(2)

+----------------+------------+
|           title|new_features|
+----------------+------------+
|Toy Story (1995)|       false|
|GoldenEye (1995)|       false|
+----------------+------------+
only showing top 2 rows

+----------------+------------+
|           title|new_features|
+----------------+------------+
|Toy Story (1995)|        true|
|GoldenEye (1995)|        true|
+----------------+------------+
only showing top 2 rows



In [177]:
df1.filter(F.array_contains(df1.rating, 2)).show(2)
df1.filter(F.array_contains(df1.rating, 1)).show(2)

+----+---+----+------+-----+
|date| id|link|rating|title|
+----+---+----+------+-----+
+----+---+----+------+-----+

+-----------+---+--------------------+--------------------+----------------+
|       date| id|                link|              rating|           title|
+-----------+---+--------------------+--------------------+----------------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|
|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|GoldenEye (1995)|
+-----------+---+--------------------+--------------------+----------------+
only showing top 2 rows



### Select specific rows

In [175]:
df1_index.filter(df1_index.id.isin([1,2,6,9])).show()

+---+-----------+---+--------------------+--------------------+--------------------+
| id|       date| id|                link|              rating|               title|
+---+-----------+---+--------------------+--------------------+--------------------+
|  1|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|    Toy Story (1995)|
|  2|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|    GoldenEye (1995)|
|  6|01-Jan-1995|  6|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|Shanghai Triad (Y...|
|  9|01-Jan-1995|  9|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|Dead Man Walking ...|
+---+-----------+---+--------------------+--------------------+--------------------+



### Select rows by a cutoff index

In [37]:
df1_index.filter(df1_index.id > 9).show(3)

+---+-----------+---+--------------------+--------------------+--------------------+
| id|       date| id|                link|              rating|               title|
+---+-----------+---+--------------------+--------------------+--------------------+
| 10|22-Jan-1996| 10|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|  Richard III (1995)|
| 11|01-Jan-1995| 11|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|Seven (Se7en) (1995)|
| 12|14-Aug-1995| 12|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|Usual Suspects, T...|
+---+-----------+---+--------------------+--------------------+--------------------+
only showing top 3 rows



### modify data

In [41]:
df1.select(col("id") * 100).show(2)

+----------+
|(id * 100)|
+----------+
|       100|
|       200|
+----------+
only showing top 2 rows



In [67]:
# Show all entries in title and id, add 1 to the entries of age
df1.select(df1.title,df1.id+1).show(3)
df1.select(df1["title"],df1["id"]+1).show(3)

+-----------------+--------+
|            title|(id + 1)|
+-----------------+--------+
| Toy Story (1995)|       2|
| GoldenEye (1995)|       3|
|Four Rooms (1995)|       4|
+-----------------+--------+
only showing top 3 rows

+-----------------+--------+
|            title|(id + 1)|
+-----------------+--------+
| Toy Story (1995)|       2|
| GoldenEye (1995)|       3|
|Four Rooms (1995)|       4|
+-----------------+--------+
only showing top 3 rows



### lit

In [58]:
df1.select(df1.title,F.lit(1),F.lit("H")).show(3)
df1.select(df1.title,F.lit(1),F.lit("H")).printSchema()

+-----------------+---+---+
|            title|  1|  H|
+-----------------+---+---+
| Toy Story (1995)|  1|  H|
| GoldenEye (1995)|  1|  H|
|Four Rooms (1995)|  1|  H|
+-----------------+---+---+
only showing top 3 rows

root
 |-- title: string (nullable = true)
 |-- 1: integer (nullable = false)
 |-- H: string (nullable = false)



### isin

In [176]:
df1.select(df1.id.isin([2,3])).show(4)
df1[df1.title.isin("Toy Story (1995)","Four")].show(3)
df1.filter(df1.title.isin("Toy Story (1995)","Four")).show(3)

+--------------+
|(id IN (2, 3))|
+--------------+
|         false|
|          true|
|          true|
|         false|
+--------------+
only showing top 4 rows

+-----------+---+--------------------+--------------------+----------------+
|       date| id|                link|              rating|           title|
+-----------+---+--------------------+--------------------+----------------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|
+-----------+---+--------------------+--------------------+----------------+

+-----------+---+--------------------+--------------------+----------------+
|       date| id|                link|              rating|           title|
+-----------+---+--------------------+--------------------+----------------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|
+-----------+---+--------------------+--------------------+----------------+



### like

In [87]:
# Show id and title is TRUE if title is like "Toy Story (1995)"
df1.select("id", df1.title.like("%Story%")).show(3)
df1.filter(df1.title.like("%T%St__y_(%")).select('id','title').show(3,truncate=False)

+---+------------------+
| id|title LIKE %Story%|
+---+------------------+
|  1|              true|
|  2|             false|
|  3|             false|
+---+------------------+
only showing top 3 rows

+----+-------------------------------------------------+
|id  |title                                            |
+----+-------------------------------------------------+
|1   |Toy Story (1995)                                 |
|308 |FairyTale: A True Story (1997)                   |
|1653|Entertaining Angels: The Dorothy Day Story (1996)|
+----+-------------------------------------------------+



### startswith, endswith, contains

In [45]:
# Show title starting in "T" as True
df1.select("id",df1.title.startswith("T")).show(2)

# Show title ending in "Story (1995)" as True
df1.select("id",df1.title.endswith("Story (1995)")).show(2)

# Show title containing in "Story" as True
df1.select("id",df1.title.contains("Story")).show(2)


# filter
df1.filter(df1.title.startswith("T")).show(2)
df1.filter(df1.title.endswith("(1995)")).show(2)
df1.filter(df1.title.contains("Story")).show(2)

+---+--------------------+
| id|startswith(title, T)|
+---+--------------------+
|  1|                true|
|  2|               false|
+---+--------------------+
only showing top 2 rows

+---+-----------------------------+
| id|endswith(title, Story (1995))|
+---+-----------------------------+
|  1|                         true|
|  2|                        false|
+---+-----------------------------+
only showing top 2 rows

+---+----------------------+
| id|contains(title, Story)|
+---+----------------------+
|  1|                  true|
|  2|                 false|
+---+----------------------+
only showing top 2 rows

+-----------+---+--------------------+--------------------+--------------------+
|       date| id|                link|              rating|               title|
+-----------+---+--------------------+--------------------+--------------------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|    Toy Story (1995)|
|01-Jan-1995|  7|http://us.imdb.co...|[0, 0, 0, 0

### substring

In [85]:
# Return substrings of title
df1.select(df1.title.substr(1, 3).alias("name")).show(2)
df1.filter(df1.title.substr(1, 3)=="Toy").show(2)

+----+
|name|
+----+
| Toy|
| Gol|
+----+
only showing top 2 rows

+-----------+---+--------------------+--------------------+----------------+
|       date| id|                link|              rating|           title|
+-----------+---+--------------------+--------------------+----------------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|
+-----------+---+--------------------+--------------------+----------------+



### between

In [86]:
# Show id: values are TRUE if between 2 and 3
df1.select(df1.id.between(2, 3)).show(3)
df1.filter(df1.id.between(2, 3)).show(3)

+-------------------------+
|((id >= 2) AND (id <= 3))|
+-------------------------+
|                    false|
|                     true|
|                     true|
+-------------------------+
only showing top 3 rows

+-----------+---+--------------------+--------------------+-----------------+
|       date| id|                link|              rating|            title|
+-----------+---+--------------------+--------------------+-----------------+
|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...| GoldenEye (1995)|
|01-Jan-1995|  3|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|Four Rooms (1995)|
+-----------+---+--------------------+--------------------+-----------------+



### isNull and isNotNull

In [51]:
df4.select(df4.id.isNull()).show(2)
df4.filter(df4.id.isNull()).show(2)

+------------+
|(id IS NULL)|
+------------+
|       false|
|        true|
+------------+
only showing top 2 rows

+----+---------+--------+
|  id|firstName|lastName|
+----+---------+--------+
|null|      Tom|    null|
|null|    Jerry|     Fox|
+----+---------+--------+
only showing top 2 rows



In [55]:
df4.select(df4.id.isNotNull()).show(2)
df4.filter(df4.id.isNotNull()).show(2)

+----------------+
|(id IS NOT NULL)|
+----------------+
|            true|
|           false|
+----------------+
only showing top 2 rows

+---+---------+--------+
| id|firstName|lastName|
+---+---------+--------+
|  1|     null|   Brown|
|  3|   Joshua|Peterson|
+---+---------+--------+
only showing top 2 rows



### rlike

In [68]:
df1.select(df1.date.rlike("95$")).show(2)
df1.filter(df1.date.rlike("95$")).show(2)

+--------------+
|date RLIKE 95$|
+--------------+
|          true|
|          true|
+--------------+
only showing top 2 rows

+-----------+---+--------------------+--------------------+----------------+
|       date| id|                link|              rating|           title|
+-----------+---+--------------------+--------------------+----------------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|
|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|GoldenEye (1995)|
+-----------+---+--------------------+--------------------+----------------+
only showing top 2 rows



### explode

In [49]:
# Show all entries adn explode the list of rating
df1.select("id","title",F.explode("rating").alias("rating")).show(20)

+---+----------------+------+
| id|           title|rating|
+---+----------------+------+
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     1|
|  1|Toy Story (1995)|     1|
|  1|Toy Story (1995)|     1|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  1|Toy Story (1995)|     0|
|  2|GoldenEye (1995)|     0|
+---+----------------+------+
only showing top 20 rows



In [63]:
# Show all entries adn explode the list of rating and select only title from title_year
df3.select("id",col("title_year.title").alias("title"),F.explode("rating").alias("rating")) \
    .select("id","rating","title").show(20)

+---+------+---------+
| id|rating|    title|
+---+------+---------+
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  1|     1|Toy Story|
|  1|     1|Toy Story|
|  1|     1|Toy Story|
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  1|     0|Toy Story|
|  2|     0|GoldenEye|
+---+------+---------+
only showing top 20 rows



### collect_list

In [158]:
df15.select(F.collect_list(df15.x),F.collect_list(df15.z)).show()

+---------------+---------------+
|collect_list(x)|collect_list(z)|
+---------------+---------------+
|   [1, 2, 3, 1]|   [4, 3, 2, 1]|
+---------------+---------------+



### collect_set

In [159]:
df15.select(F.collect_set(df15.x),F.collect_list(df15.z)).show()

+--------------+---------------+
|collect_set(x)|collect_list(z)|
+--------------+---------------+
|     [1, 2, 3]|   [4, 3, 2, 1]|
+--------------+---------------+



### concat

In [160]:
df15.select('x','y',F.concat(df15.x,df15.y,df15.z)).show(2)

+---+---+---------------+
|  x|  y|concat(x, y, z)|
+---+---+---------------+
|  1|  a|            1a4|
|  2|  b|            2b3|
+---+---+---------------+
only showing top 2 rows



### concat_ws

In [161]:
df15.select('x','y',F.concat_ws('_',df15.x,df15.y,df15.z)).show(2)

+---+---+---------------------+
|  x|  y|concat_ws(_, x, y, z)|
+---+---+---------------------+
|  1|  a|                1_a_4|
|  2|  b|                2_b_3|
+---+---+---------------------+
only showing top 2 rows



### create_map

In [162]:
df15.select(F.create_map('x', 'y')).show(2) # 2 cols only

+---------+
|map(x, y)|
+---------+
| [1 -> a]|
| [2 -> b]|
+---------+
only showing top 2 rows



### count

In [163]:
df15.select(F.count(df15.x)).show()

+--------+
|count(x)|
+--------+
|       4|
+--------+



### countDistinct

In [164]:
df15.select(F.countDistinct(df15.x)).show()

+-----------------+
|count(DISTINCT x)|
+-----------------+
|                3|
+-----------------+



### ceil

In [166]:
df15.select('b', F.ceil(df15.b)).show(2)

+---+-------+
|  b|CEIL(b)|
+---+-------+
|4.1|      5|
|3.2|      4|
+---+-------+
only showing top 2 rows



# Queries (Filter)

In [91]:
# Filter entries of id, only keep those records of which the values are >24
print(df1.select("title").filter(df1["id"]>24).show(2))
print(df1.select("title","id").filter(col("id")>24).show(2))
print(df1.select("title").filter(col("id")>24).show(2))
print(df1.filter(col("id")>24).show(2))

#invalid
print(df1.filter("id">24).show(2))

+--------------------+
|               title|
+--------------------+
|Birdcage, The (1996)|
|Brothers McMullen...|
+--------------------+
only showing top 2 rows

None
+--------------------+---+
|               title| id|
+--------------------+---+
|Birdcage, The (1996)| 25|
|Brothers McMullen...| 26|
+--------------------+---+
only showing top 2 rows

None
+--------------------+
|               title|
+--------------------+
|Birdcage, The (1996)|
|Brothers McMullen...|
+--------------------+
only showing top 2 rows

None
+-----------+---+--------------------+--------------------+--------------------+
|       date| id|                link|              rating|               title|
+-----------+---+--------------------+--------------------+--------------------+
|08-Mar-1996| 25|http://us.imdb.co...|[0, 0, 0, 0, 0, 1...|Birdcage, The (1996)|
|01-Jan-1995| 26|http://us.imdb.co...|[0, 0, 0, 0, 0, 1...|Brothers McMullen...|
+-----------+---+--------------------+--------------------+--------

TypeError: '>' not supported between instances of 'str' and 'int'

### logical OR, AND and NOT

In [106]:
df1.filter((col("id")>24) & (df1.rating[1]==1)).show(2)
df1.filter((col("id")>24) & ~(df1.rating[1]==1)).show(2)
df1.filter((col("id")>24) | (df1.rating[1]==1)).show(2)
df1.filter((col("id")>24) | ~(df1.rating[1]==1)).show(2)
df1.select("title").filter((col('id')>100) & (df1.rating[1]==1)).show(2)
df1.select("title","rating").filter("id > 100 AND rating[1] == 1").show(2)

#invalid
# df1.select("title","rating").filter("id>100 & rating[1]==1").show(2)
# df1.select("title").filter(col("id")>24 & df1.rating[1]==1).show(2)

+-----------+---+--------------------+--------------------+----------------+
|       date| id|                link|              rating|           title|
+-----------+---+--------------------+--------------------+----------------+
|01-Jan-1995| 27|http://us.imdb.co...|[0, 1, 0, 0, 0, 0...| Bad Boys (1995)|
|01-Jan-1995| 28|http://us.imdb.co...|[0, 1, 0, 0, 0, 0...|Apollo 13 (1995)|
+-----------+---+--------------------+--------------------+----------------+
only showing top 2 rows

+-----------+---+--------------------+--------------------+--------------------+
|       date| id|                link|              rating|               title|
+-----------+---+--------------------+--------------------+--------------------+
|08-Mar-1996| 25|http://us.imdb.co...|[0, 0, 0, 0, 0, 1...|Birdcage, The (1996)|
|01-Jan-1995| 26|http://us.imdb.co...|[0, 0, 0, 0, 0, 1...|Brothers McMullen...|
+-----------+---+--------------------+--------------------+--------------------+
only showing top 2 rows

+-

# Queries (Where)

In [141]:
df1.where(df1.id>100).show(2)

+-----------+---+--------------------+--------------------+--------------------+
|       date| id|                link|              rating|               title|
+-----------+---+--------------------+--------------------+--------------------+
|08-Mar-1981|101|http://us.imdb.co...|[0, 1, 1, 1, 0, 0...|  Heavy Metal (1981)|
|01-Jan-1970|102|http://us.imdb.co...|[0, 0, 0, 1, 1, 0...|Aristocats, The (...|
+-----------+---+--------------------+--------------------+--------------------+
only showing top 2 rows



In [104]:
df1.where((col('id')>100) & (df1.title.like("H%"))).show(2)
df1.where((col('id')>100) & ~(df1.title.like("H%"))).show(2)
df1.where((col('id')>100) | (df1.title.like("H%"))).show(2)
df1.where((col('id')>100) | ~(df1.title.like("H%"))).show(2)
df1.select("title").where((col('id')>100) & (df1.rating[1]==1)).show(2)
df1.select("title","rating").where("id>100 AND rating[1]==1").show(2)

#invalid
# df1.select("title","rating").where("id>100 & rating[1]==1").show(2)
# df1.where(col('id')>100 & df1.rating[1]==1).show(2)

+-----------+---+--------------------+--------------------+--------------------+
|       date| id|                link|              rating|               title|
+-----------+---+--------------------+--------------------+--------------------+
|08-Mar-1981|101|http://us.imdb.co...|[0, 1, 1, 1, 0, 0...|  Heavy Metal (1981)|
|19-Apr-1996|113|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|Horseman on the R...|
+-----------+---+--------------------+--------------------+--------------------+
only showing top 2 rows

+-----------+---+--------------------+--------------------+--------------------+
|       date| id|                link|              rating|               title|
+-----------+---+--------------------+--------------------+--------------------+
|01-Jan-1970|102|http://us.imdb.co...|[0, 0, 0, 1, 1, 0...|Aristocats, The (...|
|29-Mar-1996|103|http://us.imdb.co...|[0, 0, 0, 1, 1, 0...|All Dogs Go to He...|
+-----------+---+--------------------+--------------------+--------------------+
onl

# Queries (When)

In [83]:
# Show firstName and 0 or 1 depending on id>2
df1.select("title",F.when(df1.id == 2, "Yes").when(df1.id == 3, "OK").otherwise("No")).show(3)

+-----------------+-------------------------------------------------------------+
|            title|CASE WHEN (id = 2) THEN Yes WHEN (id = 3) THEN OK ELSE No END|
+-----------------+-------------------------------------------------------------+
| Toy Story (1995)|                                                           No|
| GoldenEye (1995)|                                                          Yes|
|Four Rooms (1995)|                                                           OK|
+-----------------+-------------------------------------------------------------+
only showing top 3 rows



# Queries (GroupBy)

In [107]:
# Group by title_year.year, count the members in the groups
df3.groupBy("title_year.year").count().show(3)

+----+-----+
|year|count|
+----+-----+
|1953|    2|
|1957|    8|
|1987|   14|
+----+-----+
only showing top 3 rows



In [108]:
df3.groupBy("title_year.year").agg(F.collect_list(col("title_year.title"))).show(3)

+----+------------------------------+
|year|collect_list(title_year.title)|
+----+------------------------------+
|1953|          [Roman Holiday, B...|
|1957|          [12 Angry Men, Br...|
|1987|          [Dirty Dancing, P...|
+----+------------------------------+
only showing top 3 rows



### mathematical Operations

In [49]:
df3.groupBy("title_year.year").agg(
    F.format_string("%04d", F.min("id")).alias("min_id_padded"),
    F.max("id").alias("max_id"),
    F.ceil(F.avg(df3.id)).alias("avg_ceil"),
    F.floor(F.avg(df3.id)).alias("avg_floor"),
    F.round(F.avg(df3.id),2).alias("avg_round"),
    F.format_number(F.avg(df3.id),3).alias('avg_round2'),
    F.lpad(F.avg(df3.id),8,'0').alias('lpad')
).show(3,truncate=False)

+----+-------------+------+--------+---------+---------+----------+--------+
|year|min_id_padded|max_id|avg_ceil|avg_floor|avg_round|avg_round2|lpad    |
+----+-------------+------+--------+---------+---------+----------+--------+
|1953|0487         |1298  |893     |892      |892.5    |892.500   |000892.5|
|1957|0178         |1269  |661     |660      |660.5    |660.500   |000660.5|
|1987|0155         |1576  |743     |742      |742.79   |742.786   |742.7857|
+----+-------------+------+--------+---------+---------+----------+--------+
only showing top 3 rows



In [109]:
print(df3.agg(F.max(df3.id)).head())
print(df3.agg(F.max(df3.id)).head()[0])
print(df3.agg(F.max(df3.id).alias("max")).head()["max"])
print(df3.agg(F.max(df3.id).alias("max")).collect())
print(df3.agg(F.max(df3.id).alias("max")).collect()[0]["max"])

Row(max(id)=1682)
1682
1682
[Row(max=1682)]
1682


# Queries (Pivot)

In [82]:
df5 = spark.createDataFrame( [('a',1), ('a',2), ('a',3),
                              ('a',1), ('b',1), ('b',2)], ('key', 'val') )
print(df5.show())
print(
    df5.groupBy('key').pivot('val').count().show()
)
print(
    df5.groupBy('key').pivot('val').sum('val').show()
)

+---+---+
|key|val|
+---+---+
|  a|  1|
|  a|  2|
|  a|  3|
|  a|  1|
|  b|  1|
|  b|  2|
+---+---+

None
+---+---+---+----+
|key|  1|  2|   3|
+---+---+---+----+
|  b|  1|  1|null|
|  a|  2|  1|   1|
+---+---+---+----+

None
+---+---+---+----+
|key|  1|  2|   3|
+---+---+---+----+
|  b|  1|  2|null|
|  a|  2|  2|   3|
+---+---+---+----+

None


# Queries (Sample, limit)

In [47]:
df1.limit(2).toPandas()

Unnamed: 0,date,id,link,rating,title
0,01-Jan-1995,1,http://us.imdb.com/M/title-exact?Toy%20Story%2...,"[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Toy Story (1995)
1,01-Jan-1995,2,http://us.imdb.com/M/title-exact?GoldenEye%20(...,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",GoldenEye (1995)


In [112]:
df6 = spark.createDataFrame( [('a',1), ('a',2), ('a',3),
                           ('b',1), ('b',2)], ('key', 'val') )
df6.show(vertical=True)

-RECORD 0--
 key | a   
 val | 1   
-RECORD 1--
 key | a   
 val | 2   
-RECORD 2--
 key | a   
 val | 3   
-RECORD 3--
 key | b   
 val | 1   
-RECORD 4--
 key | b   
 val | 2   



In [113]:
df6.sample(False,1.0,81).show()

+---+---+
|key|val|
+---+---+
|  a|  1|
|  a|  2|
|  a|  3|
|  b|  1|
|  b|  2|
+---+---+



In [114]:
df6.sample(True,1.0,81).show()

+---+---+
|key|val|
+---+---+
|  a|  2|
|  a|  2|
|  b|  1|
|  b|  2|
+---+---+



In [175]:
df6.sample(False,0.50,5).show()

+---+---+
|key|val|
+---+---+
|  a|  3|
|  b|  1|
|  b|  2|
+---+---+



In [115]:
df6.sample(True,0.50,5).show()

+---+---+
|key|val|
+---+---+
|  a|  1|
|  a|  3|
|  b|  1|
|  b|  1|
+---+---+



# Queries (SampleBy)

In [117]:
df6.sampleBy("key",{"a":1.0},5).show()

+---+---+
|key|val|
+---+---+
|  a|  1|
|  a|  2|
|  a|  3|
+---+---+



In [118]:
df6.sampleBy("key",{"a":1.0,"b":0.30},81).show()

+---+---+
|key|val|
+---+---+
|  a|  1|
|  a|  2|
|  a|  3|
|  b|  2|
+---+---+



# Cube
In the below statement we have applied cube, count, and sort function together on the columns which generate grand total cases including Null values.

cube function takes a list of column names and returns possible combinations of grouping columns. We can apply aggregations functions( sum,count,min,max,etc) on the combinations to generate useful information.

In [66]:
df13=spark.createDataFrame([("Chocolate",2),("Kurkure",5),("Sheets",20),
                          ("Kurkure",20),("Chocolate",10),("Chocolate",5)],
                         ["Item_Name","Quantity"])
df13.show()
df13.cube("Item_Name",df13["Quantity"]).count().sort("Item_Name","Quantity").show()

+---------+--------+
|Item_Name|Quantity|
+---------+--------+
|Chocolate|       2|
|  Kurkure|       5|
|   Sheets|      20|
|  Kurkure|      20|
|Chocolate|      10|
|Chocolate|       5|
+---------+--------+

+---------+--------+-----+
|Item_Name|Quantity|count|
+---------+--------+-----+
|     null|    null|    6|
|     null|       2|    1|
|     null|       5|    2|
|     null|      10|    1|
|     null|      20|    2|
|Chocolate|    null|    3|
|Chocolate|       2|    1|
|Chocolate|       5|    1|
|Chocolate|      10|    1|
|  Kurkure|    null|    2|
|  Kurkure|       5|    1|
|  Kurkure|      20|    1|
|   Sheets|    null|    1|
|   Sheets|      20|    1|
+---------+--------+-----+



In [67]:
df13.cube("Item_Name",df13["Quantity"]).sum().sort("Item_Name","Quantity").show()

+---------+--------+-------------+
|Item_Name|Quantity|sum(Quantity)|
+---------+--------+-------------+
|     null|    null|           62|
|     null|       2|            2|
|     null|       5|           10|
|     null|      10|           10|
|     null|      20|           40|
|Chocolate|    null|           17|
|Chocolate|       2|            2|
|Chocolate|       5|            5|
|Chocolate|      10|           10|
|  Kurkure|    null|           25|
|  Kurkure|       5|            5|
|  Kurkure|      20|           20|
|   Sheets|    null|           20|
|   Sheets|      20|           20|
+---------+--------+-------------+



# rollup
rollup returns the subset of rows returned by the cube. It takes a list of column names as input and finds the possible combinations. We can apply the aggregate function to extract the needed information. The extracted rows are less in number but actually worth using.

In [69]:
df13.rollup("Item_Name",df13["Quantity"]).count().sort("Item_Name","Quantity").show()

+---------+--------+-----+
|Item_Name|Quantity|count|
+---------+--------+-----+
|     null|    null|    6|
|Chocolate|    null|    3|
|Chocolate|       2|    1|
|Chocolate|       5|    1|
|Chocolate|      10|    1|
|  Kurkure|    null|    2|
|  Kurkure|       5|    1|
|  Kurkure|      20|    1|
|   Sheets|    null|    1|
|   Sheets|      20|    1|
+---------+--------+-----+



# Add, Update & Remove Columns (Adding Columns)

In [119]:
df1.withColumn('new_column', F.lit(None).cast(StringType())).show(2)

+-----------+---+--------------------+--------------------+----------------+----------+
|       date| id|                link|              rating|           title|new_column|
+-----------+---+--------------------+--------------------+----------------+----------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|      null|
|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|GoldenEye (1995)|      null|
+-----------+---+--------------------+--------------------+----------------+----------+
only showing top 2 rows



In [120]:
df1.withColumn('new_column', F.lit('xyz')).show(2)

+-----------+---+--------------------+--------------------+----------------+----------+
|       date| id|                link|              rating|           title|new_column|
+-----------+---+--------------------+--------------------+----------------+----------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|       xyz|
|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|GoldenEye (1995)|       xyz|
+-----------+---+--------------------+--------------------+----------------+----------+
only showing top 2 rows



In [29]:
df1.withColumn('new_column', 1.4 * F.col('id')).show(2)
df1.withColumn('new_column', F.expr("1.4 * id")).show(2)

+-----------+---+--------------------+--------------------+----------------+----------+
|       date| id|                link|              rating|           title|new_column|
+-----------+---+--------------------+--------------------+----------------+----------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|       1.4|
|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|GoldenEye (1995)|       2.8|
+-----------+---+--------------------+--------------------+----------------+----------+
only showing top 2 rows

+-----------+---+--------------------+--------------------+----------------+----------+
|       date| id|                link|              rating|           title|new_column|
+-----------+---+--------------------+--------------------+----------------+----------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|       1.4|
|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|GoldenEye (1995)|       2.8|
+------

In [122]:
df3.withColumn("title",df3.title_year.title).show()

+---+--------------------+----------+--------------------+--------------------+
| id|          title_year|      date|              rating|               title|
+---+--------------------+----------+--------------------+--------------------+
|  1|   [Toy Story, 1995]|1995-01-01|[0, 0, 0, 1, 1, 1...|           Toy Story|
|  2|   [GoldenEye, 1995]|1995-01-01|[0, 1, 1, 0, 0, 0...|           GoldenEye|
|  3|  [Four Rooms, 1995]|1995-01-01|[0, 0, 0, 0, 0, 0...|          Four Rooms|
|  4|  [Get Shorty, 1995]|1995-01-01|[0, 1, 0, 0, 0, 1...|          Get Shorty|
|  5|     [Copycat, 1995]|1995-01-01|[0, 0, 0, 0, 0, 0...|             Copycat|
|  6|[Shanghai Triad (...|1995-01-01|[0, 0, 0, 0, 0, 0...|Shanghai Triad (Y...|
|  7|[Twelve Monkeys, ...|1995-01-01|[0, 0, 0, 0, 0, 0...|      Twelve Monkeys|
|  8|        [Babe, 1995]|1995-01-01|[0, 0, 0, 0, 1, 1...|                Babe|
|  9|[Dead Man Walking...|1995-01-01|[0, 0, 0, 0, 0, 0...|    Dead Man Walking|
| 10| [Richard III, 1995]|1996-01-22|[0,

In [123]:
df3.withColumn("rating",F.explode(df3.rating)).show()

+---+-----------------+----------+------+
| id|       title_year|      date|rating|
+---+-----------------+----------+------+
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     1|
|  1|[Toy Story, 1995]|1995-01-01|     1|
|  1|[Toy Story, 1995]|1995-01-01|     1|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  1|[Toy Story, 1995]|1995-01-01|     0|
|  2|[GoldenEye, 1995]|1995-01-01|     0|
+---+-----------------+----------+

# Add, Update & Remove Columns (Updating Columns)

In [124]:
df3.withColumnRenamed('title_year', 'title').show(2)

+---+-----------------+----------+--------------------+
| id|            title|      date|              rating|
+---+-----------------+----------+--------------------+
|  1|[Toy Story, 1995]|1995-01-01|[0, 0, 0, 1, 1, 1...|
|  2|[GoldenEye, 1995]|1995-01-01|[0, 1, 1, 0, 0, 0...|
+---+-----------------+----------+--------------------+
only showing top 2 rows



# Add, Update & Remove Columns (Removing Columns)

In [125]:
df1.drop("title", "rating").show(2)

+-----------+---+--------------------+
|       date| id|                link|
+-----------+---+--------------------+
|01-Jan-1995|  1|http://us.imdb.co...|
|01-Jan-1995|  2|http://us.imdb.co...|
+-----------+---+--------------------+
only showing top 2 rows



In [126]:
df1.drop(df1.title).drop(df1.rating).show(2)

+-----------+---+--------------------+
|       date| id|                link|
+-----------+---+--------------------+
|01-Jan-1995|  1|http://us.imdb.co...|
|01-Jan-1995|  2|http://us.imdb.co...|
+-----------+---+--------------------+
only showing top 2 rows



# Sort

In [127]:
df1.sort(df1.date.desc()).show(3)
# df1.sort("date",ascending=False).show(3)

df1.sort(df1.date.desc(),df1.id.asc()).show(3)
# df1.sort(df1.date,df1.id,ascending=[False,True]).show(3)
# df1.sort(df1.date,df1.id,ascending=[0,1]).show(3)

+-----------+----+--------------------+--------------------+-------------------+
|       date|  id|                link|              rating|              title|
+-----------+----+--------------------+--------------------+-------------------+
| 4-Feb-1971|1373|http://us.imdb.co...|[1, 0, 0, 0, 0, 0...|Good Morning (1971)|
|31-May-1996| 819|http://us.imdb.co...|[0, 0, 0, 0, 0, 1...|       Eddie (1996)|
|31-May-1996| 472|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...| Dragonheart (1996)|
+-----------+----+--------------------+--------------------+-------------------+
only showing top 3 rows

+-----------+----+--------------------+--------------------+-------------------+
|       date|  id|                link|              rating|              title|
+-----------+----+--------------------+--------------------+-------------------+
| 4-Feb-1971|1373|http://us.imdb.co...|[1, 0, 0, 0, 0, 0...|Good Morning (1971)|
|31-May-1996| 472|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...| Dragonheart (1996)|
|31

# OrderBy

In [128]:
df1.orderBy(df1.date.desc()).show(3)
# df1.orderBy("date",ascending=False).show(3)

df1.orderBy(df1.date.desc(),df1.id.asc()).show(3)
# df1.orderBy(df1.date,df1.id,ascending=[False,True]).show(3)
# df1.orderBy(df1.date,df1.id,ascending=[0,1]).show(3)

+-----------+----+--------------------+--------------------+-------------------+
|       date|  id|                link|              rating|              title|
+-----------+----+--------------------+--------------------+-------------------+
| 4-Feb-1971|1373|http://us.imdb.co...|[1, 0, 0, 0, 0, 0...|Good Morning (1971)|
|31-May-1996| 819|http://us.imdb.co...|[0, 0, 0, 0, 0, 1...|       Eddie (1996)|
|31-May-1996| 472|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...| Dragonheart (1996)|
+-----------+----+--------------------+--------------------+-------------------+
only showing top 3 rows

+-----------+----+--------------------+--------------------+-------------------+
|       date|  id|                link|              rating|              title|
+-----------+----+--------------------+--------------------+-------------------+
| 4-Feb-1971|1373|http://us.imdb.co...|[1, 0, 0, 0, 0, 0...|Good Morning (1971)|
|31-May-1996| 472|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...| Dragonheart (1996)|
|31

# Missing & Replacing Values

In [129]:
# Replace null values
df4.na.fill(50).show()

+---+---------+--------+
| id|firstName|lastName|
+---+---------+--------+
|  1|     null|   Brown|
| 50|      Tom|    null|
| 50|    Jerry|     Fox|
| 50|    Jerry|     Fox|
|  3|   Joshua|Peterson|
|  3|   Joshua|Peterson|
| 50|     null|    null|
+---+---------+--------+



In [39]:
# Replace null values of different data types
df4.na.fill(50).na.fill("hello",subset=["lastName"]).show()
df4.na.fill(50).na.fill("hello",subset=["lastname"]).show(2)

+---+---------+--------+
| id|firstName|lastName|
+---+---------+--------+
|  1|     null|   Brown|
| 50|      Tom|   hello|
| 50|    Jerry|     Fox|
| 50|    Jerry|     Fox|
|  3|   Joshua|Peterson|
|  3|   Joshua|Peterson|
| 50|     null|   hello|
+---+---------+--------+

+---+---------+--------+
| id|firstName|lastName|
+---+---------+--------+
|  1|     null|   Brown|
| 50|      Tom|   hello|
+---+---------+--------+
only showing top 2 rows



In [131]:
# Return new df omitting rows with null values
df4.na.drop().show()
df4.na.drop(how="any").show()

+---+---------+--------+
| id|firstName|lastName|
+---+---------+--------+
|  3|   Joshua|Peterson|
|  3|   Joshua|Peterson|
+---+---------+--------+

+---+---------+--------+
| id|firstName|lastName|
+---+---------+--------+
|  3|   Joshua|Peterson|
|  3|   Joshua|Peterson|
+---+---------+--------+



In [132]:
df4.na.drop(how="all").show()

+----+---------+--------+
|  id|firstName|lastName|
+----+---------+--------+
|   1|     null|   Brown|
|null|      Tom|    null|
|null|    Jerry|     Fox|
|null|    Jerry|     Fox|
|   3|   Joshua|Peterson|
|   3|   Joshua|Peterson|
+----+---------+--------+



In [136]:
df4.na.drop(how="any",subset=["firstName","lastName"]).show()
df4.na.drop(how="all",subset=["firstName","lastName"]).show()

+----+---------+--------+
|  id|firstName|lastName|
+----+---------+--------+
|null|    Jerry|     Fox|
|null|    Jerry|     Fox|
|   3|   Joshua|Peterson|
|   3|   Joshua|Peterson|
+----+---------+--------+

+----+---------+--------+
|  id|firstName|lastName|
+----+---------+--------+
|   1|     null|   Brown|
|null|      Tom|    null|
|null|    Jerry|     Fox|
|null|    Jerry|     Fox|
|   3|   Joshua|Peterson|
|   3|   Joshua|Peterson|
+----+---------+--------+



In [139]:
# thresh=2 = drop rows with 2 nulls or more
# drop rows that have less than `thresh` non-null values.
#     This overwrites the `how` parameter.
df4.na.drop(how="any",subset=["lastName"],thresh=1).show()
df4.na.drop(how="all",subset=["lastName"],thresh=1).show()
df4.na.drop(how="any",subset=["lastName"],thresh=2).show()
df4.na.drop(how="all",subset=["lastName"],thresh=2).show()

+----+---------+--------+
|  id|firstName|lastName|
+----+---------+--------+
|   1|     null|   Brown|
|null|    Jerry|     Fox|
|null|    Jerry|     Fox|
|   3|   Joshua|Peterson|
|   3|   Joshua|Peterson|
+----+---------+--------+

+----+---------+--------+
|  id|firstName|lastName|
+----+---------+--------+
|   1|     null|   Brown|
|null|    Jerry|     Fox|
|null|    Jerry|     Fox|
|   3|   Joshua|Peterson|
|   3|   Joshua|Peterson|
+----+---------+--------+

+---+---------+--------+
| id|firstName|lastName|
+---+---------+--------+
+---+---------+--------+

+---+---------+--------+
| id|firstName|lastName|
+---+---------+--------+
+---+---------+--------+



In [141]:
# Return new df replacing one value with another
df4.na.replace(1, 20).replace("Jerry", "bye").show()

+----+---------+--------+
|  id|firstName|lastName|
+----+---------+--------+
|  20|     null|   Brown|
|null|      Tom|    null|
|null|      bye|     Fox|
|null|      bye|     Fox|
|   3|   Joshua|Peterson|
|   3|   Joshua|Peterson|
|null|     null|    null|
+----+---------+--------+



# Data Structures

In [142]:
df1.rdd.collect() # Convert df into an RDD

[Row(date='01-Jan-1995', id=1, link='http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', rating=[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], title='Toy Story (1995)'),
 Row(date='01-Jan-1995', id=2, link='http://us.imdb.com/M/title-exact?GoldenEye%20(1995)', rating=[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], title='GoldenEye (1995)'),
 Row(date='01-Jan-1995', id=3, link='http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)', rating=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], title='Four Rooms (1995)'),
 Row(date='01-Jan-1995', id=4, link='http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)', rating=[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], title='Get Shorty (1995)'),
 Row(date='01-Jan-1995', id=5, link='http://us.imdb.com/M/title-exact?Copycat%20(1995)', rating=[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], title='Copycat (1995)'),
 Row(date='01-Jan-1995', id=6, link='http://us.imdb.com/Title?Yao+a+yao+ya

In [143]:
df1.toJSON().first() # Convert df into a RDD of string

'{"date":"01-Jan-1995","id":1,"link":"http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)","rating":[0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0],"title":"Toy Story (1995)"}'

In [144]:
df1.toPandas() # Return the contents of df as Pandas DataFrame

Unnamed: 0,date,id,link,rating,title
0,01-Jan-1995,1,http://us.imdb.com/M/title-exact?Toy%20Story%2...,"[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Toy Story (1995)
1,01-Jan-1995,2,http://us.imdb.com/M/title-exact?GoldenEye%20(...,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",GoldenEye (1995)
2,01-Jan-1995,3,http://us.imdb.com/M/title-exact?Four%20Rooms%...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Four Rooms (1995)
3,01-Jan-1995,4,http://us.imdb.com/M/title-exact?Get%20Shorty%...,"[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",Get Shorty (1995)
4,01-Jan-1995,5,http://us.imdb.com/M/title-exact?Copycat%20(1995),"[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...",Copycat (1995)
...,...,...,...,...,...
1677,06-Feb-1998,1678,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",Mat' i syn (1997)
1678,06-Feb-1998,1679,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",B. Monkey (1998)
1679,01-Jan-1998,1680,http://us.imdb.com/Title?Sliding+Doors+(1998),"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ...",Sliding Doors (1998)
1680,01-Jan-1994,1681,http://us.imdb.com/M/title-exact?You%20So%20Cr...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",You So Crazy (1994)


# Repartitioning

In [145]:
# df with 10 partitions
df1.repartition(10).rdd.getNumPartitions()

10

In [146]:
# df with 1 partition
df1.coalesce(1).rdd.getNumPartitions() 

1

# Registering DataFrames as Views

In [154]:
# Global permanent/non-replaceable Temp view
df1.createGlobalTempView("df1")

In [148]:
# Global replaceable Temp
df2.createOrReplaceGlobalTempView("df2")

In [155]:
# non-Global permanent/non-replaceable Temp view
df3.createTempView("df3")

In [150]:
# non-Global permanent/non-replaceable Temp view
df4.createOrReplaceTempView("df4")

# Query Views

In [162]:
# Access global with new session
spark.sql("SELECT * FROM global_temp.df1").show(2)
spark.newSession().sql("SELECT * FROM global_temp.df1").show(2)

+-----------+---+--------------------+--------------------+----------------+
|       date| id|                link|              rating|           title|
+-----------+---+--------------------+--------------------+----------------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|
|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|GoldenEye (1995)|
+-----------+---+--------------------+--------------------+----------------+
only showing top 2 rows

+-----------+---+--------------------+--------------------+----------------+
|       date| id|                link|              rating|           title|
+-----------+---+--------------------+--------------------+----------------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|
|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|GoldenEye (1995)|
+-----------+---+--------------------+--------------------+----------------+
only showing top 2 rows



In [163]:
# Access global with same session
spark.sql("SELECT * FROM global_temp.df1").show(2)
spark.newSession().sql("SELECT * FROM global_temp.df2").show(2)

+-----------+---+--------------------+--------------------+----------------+
|       date| id|                link|              rating|           title|
+-----------+---+--------------------+--------------------+----------------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|
|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|GoldenEye (1995)|
+-----------+---+--------------------+--------------------+----------------+
only showing top 2 rows

+---+----------------+----------+--------------------+
| id|           title|      date|              rating|
+---+----------------+----------+--------------------+
|  1|Toy Story (1995)|1995-01-01|[0, 0, 0, 1, 1, 1...|
|  2|GoldenEye (1995)|1995-01-01|[0, 1, 1, 0, 0, 0...|
+---+----------------+----------+--------------------+
only showing top 2 rows



In [164]:
# Access local with same session
spark.sql("SELECT * FROM df3").show(3)

# Access local with new session
spark.newSession().sql("SELECT * FROM df3").show(3)

+---+------------------+----------+--------------------+
| id|        title_year|      date|              rating|
+---+------------------+----------+--------------------+
|  1| [Toy Story, 1995]|1995-01-01|[0, 0, 0, 1, 1, 1...|
|  2| [GoldenEye, 1995]|1995-01-01|[0, 1, 1, 0, 0, 0...|
|  3|[Four Rooms, 1995]|1995-01-01|[0, 0, 0, 0, 0, 0...|
+---+------------------+----------+--------------------+
only showing top 3 rows



AnalysisException: 'Table or view not found: df3; line 1 pos 14'

# Write & Save to Files

1. "errorifexists" or "error" is default, if data exists
2. "append" data is added to new file with or without coalesce
3. "overwrite" overwrites the data
4. "ignore" data is not written even if data is new, old remains intact
---
1. partitionBy divides data into folders by the partition names as partitionBy col, but partitionBy col is not saved to file. used by save and saveAsTable 
2. bucketBy with saveAsTable
3. sortBy with saveAsTable, must be used together with bucketBy; but first bucket it then sort it

In [169]:
df1.select("id", "title").write.mode('error').save("save/dfdemo1.parquet")

In [66]:
df1.coalesce(1).select("id", "title","date").write.mode("append").save("save/dfdemo2.json",format="json")

In [184]:
df3.select(col("title_year.year").alias("year"),col("title_year.title").alias("title")).coalesce(1).write.format("json").mode(
    "overwrite").partitionBy("year").save("save/dfdemo4.json")

In [179]:
df4.coalesce(1).write.mode("ignore").format("csv").save("save/dfdemo5.csv")
df4.coalesce(1).write.mode("ignore").option("header", "true").format("csv").option("header", "true").save("save/dfdemo6.csv")
df4.coalesce(1).write.option("header", "true").mode("ignore").option("header", "true").format("csv").save("save/dfdemo7.csv")

In [192]:
df3.select(col("title_year.year").alias("year"),col("title_year.title").alias("title")).coalesce(1
    ).write.bucketBy(5, "year").mode("overwrite").saveAsTable("yearwise_bucked")
spark.sql("SELECT * FROM yearwise_bucked").show(100)

+----+--------------------+
|year|               title|
+----+--------------------+
|1967|       Belle de jour|
|1992|             Aladdin|
|1991|Terminator 2: Jud...|
|1991|Silence of the La...|
|1992|            Supercop|
|1968|2001: A Space Ody...|
|1992|  Lawnmower Man, The|
|1987|       Dirty Dancing|
|1992|      Reservoir Dogs|
|1992|      Basic Instinct|
|1992| Glengarry Glen Ross|
|1980|    Private Benjamin|
|1991|        Delicatessen|
|1980|Empire Strikes Ba...|
|1987| Princess Bride, The|
|1980| Blues Brothers, The|
|1987|   Full Metal Jacket|
|1992|    Grand Day Out, A|
|1980|         Raging Bull|
|1967|       Graduate, The|
|1980|        Shining, The|
|1987|        Evil Dead II|
|1992|          Unforgiven|
|1992|Bram Stoker's Dra...|
|1991|           Cape Fear|
|1991|Star Trek VI: The...|
|1992|      Batman Returns|
|1992|         Under Siege|
|1987|     Raising Arizona|
|1992|            Sneakers|
|1992|Last of the Mohic...|
|1997|       Jungle2Jungle|
|1997|Smilla's Sense

In [194]:
df3.select(col("title_year.year").alias("year"),col("title_year.title").alias("title")).coalesce(1
    ).write.bucketBy(5, "year").sortBy("title").mode("overwrite").saveAsTable("title_sorted")
spark.sql("SELECT * FROM title_sorted").show(100)

+----+--------------------+
|year|               title|
+----+--------------------+
|1997|  'Til There Was You|
|1997|                 187|
|1968|2001: A Space Ody...|
|1963|               8 1/2|
|1997|8 Heads in a Duff...|
|1997|      Absolute Power|
|1997|    Addicted to Love|
|1997|           Afterglow|
|1997|             Air Bud|
|1997|       Air Force One|
|1992|             Aladdin|
|1992|             Alien 3|
|1997| Alien: Resurrection|
|1997|         All Over Me|
|1997|             Amistad|
|1992|Amityville 1992: ...|
|1997|            Anaconda|
|1997|           Anastasia|
|1997|       Anna Karenina|
|1997|        Apostle, The|
|1997|  As Good As It Gets|
|1997|     Assignment, The|
|1997|Austin Powers: In...|
|1997|Ayn Rand: A Sense...|
|1997|             B*A*P*S|
|1987|           Bad Taste|
|1968|          Barbarella|
|1992|      Basic Instinct|
|1997|      Batman & Robin|
|1992|      Batman Returns|
|1997|                Bean|
|1997|Beautician and th...|
|1991|Beauty and the

In [17]:
# from database
# restart jupyter if it doesn't work
user='root';pw='24081999'
table_name='baz1' 
# creates new table, doesn't write to same table without overwrite
# table schema must be same as DF schema
# appends with same id's, if MySQL official schema in db is not UNIQUE

# mysql://[user[:password]@][netloc][:port][/dbname][?param1=value1&...]
url="jdbc:mysql://localhost:3306/oneqshopv1"
properties = {
    "user": user,
    "password": pw
}
df_db1.write.option("driver", "com.mysql.jdbc.Driver").mode("append").jdbc(url=url,table=table_name,properties=properties)
# df_db1.show()

In [268]:
# invalid, saves as parquet without format definition
df1.select("id", "title").write.save("save/dfdemo3.json")

# Joins

In [74]:
df7 = spark.createDataFrame( [('a',1), ('b',2), ('c',3)], ('x1', 'x2') )
print(df7.show())

df8 = spark.createDataFrame( [('a',True), ('b',False), ('d',True)], ('x1', 'x3') )
print(df8.show())

+---+---+
| x1| x2|
+---+---+
|  a|  1|
|  b|  2|
|  c|  3|
+---+---+

None
+---+-----+
| x1|   x3|
+---+-----+
|  a| true|
|  b|false|
|  d| true|
+---+-----+

None


## Mutating Joins

In [196]:
# Left
df7.join(df8, "x1", how="left").orderBy("x1",ascending=True).show()

+---+---+-----+
| x1| x2|   x3|
+---+---+-----+
|  a|  1| true|
|  b|  2|false|
|  c|  3| null|
+---+---+-----+



In [197]:
# Right
df7.join(df8, "x1", how="right").orderBy("x1",ascending=True).show()

+---+----+-----+
| x1|  x2|   x3|
+---+----+-----+
|  a|   1| true|
|  b|   2|false|
|  d|null| true|
+---+----+-----+



In [198]:
# inner
df7.join(df8, "x1", how="inner").orderBy("x1",ascending=True).show()

+---+---+-----+
| x1| x2|   x3|
+---+---+-----+
|  a|  1| true|
|  b|  2|false|
+---+---+-----+



In [199]:
# full
df7.join(df8, "x1", how="full").orderBy("x1",ascending=True).show()

+---+----+-----+
| x1|  x2|   x3|
+---+----+-----+
|  a|   1| true|
|  b|   2|false|
|  c|   3| null|
|  d|null| true|
+---+----+-----+



## Filtering Joins

In [201]:
# left_semi
df7.join(df8, "x1", how="left_semi").orderBy("x1",ascending=True).show()

+---+---+
| x1| x2|
+---+---+
|  a|  1|
|  b|  2|
+---+---+



In [200]:
# left_anti
df7.join(df8, "x1", how="left_anti").orderBy("x1",ascending=True).show()

+---+---+
| x1| x2|
+---+---+
|  c|  3|
+---+---+



# Dataframe Operations

In [40]:
df9 = spark.createDataFrame( [('a',1), ('b',2), ('c',3)], ('x1', 'x2') )
print(df9.show())

df10 = spark.createDataFrame( [('b',2), ('c',3), ('d',4)], ('x1', 'x2') )
print(df10.show())

df16 = spark.createDataFrame( [(2,'b'), (3,'c'), (4,'d')], ('x2', 'x1') )
print(df16.show())

+---+---+
| x1| x2|
+---+---+
|  a|  1|
|  b|  2|
|  c|  3|
+---+---+

None
+---+---+
| x1| x2|
+---+---+
|  b|  2|
|  c|  3|
|  d|  4|
+---+---+

None
+---+---+
| x2| x1|
+---+---+
|  2|  b|
|  3|  c|
|  4|  d|
+---+---+

None


In [42]:
df9.intersect(df10).show()

+---+---+
| x1| x2|
+---+---+
|  c|  3|
|  b|  2|
+---+---+



In [204]:
df9.union(df10).show()

+---+---+
| x1| x2|
+---+---+
|  a|  1|
|  b|  2|
|  c|  3|
|  b|  2|
|  c|  3|
|  d|  4|
+---+---+



In [45]:
df9.union(df16).show()

+---+---+
|x1 |x2 |
+---+---+
|a  |1  |
|b  |2  |
|c  |3  |
|2  |b  |
|3  |c  |
|4  |d  |
+---+---+



In [41]:
df9.unionByName(df16).show()

+---+---+
| x1| x2|
+---+---+
|  a|  1|
|  b|  2|
|  c|  3|
|  b|  2|
|  c|  3|
|  d|  4|
+---+---+



In [205]:
df9.subtract(df10).show()

+---+---+
| x1| x2|
+---+---+
|  a|  1|
+---+---+



In [91]:
print(df14.corr("x3","x4")) # int and float
print(df14.stat.corr("x3","y1")) # int and int

-0.4513586212383007
0.3333333333333333


In [92]:
print(df14.cov("x3","x4")) # int and float
print(df14.stat.cov("x3","y1")) # int and int

-0.37999999999999984
0.2


In [94]:
# count no of times those were in same row
print(df14.crosstab("x3","x4").show()) # int and float
print(df14.stat.crosstab("x3","y1").show()) # int and int

+-----+---+---+---+---+---+---+
|x3_x4|1.4|1.5|2.1|2.4|2.5|3.5|
+-----+---+---+---+---+---+---+
|    2|  1|  0|  1|  0|  0|  1|
|    4|  0|  1|  0|  0|  0|  0|
|    1|  0|  0|  0|  1|  1|  0|
+-----+---+---+---+---+---+---+

None
+-----+---+---+
|x3_y1|  0|  1|
+-----+---+---+
|    2|  2|  1|
|    4|  0|  1|
|    1|  1|  1|
+-----+---+---+

None


In [206]:
df11 = spark.createDataFrame( [('a',"1,2,3"), ('b',"2,3,4")], ('x1', 'x2') )
df11.show()

+---+-----+
| x1|   x2|
+---+-----+
|  a|1,2,3|
|  b|2,3,4|
+---+-----+



In [207]:
df11_1=df11.select("x1",F.split("x2",",").alias("values"))
df11_1.printSchema()
df11_1.show()

root
 |-- x1: string (nullable = true)
 |-- values: array (nullable = true)
 |    |-- element: string (containsNull = true)

+---+---------+
| x1|   values|
+---+---------+
|  a|[1, 2, 3]|
|  b|[2, 3, 4]|
+---+---------+



In [210]:
df11_2=df11.select("x1",F.split("x2",",").alias("values"),
                  F.posexplode(F.split("x2",",")).alias("pos","val"))
df11_2.printSchema()
df11_2.show()

root
 |-- x1: string (nullable = true)
 |-- values: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pos: integer (nullable = false)
 |-- val: string (nullable = true)

+---+---------+---+---+
| x1|   values|pos|val|
+---+---------+---+---+
|  a|[1, 2, 3]|  0|  1|
|  a|[1, 2, 3]|  1|  2|
|  a|[1, 2, 3]|  2|  3|
|  b|[2, 3, 4]|  0|  2|
|  b|[2, 3, 4]|  1|  3|
|  b|[2, 3, 4]|  2|  4|
+---+---------+---+---+



In [211]:
df11_3=df11.select("x1",F.split("x2",",").alias("values"),
        F.posexplode(F.split("x2",",")).alias("pos","val")
        ).drop("val").select("x1",F.expr("values[pos]").alias("val"))
df11_3.printSchema()
df11_3.show()

root
 |-- x1: string (nullable = true)
 |-- val: string (nullable = true)

+---+---+
| x1|val|
+---+---+
|  a|  1|
|  a|  2|
|  a|  3|
|  b|  2|
|  b|  3|
|  b|  4|
+---+---+



# Windows

In [212]:
df12 = spark.createDataFrame( [('a','m',1), ('b','m',2), ('c','n',3),
                              ('d','n',6)], ('A', 'B', 'C') )
df12.show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  a|  m|  1|
|  b|  m|  2|
|  c|  n|  3|
|  d|  n|  6|
+---+---+---+



In [218]:
# Define windows for difference
w=Window.partitionBy(df12.B) # partition by B col
D = df12.C - F.min(df12.C).over(w) # find min on basis of partition by B col
df12.withColumn('D',D).show()
D = df12.C - F.max(df12.C).over(w) # find max on basis of partition by B col
df12.withColumn('D',D).show()

+---+---+---+---+
|  A|  B|  C|  D|
+---+---+---+---+
|  a|  m|  1|  0|
|  b|  m|  2|  1|
|  c|  n|  3|  0|
|  d|  n|  6|  3|
+---+---+---+---+

+---+---+---+---+
|  A|  B|  C|  D|
+---+---+---+---+
|  a|  m|  1| -1|
|  b|  m|  2|  0|
|  c|  n|  3| -3|
|  d|  n|  6|  0|
+---+---+---+---+



In [215]:
# Define windows for row_num
df12_1=df12.withColumn("D",F.monotonically_increasing_id())
df12_1.show()
w1=Window.orderBy(df12_1.D) # partition by D col
df12_1.withColumn("D",F.row_number().over(w1)).show()

+---+---+---+-----------+
|  A|  B|  C|          D|
+---+---+---+-----------+
|  a|  m|  1|17179869184|
|  b|  m|  2|42949672960|
|  c|  n|  3|68719476736|
|  d|  n|  6|94489280512|
+---+---+---+-----------+

+---+---+---+---+
|  A|  B|  C|  D|
+---+---+---+---+
|  a|  m|  1|  1|
|  b|  m|  2|  2|
|  c|  n|  3|  3|
|  d|  n|  6|  4|
+---+---+---+---+



In [217]:
# Define windows for rank
w2=Window.partitionBy("B").orderBy(df12.C.desc())
df12.withColumn("D",F.rank().over(w2)).show()

+---+---+---+---+
|  A|  B|  C|  D|
+---+---+---+---+
|  b|  m|  2|  1|
|  a|  m|  1|  2|
|  d|  n|  6|  1|
|  c|  n|  3|  2|
+---+---+---+---+



# custom functions

In [12]:
# count non-null values
def my_count(df):
    return df.agg(*[F.count(c).alias(c) for c in df.columns])
my_count(df4).show()

+---+---------+--------+
| id|firstName|lastName|
+---+---------+--------+
|  3|        5|       5|
+---+---------+--------+



In [14]:
def my_sum(df):
    return df.agg(*[F.sum(c).alias(c) for c in df.columns])
my_sum(df4).show()

+---+---------+--------+
| id|firstName|lastName|
+---+---------+--------+
|  7|     null|    null|
+---+---------+--------+



# UDF

In [90]:
def complexF(x,y):
    return [x*i for i in y]
    
Fn=F.udf(lambda x,y: complexF(x,y),ArrayType(IntegerType()))
df1.withColumn('2col',Fn(df1.id,df1.rating)).select('id','2col').show(5,truncate=False)

+---+---------------------------------------------------------+
|id |2col                                                     |
+---+---------------------------------------------------------+
|1  |[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]|
|2  |[0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0]|
|3  |[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0]|
|4  |[0, 4, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]|
|5  |[0, 0, 0, 0, 0, 0, 5, 0, 5, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0]|
+---+---------------------------------------------------------+
only showing top 5 rows



In [89]:
def complexF(x,y):
    return [x*i for i in y]
    
Fn=F.udf(complexF, ArrayType(IntegerType()))
df1.withColumn('2col',Fn(df1.id,df1.rating)).select('id','2col').show(5,truncate=False)

+---+---------------------------------------------------------+
|id |2col                                                     |
+---+---------------------------------------------------------+
|1  |[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]|
|2  |[0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0]|
|3  |[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0]|
|4  |[0, 4, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]|
|5  |[0, 0, 0, 0, 0, 0, 5, 0, 5, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0]|
+---+---------------------------------------------------------+
only showing top 5 rows



In [92]:
def merge_two_columns(col1, col2):
    return([str(col1), float(col2)])
df1_struct_type = StructType([
    StructField('f1', StringType()),
    StructField('f2', FloatType())
])
df1_merge_two_columns_udf = F.udf(merge_two_columns, returnType=df1_struct_type)
df1.select(df1_merge_two_columns_udf(df1.title, df1.id)).limit(2).toPandas()

Unnamed: 0,"merge_two_columns(title, id)"
0,"(Toy Story (1995), 1.0)"
1,"(GoldenEye (1995), 2.0)"


# date

In [27]:
df17=df1.withColumn("date",to_date(col("date"),"dd-MMM-yyyy"))
df17.show(2)

+----------+---+--------------------+--------------------+----------------+
|      date| id|                link|              rating|           title|
+----------+---+--------------------+--------------------+----------------+
|1995-01-01|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|Toy Story (1995)|
|1995-01-01|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|GoldenEye (1995)|
+----------+---+--------------------+--------------------+----------------+
only showing top 2 rows



In [55]:
df21=df1.withColumn("date",to_date(col("date"),"dd-MMM-yyyy")).filter(
    col("date")==F.lit('1995-10-30'))
df21.show(2)

+----------+---+--------------------+--------------------+--------------------+
|      date| id|                link|              rating|               title|
+----------+---+--------------------+--------------------+--------------------+
|1995-10-30| 13|http://us.imdb.co...|[0, 0, 0, 0, 0, 1...|Mighty Aphrodite ...|
+----------+---+--------------------+--------------------+--------------------+



In [56]:
df21.select(F.month('date'),F.year('date')).show(2)

+-----------+----------+
|month(date)|year(date)|
+-----------+----------+
|         10|      1995|
+-----------+----------+



In [57]:
df21.select(F.dayofmonth('date'),F.dayofweek('date'),F.dayofyear('date')).show(2)

+----------------+---------------+---------------+
|dayofmonth(date)|dayofweek(date)|dayofyear(date)|
+----------------+---------------+---------------+
|              30|              2|            303|
+----------------+---------------+---------------+



In [38]:
df17.select(F.min(col('date')),F.max(col('date'))).show()
df17.select(F.date_sub(F.min(col('date')),3),F.date_add(F.max(col('date')),3)).show()

+----------+----------+
| min(date)| max(date)|
+----------+----------+
|1922-01-01|1998-10-23|
+----------+----------+

+----------------------+----------------------+
|date_sub(min(date), 3)|date_add(max(date), 3)|
+----------------------+----------------------+
|            1921-12-29|            1998-10-26|
+----------------------+----------------------+



In [39]:
df18=spark.createDataFrame([('2019-12-25 13:30:00',)],['Christmas'])
df18.show()
df18.select(F.to_date(col('Christmas'),'yyyy-MM-dd HH:mm:ss').alias('to_date'),
           F.to_timestamp(col('Christmas'),'yyyy-MM-dd HH:mm:ss').alias('to_timestamp')).show()

+-------------------+
|          Christmas|
+-------------------+
|2019-12-25 13:30:00|
+-------------------+

+----------+-------------------+
|   to_date|       to_timestamp|
+----------+-------------------+
|2019-12-25|2019-12-25 13:30:00|
+----------+-------------------+



In [41]:
df19=spark.createDataFrame([('25/Dec/2019 13:30:00',)],['Christmas'])
df19.show()
df19.select(F.to_date(col('Christmas'),'dd/MMM/yyyy HH:mm:ss').alias('to_date'),
           F.to_timestamp(col('Christmas'),'dd/MMM/yyyy HH:mm:ss').alias('to_timestamp')).show()
df18.select(F.to_date(col('Christmas'),'dd/MMM/yyyy HH:mm:ss').alias('to_date'),
           F.to_timestamp(col('Christmas'),'dd/MMM/yyyy HH:mm:ss').alias('to_timestamp')).show()

+--------------------+
|           Christmas|
+--------------------+
|25/Dec/2019 13:30:00|
+--------------------+

+----------+-------------------+
|   to_date|       to_timestamp|
+----------+-------------------+
|2019-12-25|2019-12-25 13:30:00|
+----------+-------------------+

+-------+------------+
|to_date|to_timestamp|
+-------+------------+
|   null|        null|
+-------+------------+



In [43]:
df20=spark.createDataFrame([('12/25/2019 01:30:00 PM',)],['Christmas'])
df20.show(truncate=False)
df20.select(F.to_date(col('Christmas'),'MM/dd/yyyy hh:mm:ss aa').alias('to_date'),
           F.to_timestamp(col('Christmas'),'MM/dd/yyyy hh:mm:ss aa').alias('to_timestamp')).show()


+----------------------+
|Christmas             |
+----------------------+
|12/25/2019 01:30:00 PM|
+----------------------+

+----------+-------------------+
|   to_date|       to_timestamp|
+----------+-------------------+
|2019-12-25|2019-12-25 13:30:00|
+----------+-------------------+



In [65]:
df1.withColumn("t",F.split("title"," ")[0]).show()

+-----------+---+--------------------+--------------------+--------------------+---------+
|       date| id|                link|              rating|               title|        t|
+-----------+---+--------------------+--------------------+--------------------+---------+
|01-Jan-1995|  1|http://us.imdb.co...|[0, 0, 0, 1, 1, 1...|    Toy Story (1995)|      Toy|
|01-Jan-1995|  2|http://us.imdb.co...|[0, 1, 1, 0, 0, 0...|    GoldenEye (1995)|GoldenEye|
|01-Jan-1995|  3|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|   Four Rooms (1995)|     Four|
|01-Jan-1995|  4|http://us.imdb.co...|[0, 1, 0, 0, 0, 1...|   Get Shorty (1995)|      Get|
|01-Jan-1995|  5|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|      Copycat (1995)|  Copycat|
|01-Jan-1995|  6|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|Shanghai Triad (Y...| Shanghai|
|01-Jan-1995|  7|http://us.imdb.co...|[0, 0, 0, 0, 0, 0...|Twelve Monkeys (1...|   Twelve|
|01-Jan-1995|  8|http://us.imdb.co...|[0, 0, 0, 0, 1, 1...|         Babe (1995)|     Babe|