In [1]:
# Bootstrap step, run before any pyspark import in the cells below.
# NOTE(review): findspark.init() is expected to locate the local Spark install
# and make pyspark importable -- confirm against the findspark docs.
import findspark
findspark.init()

In [None]:
# Stop a session left over from an earlier run of this notebook, if there is one.
# On a fresh kernel `spark` is not defined yet (it is created in a later cell),
# so the call is guarded to keep "Restart & Run All" from failing with NameError.
if "spark" in globals():
    spark.stop()

In [3]:
from pyspark.sql import SparkSession

# Build the session named "SparkDF" (or reuse a running one) and keep a handle
# on its SparkContext for the RDD examples.
spark = (
    SparkSession.builder
    .appName("SparkDF")
    .getOrCreate()
)
sc = spark.sparkContext

# Creating DataFrames (from RDDs)

In [12]:
# Read the MovieLens item file as an RDD with one string per line; the fields
# inside each line are separated by "|".
# NOTE(review): a title shown in the describe() output further down contains
# replacement characters, which suggests this file is not UTF-8 -- confirm the
# source encoding.
rdd1=sc.textFile("data/ml-100k/u.item")
# Peek at the first three raw lines.
rdd1.take(3)

['1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0',
 '2|GoldenEye (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?GoldenEye%20(1995)|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0',
 '3|Four Rooms (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0']

In [53]:
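# One-off export, left disabled: writes df1 (built in the "way-1" cell below) as a single-partition parquet output.
# NOTE(review): presumably this produced the item.parquet read later on -- confirm the path before re-enabling.
# df1.coalesce(1).write.save("item.parquet",format="parquet")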

In [13]:
# Break every raw line into its "|"-separated fields.
rdd1_split = rdd1.map(lambda line: line.split("|"))

In [23]:
from pyspark.sql import Row

# Keep only the columns of interest from each split line and wrap them in a Row:
# field 0 -> id (as int), 1 -> title, 2 -> date, 4 -> link.
rdd1_data = rdd1_split.map(
    lambda fields: Row(
        id=int(fields[0]),
        title=fields[1],
        date=fields[2],
        link=fields[4],
    )
)
print(rdd1_data.take(3))

[Row(date='01-Jan-1995', id=1, link='http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', title='Toy Story (1995)'), Row(date='01-Jan-1995', id=2, link='http://us.imdb.com/M/title-exact?GoldenEye%20(1995)', title='GoldenEye (1995)'), Row(date='01-Jan-1995', id=3, link='http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)', title='Four Rooms (1995)')]


In [46]:
# way-1: let Spark infer the schema from the Row objects
df1=spark.createDataFrame(rdd1_data)
# printSchema() and show() write to stdout themselves and return None, so they
# are called directly; wrapping them in print() only adds a stray "None" line.
df1.printSchema()
df1.show(15)

root
 |-- date: string (nullable = true)
 |-- id: long (nullable = true)
 |-- link: string (nullable = true)
 |-- title: string (nullable = true)

None
+-----------+---+--------------------+--------------------+
|       date| id|                link|               title|
+-----------+---+--------------------+--------------------+
|01-Jan-1995|  1|http://us.imdb.co...|    Toy Story (1995)|
|01-Jan-1995|  2|http://us.imdb.co...|    GoldenEye (1995)|
|01-Jan-1995|  3|http://us.imdb.co...|   Four Rooms (1995)|
|01-Jan-1995|  4|http://us.imdb.co...|   Get Shorty (1995)|
|01-Jan-1995|  5|http://us.imdb.co...|      Copycat (1995)|
|01-Jan-1995|  6|http://us.imdb.co...|Shanghai Triad (Y...|
|01-Jan-1995|  7|http://us.imdb.co...|Twelve Monkeys (1...|
|01-Jan-1995|  8|http://us.imdb.co...|         Babe (1995)|
|01-Jan-1995|  9|http://us.imdb.co...|Dead Man Walking ...|
|22-Jan-1996| 10|http://us.imdb.co...|  Richard III (1995)|
|01-Jan-1995| 11|http://us.imdb.co...|Seven (Se7en) (1995)|
|14-Aug-

In [44]:
# way-2: supply an explicit schema instead of relying on inference
from pyspark.sql.types import StructField,StructType,IntegerType,StringType,TimestampType
from pyspark.sql.functions import to_date,col


schema2=StructType([
    StructField("id",IntegerType(),True),
    StructField("title",StringType(),True),
    StructField("date",StringType(),True),
])

# rdd1_data rows carry four fields (id, title, date, link) while schema2 declares
# three. Older PySpark matched keyword-built Rows to the schema by field name;
# newer releases verify Rows positionally and reject the length mismatch. So
# project each Row onto exactly the schema's fields, in schema order.
_schema2_fields = schema2.fieldNames()
_rdd1_for_schema2 = rdd1_data.map(
    lambda row: tuple(row[name] for name in _schema2_fields)
)

# "d" (not "dd") so single-digit days such as "4-Feb-1971" also parse under the
# stricter Spark 3 datetime parser; two-digit days are still accepted.
df2=spark.createDataFrame(_rdd1_for_schema2,schema2) \
    .withColumn("date",to_date(col("date"),"d-MMM-yyyy"))
df2.show(15)

+---+--------------------+----------+
| id|               title|      date|
+---+--------------------+----------+
|  1|    Toy Story (1995)|1995-01-01|
|  2|    GoldenEye (1995)|1995-01-01|
|  3|   Four Rooms (1995)|1995-01-01|
|  4|   Get Shorty (1995)|1995-01-01|
|  5|      Copycat (1995)|1995-01-01|
|  6|Shanghai Triad (Y...|1995-01-01|
|  7|Twelve Monkeys (1...|1995-01-01|
|  8|         Babe (1995)|1995-01-01|
|  9|Dead Man Walking ...|1995-01-01|
| 10|  Richard III (1995)|1996-01-22|
| 11|Seven (Se7en) (1995)|1995-01-01|
| 12|Usual Suspects, T...|1995-08-14|
| 13|Mighty Aphrodite ...|1995-10-30|
| 14|  Postino, Il (1994)|1994-01-01|
| 15|Mr. Holland's Opu...|1996-01-29|
+---+--------------------+----------+
only showing top 15 rows



In [88]:
# way-3: build a small DataFrame straight from in-memory tuples plus column names
_people_rows = [
    (1, "Mark", "Brown"),
    (2, "Tom", "Anderson"),
    (3, "Joshua", "Peterson"),
]
_people_columns = ('id', 'firstName', 'lastName')

spark_df = spark.createDataFrame(_people_rows, _people_columns)
spark_df.show()

+---+---------+--------+
| id|firstName|lastName|
+---+---------+--------+
|  1|     Mark|   Brown|
|  2|      Tom|Anderson|
|  3|   Joshua|Peterson|
+---+---------+--------+



# Creating DataFrames (from Spark data sources)

In [52]:
# Equivalent ways to read a JSON source: the format is named either through the
# json() shortcut, the format= keyword of load(), or format("json") on the reader.
spark.read.json("data/ml-100k/item.json").show(3)
spark.read.load("data/ml-100k/item.json", format="json").show(3)
spark.read.format("json").load("data/ml-100k/item.json").show(3)
spark.read.format("json").json("data/ml-100k/item.json").show(3)

#  Invalid ways
# These are kept as demonstrations, so each failure is caught and reported
# instead of aborting the cell (and any "Run All").
try:
    # load() with no format falls back to the session's default data source
    # (parquet unless spark.sql.sources.default was changed), so a JSON file
    # is not read as JSON here.
    spark.read.load("data/ml-100k/item.json").show(3)
except Exception as exc:
    print("load() without format failed:", type(exc).__name__)

try:
    # format() belongs to the reader (spark.read); load() has already returned
    # a DataFrame, which has no format() method.
    spark.read.load("data/ml-100k/item.json").format("json").show(3)
except Exception as exc:
    print("format() after load() failed:", type(exc).__name__)

+-----------+---+--------------------+-----------------+
|       date| id|                link|            title|
+-----------+---+--------------------+-----------------+
|01-Jan-1995|  1|http://us.imdb.co...| Toy Story (1995)|
|01-Jan-1995|  2|http://us.imdb.co...| GoldenEye (1995)|
|01-Jan-1995|  3|http://us.imdb.co...|Four Rooms (1995)|
+-----------+---+--------------------+-----------------+
only showing top 3 rows



In [54]:
# Equivalent ways to read a parquet source. The last form names no format at
# all and relies on the session's default data source being parquet
# (spark.sql.sources.default, unless it was reconfigured).
spark.read.parquet("data/ml-100k/item.parquet").show(3)
spark.read.load("data/ml-100k/item.parquet", format="parquet").show(3)
spark.read.format("parquet").load("data/ml-100k/item.parquet").show(3)
spark.read.format("parquet").parquet("data/ml-100k/item.parquet").show(3)
spark.read.load("data/ml-100k/item.parquet").show(3)

#  Invalid ways
# Kept as a demonstration: format() belongs to the reader (spark.read), while
# load() has already returned a DataFrame, so the attribute lookup fails. The
# error is caught and reported so the cell does not abort a "Run All".
try:
    spark.read.load("data/ml-100k/item.parquet").format("parquet").show(3)
except AttributeError as exc:
    print("format() after load() failed:", type(exc).__name__)

+-----------+---+--------------------+-----------------+
|       date| id|                link|            title|
+-----------+---+--------------------+-----------------+
|01-Jan-1995|  1|http://us.imdb.co...| Toy Story (1995)|
|01-Jan-1995|  2|http://us.imdb.co...| GoldenEye (1995)|
|01-Jan-1995|  3|http://us.imdb.co...|Four Rooms (1995)|
+-----------+---+--------------------+-----------------+
only showing top 3 rows



In [61]:
# Plain-text source: each line of the file becomes one row in a single "value" column.
_sample_lines_df = spark.read.text("data/textFiles/sample1.txt")
_sample_lines_df.show(3)

+--------------------+
|               value|
+--------------------+
|Utilitatis causa ...|
|Lorem ipsum dolor...|
|                    |
+--------------------+
only showing top 3 rows



# Inspect Data

In [64]:
# Yields a list of (column name, type name) pairs, e.g. ('id', 'bigint').
df1.dtypes # Return df column names and data types

[('date', 'string'), ('id', 'bigint'), ('link', 'string'), ('title', 'string')]

In [92]:
# Print the first 2 rows one field per line (vertical=True), cutting long
# values short (truncate=True).
df1.show(2,vertical=True,truncate=True)

-RECORD 0---------------------
 date  | 01-Jan-1995          
 id    | 1                    
 link  | http://us.imdb.co... 
 title | Toy Story (1995)     
-RECORD 1---------------------
 date  | 01-Jan-1995          
 id    | 2                    
 link  | http://us.imdb.co... 
 title | GoldenEye (1995)     
only showing top 2 rows



In [70]:
df1.head(2) # Return the first n rows as a list of Row objects

[Row(date='01-Jan-1995', id=1, link='http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', title='Toy Story (1995)'),
 Row(date='01-Jan-1995', id=2, link='http://us.imdb.com/M/title-exact?GoldenEye%20(1995)', title='GoldenEye (1995)')]

In [71]:
df1.head() # Return the first row as a single Row (not a list) when n is omitted

Row(date='01-Jan-1995', id=1, link='http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', title='Toy Story (1995)')

In [66]:
df1.first() # Return the first row as a single Row

Row(date='01-Jan-1995', id=1, link='http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', title='Toy Story (1995)')

In [72]:
df1.take(2) # Return the first n rows as a list of Row objects

[Row(date='01-Jan-1995', id=1, link='http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', title='Toy Story (1995)'),
 Row(date='01-Jan-1995', id=2, link='http://us.imdb.com/M/title-exact?GoldenEye%20(1995)', title='GoldenEye (1995)')]

In [73]:
df1.schema # Return the schema of df as a StructType object

StructType(List(StructField(date,StringType,true),StructField(id,LongType,true),StructField(link,StringType,true),StructField(title,StringType,true)))

In [75]:
df1.printSchema() # Print the schema of df as a tree (prints to stdout; the call itself returns None)

root
 |-- date: string (nullable = true)
 |-- id: long (nullable = true)
 |-- link: string (nullable = true)
 |-- title: string (nullable = true)



In [95]:
# describe() yields one row per statistic (count, mean, stddev, min, max);
# mean/stddev come back null for the string columns, as the output shows.
df1.describe().show(vertical=True) # Compute summary statistics

-RECORD 0-----------------------
 summary | count                
 date    | 1682                 
 id      | 1682                 
 link    | 1682                 
 title   | 1682                 
-RECORD 1-----------------------
 summary | mean                 
 date    | null                 
 id      | 841.5                
 link    | null                 
 title   | null                 
-RECORD 2-----------------------
 summary | stddev               
 date    | null                 
 id      | 485.69589250888254   
 link    | null                 
 title   | null                 
-RECORD 3-----------------------
 summary | min                  
 date    |                      
 id      | 1                    
 link    |                      
 title   | 'Til There Was Yo... 
-RECORD 4-----------------------
 summary | max                  
 date    | 4-Feb-1971           
 id      | 1682                 
 link    | http://us.imdb.co... 
 title   | � k�ldum klaka (C... 



In [96]:
df1.columns # Return the column names of df as a list of strings

['date', 'id', 'link', 'title']

In [97]:
df1.count() # Count the number of rows in df

1682

In [98]:
# Equal to df1.count() here, i.e. no row is an exact duplicate of another.
df1.distinct().count() # Count the number of distinct rows in df

1682

In [99]:
# With no arguments only the physical plan is printed (see the output below).
# NOTE(review): explain(True) should add the logical plans as well -- confirm.
df1.explain() # Print the physical plan

== Physical Plan ==
Scan ExistingRDD[date#368,id#369L,link#370,title#371]


# Duplicate Values

In [100]:
# dropDuplicates() with no column subset compares entire rows.
df1_dup = df1.dropDuplicates()
# Same count as df1 (1682), so nothing was removed.
df1_dup.count() 

1682