# PySpark RDD Examples on Film dataset

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession 

#### PySpark environment settings

In [3]:
pyspark = SparkSession.builder \
.master("local[4]")\
.appName("RDDExamples")\
.config("spark.executer.memory","4g")\
.config("spark.driver.memory","4g")\
.getOrCreate()

In [4]:
sc = pyspark.sparkContext

In [5]:
film_data = sc.textFile("data/film_data.csv")

In [6]:
film_data.take(10)

['Name,Genre,Length,Score,Country,Year,Budget',
 'stand by Me,Adventure,89,8.1,USA,1986,8000000',
 "ferris Bueller's Day Off,Comedy,103,7.8,USA,1986,6000000",
 'Top Gun,Action,110,6.9,USA,1986,15000000',
 'Aliens,Action,137,8.4,USA,1986,18500000',
 'Flight of the Navigator,Adventure,90,6.9,USA,1986,9000000',
 'Platoon,Drama,120,8.1,UK,1986,6000000',
 'Labyrinth,Adventure,101,7.4,UK,1986,25000000',
 'Blue Velvet,Drama,120,7.8,USA,1986,6000000',
 'Pretty in Pink,Comedy,96,6.8,USA,1986,9000000']

#### map() function by using upper()

In [7]:
film_data.map(lambda x: x.upper()).take(10)

['NAME,GENRE,LENGTH,SCORE,COUNTRY,YEAR,BUDGET',
 'STAND BY ME,ADVENTURE,89,8.1,USA,1986,8000000',
 "FERRIS BUELLER'S DAY OFF,COMEDY,103,7.8,USA,1986,6000000",
 'TOP GUN,ACTION,110,6.9,USA,1986,15000000',
 'ALIENS,ACTION,137,8.4,USA,1986,18500000',
 'FLIGHT OF THE NAVIGATOR,ADVENTURE,90,6.9,USA,1986,9000000',
 'PLATOON,DRAMA,120,8.1,UK,1986,6000000',
 'LABYRINTH,ADVENTURE,101,7.4,UK,1986,25000000',
 'BLUE VELVET,DRAMA,120,7.8,USA,1986,6000000',
 'PRETTY IN PINK,COMEDY,96,6.8,USA,1986,9000000']

#### map() function by using lower()

In [8]:
film_data.map(lambda x: x.lower()).take(10)

['name,genre,length,score,country,year,budget',
 'stand by me,adventure,89,8.1,usa,1986,8000000',
 "ferris bueller's day off,comedy,103,7.8,usa,1986,6000000",
 'top gun,action,110,6.9,usa,1986,15000000',
 'aliens,action,137,8.4,usa,1986,18500000',
 'flight of the navigator,adventure,90,6.9,usa,1986,9000000',
 'platoon,drama,120,8.1,uk,1986,6000000',
 'labyrinth,adventure,101,7.4,uk,1986,25000000',
 'blue velvet,drama,120,7.8,usa,1986,6000000',
 'pretty in pink,comedy,96,6.8,usa,1986,9000000']

#### flatMap() function by using upper()

In [9]:
film_data.flatMap(lambda x: x.upper()).take(10)

['N', 'A', 'M', 'E', ',', 'G', 'E', 'N', 'R', 'E']

#### flatMap() function by using lower()

In [10]:
film_data.flatMap(lambda x: x.lower()).take(10)

['n', 'a', 'm', 'e', ',', 'g', 'e', 'n', 'r', 'e']

#### Removing columns row

In [11]:
film_data = film_data.filter(lambda x: "Name" not in x)

In [12]:
film_data.take(10)

['stand by Me,Adventure,89,8.1,USA,1986,8000000',
 "ferris Bueller's Day Off,Comedy,103,7.8,USA,1986,6000000",
 'Top Gun,Action,110,6.9,USA,1986,15000000',
 'Aliens,Action,137,8.4,USA,1986,18500000',
 'Flight of the Navigator,Adventure,90,6.9,USA,1986,9000000',
 'Platoon,Drama,120,8.1,UK,1986,6000000',
 'Labyrinth,Adventure,101,7.4,UK,1986,25000000',
 'Blue Velvet,Drama,120,7.8,USA,1986,6000000',
 'Pretty in Pink,Comedy,96,6.8,USA,1986,9000000',
 'The Fly,Drama,96,7.5,USA,1986,15000000']

#### Splitting element by using flatMap()

In [13]:
flatMap = film_data.flatMap(lambda x: x.split(","))
flatMap.take(10)

['stand by Me',
 'Adventure',
 '89',
 '8.1',
 'USA',
 '1986',
 '8000000',
 "ferris Bueller's Day Off",
 'Comedy',
 '103']

#### Splitting element by using map()

In [14]:
mapData = film_data.map(lambda x: x.split(","))
mapData.take(10)

[['stand by Me', 'Adventure', '89', '8.1', 'USA', '1986', '8000000'],
 ["ferris Bueller's Day Off",
  'Comedy',
  '103',
  '7.8',
  'USA',
  '1986',
  '6000000'],
 ['Top Gun', 'Action', '110', '6.9', 'USA', '1986', '15000000'],
 ['Aliens', 'Action', '137', '8.4', 'USA', '1986', '18500000'],
 ['Flight of the Navigator',
  'Adventure',
  '90',
  '6.9',
  'USA',
  '1986',
  '9000000'],
 ['Platoon', 'Drama', '120', '8.1', 'UK', '1986', '6000000'],
 ['Labyrinth', 'Adventure', '101', '7.4', 'UK', '1986', '25000000'],
 ['Blue Velvet', 'Drama', '120', '7.8', 'USA', '1986', '6000000'],
 ['Pretty in Pink', 'Comedy', '96', '6.8', 'USA', '1986', '9000000'],
 ['The Fly', 'Drama', '96', '7.5', 'USA', '1986', '15000000']]

#### Mapping elements from 0 to 6 in tuple type

In [15]:
mapData = film_data.map(lambda x: x.split(",")).map(lambda y: (y[0],y[1],y[2],y[3],y[4],y[5],y[6]))
mapData.take(10)

[('stand by Me', 'Adventure', '89', '8.1', 'USA', '1986', '8000000'),
 ("ferris Bueller's Day Off",
  'Comedy',
  '103',
  '7.8',
  'USA',
  '1986',
  '6000000'),
 ('Top Gun', 'Action', '110', '6.9', 'USA', '1986', '15000000'),
 ('Aliens', 'Action', '137', '8.4', 'USA', '1986', '18500000'),
 ('Flight of the Navigator',
  'Adventure',
  '90',
  '6.9',
  'USA',
  '1986',
  '9000000'),
 ('Platoon', 'Drama', '120', '8.1', 'UK', '1986', '6000000'),
 ('Labyrinth', 'Adventure', '101', '7.4', 'UK', '1986', '25000000'),
 ('Blue Velvet', 'Drama', '120', '7.8', 'USA', '1986', '6000000'),
 ('Pretty in Pink', 'Comedy', '96', '6.8', 'USA', '1986', '9000000'),
 ('The Fly', 'Drama', '96', '7.5', 'USA', '1986', '15000000')]

#### Highest 10 film budget

In [16]:
budgetData = film_data.map(lambda x: x.split(",")).map(lambda y: (int(y[6]),y[0]))
budgetData.sortByKey(False).take(10)

[(40000000, 'Piratas'),
 (35000000, 'Howard the Duck'),
 (25000000, 'Labyrinth'),
 (25000000, 'Big Trouble in Little China'),
 (25000000, 'Little Shop of Horrors'),
 (25000000, 'Star Trek IV: The Voyage Home'),
 (25000000, 'The Golden Child'),
 (25000000, '�Three Amigos!'),
 (25000000, 'Cobra'),
 (25000000, 'The Mosquito Coast')]

#### The most highest scored films

In [17]:
highScore = film_data.map(lambda x: x.split(",")).map(lambda y: (float(y[3]),y[0]))
highScore.sortByKey(False).take(10)

[(8.4, 'Aliens'),
 (8.1, 'stand by Me'),
 (8.1, 'Platoon'),
 (8.1, 'Sacrifice'),
 (8.0, 'Hannah and Her Sisters'),
 (7.8, "ferris Bueller's Day Off"),
 (7.8, 'Blue Velvet'),
 (7.8, 'Down by Law'),
 (7.8, 'When the Wind Blows'),
 (7.6, 'Hoosiers')]

#### The longest films by type of minute

In [18]:
length = film_data.map(lambda x: x.split(",")).map(lambda y: (float(y[2]),y[0]))
length.sortByKey(False).take(10)

[(149.0, 'Sacrifice'),
 (137.0, 'Aliens'),
 (130.0, 'Heartbreak Ridge'),
 (125.0, 'The Mission'),
 (125.0, 'The Delta Force'),
 (121.0, 'Piratas'),
 (120.0, 'Platoon'),
 (120.0, 'Blue Velvet'),
 (120.0, 'Manhunter'),
 (120.0, 'Betty Blue')]

#### Pandas and RDD DataFrame

In [19]:
import pandas as pd 
df = pd.read_csv("data/film_data.csv",encoding="ISO-8859-1")
df.head()

Unnamed: 0,Name,Genre,Length,Score,Country,Year,Budget
0,stand by Me,Adventure,89,8.1,USA,1986,8000000
1,ferris Bueller's Day Off,Comedy,103,7.8,USA,1986,6000000
2,Top Gun,Action,110,6.9,USA,1986,15000000
3,Aliens,Action,137,8.4,USA,1986,18500000
4,Flight of the Navigator,Adventure,90,6.9,USA,1986,9000000


In [20]:
pandasRdd = pyspark.createDataFrame(df)

In [21]:
pandasRdd.show(5)

+--------------------+---------+------+-----+-------+----+--------+
|                Name|    Genre|Length|Score|Country|Year|  Budget|
+--------------------+---------+------+-----+-------+----+--------+
|         stand by Me|Adventure|    89|  8.1|    USA|1986| 8000000|
|ferris Bueller's ...|   Comedy|   103|  7.8|    USA|1986| 6000000|
|             Top Gun|   Action|   110|  6.9|    USA|1986|15000000|
|              Aliens|   Action|   137|  8.4|    USA|1986|18500000|
|Flight of the Nav...|Adventure|    90|  6.9|    USA|1986| 9000000|
+--------------------+---------+------+-----+-------+----+--------+
only showing top 5 rows



In [22]:
pandasRdd.take(5)

[Row(Name='stand by Me', Genre='Adventure', Length=89, Score=8.1, Country='USA', Year='1986', Budget=8000000),
 Row(Name="ferris Bueller's Day Off", Genre='Comedy', Length=103, Score=7.8, Country='USA', Year='1986', Budget=6000000),
 Row(Name='Top Gun', Genre='Action', Length=110, Score=6.9, Country='USA', Year='1986', Budget=15000000),
 Row(Name='Aliens', Genre='Action', Length=137, Score=8.4, Country='USA', Year='1986', Budget=18500000),
 Row(Name='Flight of the Navigator', Genre='Adventure', Length=90, Score=6.9, Country='USA', Year='1986', Budget=9000000)]

In [23]:
pandasRdd.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Length: long (nullable = true)
 |-- Score: double (nullable = true)
 |-- Country: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Budget: long (nullable = true)



#### Summarize the dataset (count, mean, standard deviation, min, max)

In [24]:
pandasRdd.describe().show()

+-------+--------------+--------+------------------+------------------+-------+--------------------+-----------------+
|summary|          Name|   Genre|            Length|             Score|Country|                Year|           Budget|
+-------+--------------+--------+------------------+------------------+-------+--------------------+-----------------+
|  count|            99|      99|                99|                99|     99|                  99|               99|
|   mean|          null|    null|             103.0|  6.46969696969697|   null|              1986.0|  8515010.1010101|
| stddev|          null|    null|13.659115219201741|0.9129962497916303|   null|                 0.0|9285823.428473907|
|    min|    52 Pick-Up|  Action|                74|               3.8|  Films|                1986|                0|
|    max|¡Three Amigos!|Thriller|               149|               8.4|    USA|Savage Steve Holland|         40000000|
+-------+--------------+--------+---------------