### Juggling Data with Spark

#### Importing required libraries

In [1]:
# Import PySpark
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
import pandas as pd

# creating spark context
sc = SparkContext.getOrCreate();
spark = SparkSession(sc)

22/09/21 08:19:53 WARN Utils: Your hostname, DESKTOP-1KI6I9N resolves to a loopback address: 127.0.1.1; using 192.168.1.10 instead (on interface eth0)
22/09/21 08:19:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/21 08:19:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
sc

#### Importing data from csv file

In [3]:
url_github = 'https://raw.githubusercontent.com/fvgm-spec/csv_files/master/IMDB-Movie-Data.csv'

In [4]:
## Importing csv file as Spark DataFrame
pd_df = pd.read_csv(url_github)
spark_df = spark.createDataFrame(pd_df)
spark_df.limit(5).show()

                                                                                

+----+--------------------+--------------------+--------------------+--------------------+--------------------+----+-----------------+------+------+------------------+---------+
|Rank|               Title|               Genre|         Description|            Director|              Actors|Year|Runtime (Minutes)|Rating| Votes|Revenue (Millions)|Metascore|
+----+--------------------+--------------------+--------------------+--------------------+--------------------+----+-----------------+------+------+------------------+---------+
|   1|Guardians of the ...|Action,Adventure,...|A group of interg...|          James Gunn|Chris Pratt, Vin ...|2014|              121|   8.1|757074|            333.13|     76.0|
|   2|          Prometheus|Adventure,Mystery...|Following clues t...|        Ridley Scott|Noomi Rapace, Log...|2012|              124|   7.0|485820|            126.46|     65.0|
|   3|               Split|     Horror,Thriller|Three girls are k...|  M. Night Shyamalan|James McAvoy, Any...

In [5]:
## Importing csv file as Spark DataFrame
pd_df.head(5)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


#### Using PySpark functions over spark_df

In [6]:
##Displaying first 5 rows of spark_df
spark_df.show(5)

+----+--------------------+--------------------+--------------------+--------------------+--------------------+----+-----------------+------+------+------------------+---------+
|Rank|               Title|               Genre|         Description|            Director|              Actors|Year|Runtime (Minutes)|Rating| Votes|Revenue (Millions)|Metascore|
+----+--------------------+--------------------+--------------------+--------------------+--------------------+----+-----------------+------+------+------------------+---------+
|   1|Guardians of the ...|Action,Adventure,...|A group of interg...|          James Gunn|Chris Pratt, Vin ...|2014|              121|   8.1|757074|            333.13|     76.0|
|   2|          Prometheus|Adventure,Mystery...|Following clues t...|        Ridley Scott|Noomi Rapace, Log...|2012|              124|   7.0|485820|            126.46|     65.0|
|   3|               Split|     Horror,Thriller|Three girls are k...|  M. Night Shyamalan|James McAvoy, Any...

In [7]:
##Displaying columns from spark_df
spark_df.columns

['Rank',
 'Title',
 'Genre',
 'Description',
 'Director',
 'Actors',
 'Year',
 'Runtime (Minutes)',
 'Rating',
 'Votes',
 'Revenue (Millions)',
 'Metascore']

In [30]:
spark_df.count()

1000

In [8]:
##Describing spark_df
spark_df.select("Year","Runtime (Minutes)","Rating","Votes","Revenue (Millions)","Metascore").describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+---------+
|summary|              Year| Runtime (Minutes)|            Rating|             Votes|Revenue (Millions)|Metascore|
+-------+------------------+------------------+------------------+------------------+------------------+---------+
|  count|              1000|              1000|              1000|              1000|              1000|     1000|
|   mean|          2012.783|           113.172| 6.723199999999999|        169808.255|               NaN|      NaN|
| stddev|3.2059615077521775|18.810908172288393|0.9454287892779631|188762.64751822077|               NaN|      NaN|
|    min|              2006|                66|               1.9|                61|               0.0|     11.0|
|    max|              2016|               191|               9.0|           1791916|               NaN|      NaN|
+-------+------------------+------------------+------------------+--------------

In [9]:
##Displaying schema of the spark_df
spark_df.printSchema()

root
 |-- Rank: long (nullable = true)
 |-- Title: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Director: string (nullable = true)
 |-- Actors: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- Runtime (Minutes): long (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Votes: long (nullable = true)
 |-- Revenue (Millions): double (nullable = true)
 |-- Metascore: double (nullable = true)



In [10]:
##Displaying spark_df vertically
spark_df.show(1, vertical=True)

-RECORD 0----------------------------------
 Rank               | 1                    
 Title              | Guardians of the ... 
 Genre              | Action,Adventure,... 
 Description        | A group of interg... 
 Director           | James Gunn           
 Actors             | Chris Pratt, Vin ... 
 Year               | 2014                 
 Runtime (Minutes)  | 121                  
 Rating             | 8.1                  
 Votes              | 757074               
 Revenue (Millions) | 333.13               
 Metascore          | 76.0                 
only showing top 1 row



In [11]:
spark_df.printSchema()

root
 |-- Rank: long (nullable = true)
 |-- Title: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Director: string (nullable = true)
 |-- Actors: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- Runtime (Minutes): long (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Votes: long (nullable = true)
 |-- Revenue (Millions): double (nullable = true)
 |-- Metascore: double (nullable = true)



In [12]:
from functions.pandas_udf import vertical_df

vertical_df(pd_df)

#### Using UDF Pandas

In [23]:
from pyspark.sql.functions import pandas_udf

@pandas_udf("string")  # Datatype: string
def show_col(series: pd.Series) -> pd.Series:
    return series

In [24]:
spark_df.select(show_col(spark_df.Title)).show()

[Stage 13:>                                                         (0 + 1) / 1]

+--------------------+
|     show_col(Title)|
+--------------------+
|Guardians of the ...|
|          Prometheus|
|               Split|
|                Sing|
|       Suicide Squad|
|      The Great Wall|
|          La La Land|
|            Mindhorn|
|  The Lost City of Z|
|          Passengers|
|Fantastic Beasts ...|
|      Hidden Figures|
|           Rogue One|
|               Moana|
|            Colossal|
|The Secret Life o...|
|       Hacksaw Ridge|
|        Jason Bourne|
|                Lion|
|             Arrival|
+--------------------+
only showing top 20 rows



                                                                                

#### Grouping data

In [17]:
## Here we group the average Rating by each of the Genres in the dataset
## from 1000 rows in the dataframe, it gets 20 Genres with it avg Rating
spark_df.groupby('Genre').avg('Rating').show()



+--------------------+------------------+
|               Genre|       avg(Rating)|
+--------------------+------------------+
|Action,Adventure,...| 6.329629629629629|
|Adventure,Family,...|               6.7|
|       Comedy,Family| 6.300000000000001|
|       Action,Comedy| 6.366666666666666|
|Adventure,Comedy,...|            7.4125|
|    Mystery,Thriller| 6.685714285714285|
|Action,Comedy,Mys...|               7.9|
|Drama,Mystery,Rom...| 7.383333333333333|
|Action,Drama,Mystery| 6.699999999999999|
|Crime,Drama,Thriller| 6.858333333333333|
|  Action,Drama,Sport|7.3999999999999995|
|        Thriller,War|               6.0|
|Adventure,Drama,R...|               6.8|
|Adventure,Drama,S...|               8.3|
|Drama,Romance,Sci-Fi|             6.975|
|Drama,Mystery,Sci-Fi| 7.716666666666666|
|       Action,Sci-Fi|               7.3|
|     Action,Thriller| 6.577777777777778|
|               Drama| 6.935416666666666|
|   Drama,Fantasy,War|               8.2|
+--------------------+------------

                                                                                

#### Working with SparkSQL

In [31]:
spark_df.createOrReplaceTempView("mySQLtable")
spark.sql("SELECT count(*) from mySQLtable").show()

+--------+
|count(1)|
+--------+
|    1000|
+--------+



In [42]:
spark.sql("SELECT count(*) from mySQLtable where Rating > 7.7").show()

+--------+
|count(1)|
+--------+
|     141|
+--------+

