# DataFrames and Spark SQL applied exercise

In [1]:
# findspark.init() is called before pyspark is imported; keep this order.
# NOTE(review): presumably needed so the local Spark installation becomes
# importable in this environment — confirm.
import findspark
findspark.init()

import pandas as pd  # NOTE(review): not referenced in the cells visible here
import pyspark

In [2]:
from pyspark.sql import SparkSession

# Build the session used by every later cell: local master, app name
# 'PySpark_Df'. getOrCreate() reuses an already-running session if one exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('PySpark_Df')
    .getOrCreate()
)

In [5]:
## Import the csv of "data/WorldCupPlayers.csv"
## Visualize the data
# The path is relative to the notebook's working directory (as stated in the
# exercise prompt) so the notebook is not tied to one machine's drive layout.
WORLD_CUP_PLAYERS_CSV = "data/WorldCupPlayers.csv"

# header=True takes the column names from the first row. No schema inference
# is requested, so every column is loaded as a string.
df = spark.read.csv(WORLD_CUP_PLAYERS_CSV, sep=",", header=True)
df.show()

+-------+-------+-------------+-------------------+-------+------------+-----------------+--------+---------+
|RoundID|MatchID|Team Initials|         Coach Name|Line-up|Shirt Number|      Player Name|Position|    Event|
+-------+-------+-------------+-------------------+-------+------------+-----------------+--------+---------+
|    201|   1096|          FRA|CAUDRON Raoul (FRA)|      S|           0|      Alex THEPOT|      GK|     NULL|
|    201|   1096|          MEX|   LUQUE Juan (MEX)|      S|           0|  Oscar BONFIGLIO|      GK|     NULL|
|    201|   1096|          FRA|CAUDRON Raoul (FRA)|      S|           0| Marcel LANGILLER|    NULL|     G40'|
|    201|   1096|          MEX|   LUQUE Juan (MEX)|      S|           0|     Juan CARRENO|    NULL|     G70'|
|    201|   1096|          FRA|CAUDRON Raoul (FRA)|      S|           0|  Ernest LIBERATI|    NULL|     NULL|
|    201|   1096|          MEX|   LUQUE Juan (MEX)|      S|           0|     Rafael GARZA|       C|     NULL|
|    201| 

In [6]:
## What type of data does each variable contain?
# printSchema() prints one line per column: name, data type and nullability.
df.printSchema()

root
 |-- RoundID: string (nullable = true)
 |-- MatchID: string (nullable = true)
 |-- Team Initials: string (nullable = true)
 |-- Coach Name: string (nullable = true)
 |-- Line-up: string (nullable = true)
 |-- Shirt Number: string (nullable = true)
 |-- Player Name: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Event: string (nullable = true)



In [32]:
# Convert MatchID from string to integer so that later comparisons and
# sorting on it are numeric. Re-running this cell is harmless: casting an
# integer column to integer leaves it unchanged.
from pyspark.sql.types import IntegerType

match_id_as_int = df["MatchID"].cast(IntegerType())
df = df.withColumn("MatchID", match_id_as_int)
df.printSchema()

root
 |-- RoundID: string (nullable = true)
 |-- MatchID: integer (nullable = true)
 |-- Team Initials: string (nullable = true)
 |-- Coach Name: string (nullable = true)
 |-- Line-up: string (nullable = true)
 |-- Shirt Number: string (nullable = true)
 |-- Player Name: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Event: string (nullable = true)



In [7]:
## How many records are there?
# count() returns the number of rows in the DataFrame.
df.count()

37784

In [9]:
## Get the main statistics of Position
# Position is categorical, so it is summarised as a frequency table:
# one row per distinct value (NULL included) with its number of records.
position_counts = df.groupBy("Position").count()
position_counts.show()

+--------+-----+
|Position|count|
+--------+-----+
|    NULL|33641|
|      GK| 2441|
|       C| 1510|
|     GKC|  192|
+--------+-----+



In [10]:
## Select and display records, omitting 'Player Name' and 'Coach Name'
# drop() returns a new DataFrame without the listed columns and keeps the
# remaining columns in their original order; df itself is not modified.
df.drop('Player Name', 'Coach Name').show()

+-------+-------+-------------+-------+------------+--------+---------+
|RoundID|MatchID|Team Initials|Line-up|Shirt Number|Position|    Event|
+-------+-------+-------------+-------+------------+--------+---------+
|    201|   1096|          FRA|      S|           0|      GK|     NULL|
|    201|   1096|          MEX|      S|           0|      GK|     NULL|
|    201|   1096|          FRA|      S|           0|    NULL|     G40'|
|    201|   1096|          MEX|      S|           0|    NULL|     G70'|
|    201|   1096|          FRA|      S|           0|    NULL|     NULL|
|    201|   1096|          MEX|      S|           0|       C|     NULL|
|    201|   1096|          FRA|      S|           0|    NULL|G43' G87'|
|    201|   1096|          MEX|      S|           0|    NULL|     NULL|
|    201|   1096|          FRA|      S|           0|    NULL|     NULL|
|    201|   1096|          MEX|      S|           0|    NULL|     NULL|
|    201|   1096|          FRA|      S|           0|    NULL|   

In [13]:
## How many matches have been played with the ID of 1096?
# Each row of df is one record, so this counts the rows whose MatchID is
# 1096 (it is a row count, not a count of distinct matches).
is_match_1096 = df["MatchID"] == 1096
df.where(is_match_1096).count()

33

In [15]:
## Show the data where the position has been C and the event is G40
# Both conditions must hold. The Event value includes the trailing
# apostrophe exactly as it is stored in the data ("G40'").
is_position_c = df["Position"] == 'C'
is_event_g40 = df["Event"] == "G40'"
df.filter(is_position_c & is_event_g40).show()

+-------+-------+-------------+--------------------+-------+------------+----------------+--------+-----+
|RoundID|MatchID|Team Initials|          Coach Name|Line-up|Shirt Number|     Player Name|Position|Event|
+-------+-------+-------------+--------------------+-------+------------+----------------+--------+-----+
|    201|   1089|          PAR|DURAND LAGUNA Jos...|      S|           0|Luis VARGAS PENA|       C| G40'|
|    429|   1175|          HUN|  DIETZ Karoly (HUN)|      S|           0|   Gyorgy SAROSI|       C| G40'|
+-------+-------+-------------+--------------------+-------+------------+----------------+--------+-----+



In [33]:
# Order the DataFrame by MatchID in descending order (highest MatchID first).
# NOTE(review): the original comment said "ascending" while the code sorted
# descending; the comment now matches the code. Pass ascending=True instead
# if ascending order is what was actually wanted.
df.orderBy("MatchID", ascending=False).show()

+-------+---------+-------------+--------------------+-------+------------+-----------+--------+-----+
|RoundID|  MatchID|Team Initials|          Coach Name|Line-up|Shirt Number|Player Name|Position|Event|
+-------+---------+-------------+--------------------+-------+------------+-----------+--------+-----+
| 255931|300186515|          ECU|RUEDA Reinaldo (COL)|      S|          22|  DOMINGUEZ|      GK| NULL|
| 255931|300186515|          ECU|RUEDA Reinaldo (COL)|      S|          16|A. VALENCIA|       C| R50'|
| 255931|300186515|          FRA|DESCHAMPS Didier ...|      S|           1|     LLORIS|     GKC| NULL|
| 255931|300186515|          ECU|RUEDA Reinaldo (COL)|      S|           2|     GUAGUA|    NULL| NULL|
| 255931|300186515|          FRA|DESCHAMPS Didier ...|      S|           5|      SAKHO|    NULL| O61'|
| 255931|300186515|          ECU|RUEDA Reinaldo (COL)|      S|           3|      ERAZO|    NULL| Y83'|
| 255931|300186515|          FRA|DESCHAMPS Didier ...|      S|          1

In [38]:
## Use Spark SQL to display records where the MatchID is greater or equal to 20
# NOTE(review): the exercise statement above says 20 and "display", but this
# cell filters on 2000 and returns the number of matching rows — confirm
# which threshold and which output are intended.
MIN_MATCH_ID = 2000

# Register the DataFrame as a temporary view so SQL can refer to it as "df".
df.createOrReplaceTempView("df")
spark.sql(f"SELECT * FROM df WHERE MatchID >= {MIN_MATCH_ID}").count()



20166