# Search and Filter DataFrames

Exploring DataFrames exercise

In [1]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SearchAndFilter").getOrCreate()
spark

## Load Data

In [2]:
fifa = spark.read.csv("fifa19.csv", inferSchema=True, header=True)
fifa.limit(5).toPandas()

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68,15,21,13,90,85,87,88,94,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88,68,58,51,15,13,5,10,13,€196.4M


In [3]:
fifa.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Photo: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Flag: string (nullable = true)
 |-- Overall: integer (nullable = true)
 |-- Potential: integer (nullable = true)
 |-- Club: string (nullable = true)
 |-- Club Logo: string (nullable = true)
 |-- Value: string (nullable = true)
 |-- Wage: string (nullable = true)
 |-- Special: integer (nullable = true)
 |-- Preferred Foot: string (nullable = true)
 |-- International Reputation: integer (nullable = true)
 |-- Weak Foot: integer (nullable = true)
 |-- Skill Moves: integer (nullable = true)
 |-- Work Rate: string (nullable = true)
 |-- Body Type: string (nullable = true)
 |-- Real Face: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Jersey Number: integer (nullable = true)
 |-- Joined: string (nullable = true)
 |-- Loaned From: string (nu

## Query data

In [4]:
from pyspark.sql.functions import *

In [5]:
fifa.select(["Name", "Nationality", "Age", "Photo"]).show(n=5, truncate=False)

+-----------------+-----------+---+----------------------------------------------+
|Name             |Nationality|Age|Photo                                         |
+-----------------+-----------+---+----------------------------------------------+
|L. Messi         |Argentina  |31 |https://cdn.sofifa.org/players/4/19/158023.png|
|Cristiano Ronaldo|Portugal   |33 |https://cdn.sofifa.org/players/4/19/20801.png |
|Neymar Jr        |Brazil     |26 |https://cdn.sofifa.org/players/4/19/190871.png|
|De Gea           |Spain      |27 |https://cdn.sofifa.org/players/4/19/193080.png|
|K. De Bruyne     |Belgium    |27 |https://cdn.sofifa.org/players/4/19/192985.png|
+-----------------+-----------+---+----------------------------------------------+
only showing top 5 rows



In [6]:
fifa.select(["Name", "Nationality", "Age", "Photo"]).orderBy(fifa["Age"]).show(5)

+--------------+-----------+---+--------------------+
|          Name|Nationality|Age|               Photo|
+--------------+-----------+---+--------------------+
|     Y. Roemer|Netherlands| 16|https://cdn.sofif...|
|   J. Kitolano|     Norway| 16|https://cdn.sofif...|
|   Y. Begraoui|     France| 16|https://cdn.sofif...|
|Y. Verschaeren|    Belgium| 16|https://cdn.sofif...|
|      J. Lahne|     Sweden| 16|https://cdn.sofif...|
+--------------+-----------+---+--------------------+
only showing top 5 rows



In [7]:
fifa.select(["Name", "Nationality", "Age", "Photo"]).orderBy(fifa["Age"].desc()).show(5)

+-------------+-----------------+---+--------------------+
|         Name|      Nationality|Age|               Photo|
+-------------+-----------------+---+--------------------+
|     O. Pérez|           Mexico| 45|https://cdn.sofif...|
|K. Pilkington|          England| 44|https://cdn.sofif...|
|    T. Warner|Trinidad & Tobago| 44|https://cdn.sofif...|
|  S. Narazaki|            Japan| 42|https://cdn.sofif...|
|     M. Tyler|          England| 41|https://cdn.sofif...|
+-------------+-----------------+---+--------------------+
only showing top 5 rows



In [8]:
fifa.select(["*"]).where(fifa.Club.like("%América de Cali%")).limit(5).toPandas()

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,3356,227886,D. Buitrago,27,https://cdn.sofifa.org/players/4/19/227886.png,Colombia,https://cdn.sofifa.org/flags/56.png,72,73,América de Cali,...,68,13,22,26,8,14,6,6,12,€5.1M
1,4346,214392,J. Cuero,30,https://cdn.sofifa.org/players/4/19/214392.png,Colombia,https://cdn.sofifa.org/flags/56.png,71,71,América de Cali,...,66,48,32,35,9,15,12,7,10,€3M
2,4630,176605,P. Armero,31,https://cdn.sofifa.org/players/4/19/176605.png,Colombia,https://cdn.sofifa.org/flags/56.png,71,71,América de Cali,...,72,58,69,68,7,11,15,7,6,€2.1M
3,4921,211499,H. Quiñones,26,https://cdn.sofifa.org/players/4/19/211499.png,Colombia,https://cdn.sofifa.org/flags/56.png,70,71,América de Cali,...,69,65,66,64,10,10,15,11,13,€2.4M
4,5203,212351,P. Franco,27,https://cdn.sofifa.org/players/4/19/212351.png,Colombia,https://cdn.sofifa.org/flags/56.png,70,72,América de Cali,...,69,63,68,71,7,14,6,12,10,


In [9]:
fifa.select("Photo", fifa.Photo.substr(-4, 4)).show(n=5, truncate=False)

+----------------------------------------------+-----------------------+
|Photo                                         |substring(Photo, -4, 4)|
+----------------------------------------------+-----------------------+
|https://cdn.sofifa.org/players/4/19/158023.png|.png                   |
|https://cdn.sofifa.org/players/4/19/20801.png |.png                   |
|https://cdn.sofifa.org/players/4/19/190871.png|.png                   |
|https://cdn.sofifa.org/players/4/19/193080.png|.png                   |
|https://cdn.sofifa.org/players/4/19/192985.png|.png                   |
+----------------------------------------------+-----------------------+
only showing top 5 rows



In [10]:
fifa[fifa.Club.isin("Barcelona", "Juventus")].limit(4).toPandas()

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
1,15,211110,P. Dybala,24,https://cdn.sofifa.org/players/4/19/211110.png,Argentina,https://cdn.sofifa.org/flags/52.png,89,94,Juventus,...,84,23,20,20,5,4,4,5,8,€153.5M
2,24,138956,G. Chiellini,33,https://cdn.sofifa.org/players/4/19/138956.png,Italy,https://cdn.sofifa.org/flags/27.png,89,89,Juventus,...,84,93,93,90,3,3,2,4,3,€44.6M
3,64,191043,Alex Sandro,27,https://cdn.sofifa.org/players/4/19/191043.png,Brazil,https://cdn.sofifa.org/flags/54.png,86,86,Juventus,...,82,81,84,84,7,7,9,12,5,€60.2M


In [11]:
fifa.select("Name", "Club").where(fifa.Name.startswith("L")).where(fifa.Name.endswith("i")).show(5)
# fifa.select(["Name", "Club"]).where(fifa["Name"].startswith("L")).where(fifa["Name"].endswith("i")).show(5)

+-------------+---------------+
|         Name|           Club|
+-------------+---------------+
|     L. Messi|   FC Barcelona|
|   L. Bonucci|       Juventus|
| L. Fabiański|West Ham United|
|L. Pellegrini|           Roma|
| L. Pavoletti|       Cagliari|
+-------------+---------------+
only showing top 5 rows



In [12]:
fifa.count()

18207

In [13]:
df = fifa.limit(100)
df.count()

100

## Slicing

In [14]:
col_list = fifa.columns[0:5]
df3 = fifa.select(col_list)
df3.show()

+---+------+-----------------+---+--------------------+
|_c0|    ID|             Name|Age|               Photo|
+---+------+-----------------+---+--------------------+
|  0|158023|         L. Messi| 31|https://cdn.sofif...|
|  1| 20801|Cristiano Ronaldo| 33|https://cdn.sofif...|
|  2|190871|        Neymar Jr| 26|https://cdn.sofif...|
|  3|193080|           De Gea| 27|https://cdn.sofif...|
|  4|192985|     K. De Bruyne| 27|https://cdn.sofif...|
|  5|183277|        E. Hazard| 27|https://cdn.sofif...|
|  6|177003|        L. Modrić| 32|https://cdn.sofif...|
|  7|176580|        L. Suárez| 31|https://cdn.sofif...|
|  8|155862|     Sergio Ramos| 32|https://cdn.sofif...|
|  9|200389|         J. Oblak| 25|https://cdn.sofif...|
| 10|188545|   R. Lewandowski| 29|https://cdn.sofif...|
| 11|182521|         T. Kroos| 28|https://cdn.sofif...|
| 12|182493|         D. Godín| 32|https://cdn.sofif...|
| 13|168542|      David Silva| 32|https://cdn.sofif...|
| 14|215914|         N. Kanté| 27|https://cdn.so

In [15]:
len(df3.columns)

5

In [16]:
df = spark.createDataFrame([([1, 2, 3], ), ([4, 5], )], ["x"])
df.show()

+---------+
|        x|
+---------+
|[1, 2, 3]|
|   [4, 5]|
+---------+



In [17]:
df.select(slice(df.x, 2, 2).alias("Some Name")).show()

+---------+
|Some Name|
+---------+
|   [2, 3]|
|      [5]|
+---------+



## Filter

In [18]:
fifa.filter("Overall > 50").limit(5).toPandas()

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68,15,21,13,90,85,87,88,94,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88,68,58,51,15,13,5,10,13,€196.4M


In [19]:
fifa.filter("Overall > 50").select(["Name", "Age"]).limit(5).toPandas()

Unnamed: 0,Name,Age
0,L. Messi,31
1,Cristiano Ronaldo,33
2,Neymar Jr,26
3,De Gea,27
4,K. De Bruyne,27


In [20]:
result = fifa.filter("Overall > 50").select(["Nationality", "Name", "Age", "Overall"]).orderBy(fifa["Overall"].desc()).collect()
type(result[0])

pyspark.sql.types.Row

In [21]:
print("Best FIFA player is: {0} \n".format(result[0]["Name"]))
print("Worst FIFA player is: {0} \n".format(result[-1]["Name"]))

Best FIFA player is: L. Messi 

Worst FIFA player is: C. Addai 

