In [3]:
from pyspark.sql import SparkSession

In [4]:
# Obtain the Spark session (created on first use) that the following cells read data through.
spark = (
    SparkSession.builder
    .appName("Analyzing soccer players")
    .getOrCreate()
)

In [5]:
# Load the player CSV; the "header" option makes the first line supply the column names.
players = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("/home/demo/player.csv")
)

In [6]:
# Print the column names and types of the players DataFrame.
players.printSchema()

root
 |-- id: string (nullable = true)
 |-- player_api_id: string (nullable = true)
 |-- player_name: string (nullable = true)
 |-- player_fifa_api_id: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- height: string (nullable = true)
 |-- weight: string (nullable = true)



In [7]:
# Preview the first 10 rows of the players DataFrame.
players.show(10)

+---+-------------+------------------+------------------+-------------------+------+------+
| id|player_api_id|       player_name|player_fifa_api_id|           birthday|height|weight|
+---+-------------+------------------+------------------+-------------------+------+------+
|  1|       505942|Aaron Appindangoye|            218353|1992-02-29 00:00:00|182.88|   187|
|  2|       155782|   Aaron Cresswell|            189615|1989-12-15 00:00:00|170.18|   146|
|  3|       162549|       Aaron Doran|            186170|1991-05-13 00:00:00|170.18|   163|
|  4|        30572|     Aaron Galindo|            140161|1982-05-08 00:00:00|182.88|   198|
|  5|        23780|      Aaron Hughes|             17725|1979-11-08 00:00:00|182.88|   154|
|  6|        27316|        Aaron Hunt|            158138|1986-09-04 00:00:00|182.88|   161|
|  7|       564793|        Aaron Kuhl|            221280|1996-01-30 00:00:00|172.72|   146|
|  8|        30895|      Aaron Lennon|            152747|1987-04-16 00:00:00| 16

In [51]:
# Load the per-player attribute CSV; the first line supplies the column names.
players_attributes = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("/home/demo/player_attributes.csv")
)

In [52]:
# Print the column names and types of the attributes DataFrame.
players_attributes.printSchema()

root
 |-- id: string (nullable = true)
 |-- player_fifa_api_id: string (nullable = true)
 |-- player_api_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- overall_rating: string (nullable = true)
 |-- potential: string (nullable = true)
 |-- preferred_foot: string (nullable = true)
 |-- attacking_work_rate: string (nullable = true)
 |-- defensive_work_rate: string (nullable = true)
 |-- crossing: string (nullable = true)
 |-- finishing: string (nullable = true)
 |-- heading_accuracy: string (nullable = true)
 |-- short_passing: string (nullable = true)
 |-- volleys: string (nullable = true)
 |-- dribbling: string (nullable = true)
 |-- curve: string (nullable = true)
 |-- free_kick_accuracy: string (nullable = true)
 |-- long_passing: string (nullable = true)
 |-- ball_control: string (nullable = true)
 |-- acceleration: string (nullable = true)
 |-- sprint_speed: string (nullable = true)
 |-- agility: string (nullable = true)
 |-- reactions: string (nullable = true

In [10]:
# Row counts of both DataFrames, displayed as a (players, players_attributes) tuple.
players.count(),players_attributes.count()

(11060, 183978)

In [11]:
# Number of distinct player_api_id values present in the attributes data.
players_attributes.select("player_api_id").distinct().count()

11060

In [12]:
# List the column names of the players DataFrame.
players.columns

['id',
 'player_api_id',
 'player_name',
 'player_fifa_api_id',
 'birthday',
 'height',
 'weight']

In [13]:
# Remove the 'id' and 'player_fifa_api_id' columns, then list what is left.
# Note: this rebinds `players`, so re-running the loading cell restores the dropped columns.
players=players.drop('id','player_fifa_api_id')
players.columns

['player_api_id', 'player_name', 'birthday', 'height', 'weight']

In [14]:
# Drop rows with missing values from both DataFrames (dropna() with its default arguments).
# Both names are rebound in place; reloading either CSV afterwards undoes this step for it.
players=players.dropna()
players_attributes=players_attributes.dropna()

In [15]:
# Row counts of both DataFrames again, to compare with the counts taken before dropna().
players.count(),players_attributes.count()

(11060, 180354)

In [17]:
from pyspark.sql.functions import udf

In [18]:
# Preview the first 10 values of the date column.
players_attributes.select("date").show(10)

+-------------------+
|               date|
+-------------------+
|2016-02-18 00:00:00|
|2015-11-19 00:00:00|
|2015-09-21 00:00:00|
|2015-03-20 00:00:00|
|2007-02-22 00:00:00|
|2016-04-21 00:00:00|
|2016-04-07 00:00:00|
|2016-01-07 00:00:00|
|2015-12-24 00:00:00|
|2015-12-17 00:00:00|
+-------------------+
only showing top 10 rows



In [54]:
# Derive a "year" column from the text before the first '-' of each date string
# (the dates look like "2016-02-18 00:00:00").
# The lambda passes a missing date through as None: Spark hands a NULL to a
# Python UDF as None, and calling .split on None would raise and fail the job.
year_extract_udf=udf(lambda date: date.split('-')[0] if date is not None else None)
players_attributes=players_attributes.withColumn("year",year_extract_udf(players_attributes.date))

In [55]:
# List the column names of the attributes DataFrame.
players_attributes.columns

['id',
 'player_fifa_api_id',
 'player_api_id',
 'date',
 'overall_rating',
 'potential',
 'preferred_foot',
 'attacking_work_rate',
 'defensive_work_rate',
 'crossing',
 'finishing',
 'heading_accuracy',
 'short_passing',
 'volleys',
 'dribbling',
 'curve',
 'free_kick_accuracy',
 'long_passing',
 'ball_control',
 'acceleration',
 'sprint_speed',
 'agility',
 'reactions',
 'balance',
 'shot_power',
 'jumping',
 'stamina',
 'strength',
 'long_shots',
 'aggression',
 'interceptions',
 'positioning',
 'vision',
 'penalties',
 'marking',
 'standing_tackle',
 'sliding_tackle',
 'gk_diving',
 'gk_handling',
 'gk_kicking',
 'gk_positioning',
 'gk_reflexes',
 'year']

In [56]:
# Remove the 'date' column from the attributes DataFrame (rebinds the name in place).
players_attributes=players_attributes.drop('date')

In [57]:
# Preview the first 10 values of the year column.
players_attributes.select("year").show(10)

+----+
|year|
+----+
|2016|
|2015|
|2015|
|2015|
|2007|
|2016|
|2016|
|2016|
|2015|
|2015|
+----+
only showing top 10 rows



In [58]:
# Keep only the attribute rows whose year equals 2016.
# NOTE(review): year comes from a UDF with no declared return type (presumably a
# string column - confirm), so comparing with the integer 2016 relies on Spark's implicit cast.
pa_2016=players_attributes.filter(players_attributes.year==2016)

In [59]:
# Number of attribute rows in the 2016 subset.
pa_2016.count()

14103

In [25]:
# Number of distinct player_api_id values in the 2016 subset.
pa_2016.select(pa_2016.player_api_id).distinct().count()

5586

In [26]:
# Average finishing, shot_power and acceleration per player over the 2016 rows,
# using the {column: aggregate-name} dictionary form of agg().
pa_striker_2016 = (
    pa_2016
    .groupBy('player_api_id')
    .agg(dict.fromkeys(('finishing', 'shot_power', 'acceleration'), 'avg'))
)

In [27]:
# Row count of the aggregate: one row per player_api_id group.
pa_striker_2016.count()

5586

In [28]:
# Preview the first 10 rows of the per-player averages.
pa_striker_2016.show(10)

+-------------+-----------------+-----------------+---------------+
|player_api_id|   avg(finishing)|avg(acceleration)|avg(shot_power)|
+-------------+-----------------+-----------------+---------------+
|       309726|75.44444444444444|74.11111111111111|           76.0|
|        26112|             53.0|             51.0|           76.0|
|        38433|            68.25|             74.0|           74.0|
|       295060|             25.0|             62.0|           40.0|
|       161396|             29.0|             72.0|           69.0|
|        37774|             61.0|             64.0|           68.0|
|        41157|             81.0|             87.0|           80.0|
|        40740|             58.0|             73.5|           75.0|
|        31432|             14.0|             59.0|           65.0|
|       109653|             62.0|             65.0|           83.5|
+-------------+-----------------+-----------------+---------------+
only showing top 10 rows



In [29]:
# Look at a few rows as Row objects. take(10) brings back only ten rows;
# collect() would pull every aggregated row to the driver just to display them.
pa_striker_2016.take(10)

[Row(player_api_id='309726', avg(finishing)=75.44444444444444, avg(acceleration)=74.11111111111111, avg(shot_power)=76.0),
 Row(player_api_id='26112', avg(finishing)=53.0, avg(acceleration)=51.0, avg(shot_power)=76.0),
 Row(player_api_id='38433', avg(finishing)=68.25, avg(acceleration)=74.0, avg(shot_power)=74.0),
 Row(player_api_id='295060', avg(finishing)=25.0, avg(acceleration)=62.0, avg(shot_power)=40.0),
 Row(player_api_id='161396', avg(finishing)=29.0, avg(acceleration)=72.0, avg(shot_power)=69.0),
 Row(player_api_id='37774', avg(finishing)=61.0, avg(acceleration)=64.0, avg(shot_power)=68.0),
 Row(player_api_id='41157', avg(finishing)=81.0, avg(acceleration)=87.0, avg(shot_power)=80.0),
 Row(player_api_id='40740', avg(finishing)=58.0, avg(acceleration)=73.5, avg(shot_power)=75.0),
 Row(player_api_id='31432', avg(finishing)=14.0, avg(acceleration)=59.0, avg(shot_power)=65.0),
 Row(player_api_id='109653', avg(finishing)=62.0, avg(acceleration)=65.0, avg(shot_power)=83.5),
 Row(play

In [30]:
# Replace the generated "avg(<column>)" names with the plain attribute names.
for old_name, new_name in (
    ("avg(finishing)", "finishing"),
    ("avg(shot_power)", "shot_power"),
    ("avg(acceleration)", "acceleration"),
):
    pa_striker_2016 = pa_striker_2016.withColumnRenamed(old_name, new_name)

In [31]:
# Preview the first 10 rows of the per-player averages again.
pa_striker_2016.show(10)

+-------------+-----------------+-----------------+----------+
|player_api_id|        finishing|     acceleration|shot_power|
+-------------+-----------------+-----------------+----------+
|       309726|75.44444444444444|74.11111111111111|      76.0|
|        26112|             53.0|             51.0|      76.0|
|        38433|            68.25|             74.0|      74.0|
|       295060|             25.0|             62.0|      40.0|
|       161396|             29.0|             72.0|      69.0|
|        37774|             61.0|             64.0|      68.0|
|        41157|             81.0|             87.0|      80.0|
|        40740|             58.0|             73.5|      75.0|
|        31432|             14.0|             59.0|      65.0|
|       109653|             62.0|             65.0|      83.5|
+-------------+-----------------+-----------------+----------+
only showing top 10 rows



In [32]:
# Relative weight of each attribute in the striker grade; shot power counts double.
weight_finishing, weight_shot_power, weight_acceleration = 1, 2, 1

# Sum of the weights, used as the divisor of the weighted average.
total_weight = sum((weight_finishing, weight_shot_power, weight_acceleration))

In [33]:
# Weighted average of the three striker attributes, stored as a new column.
# The column name is spelled "stiker_grade"; every cell that reads it uses that spelling.
weighted_sum = (
    pa_striker_2016.finishing * weight_finishing
    + pa_striker_2016.shot_power * weight_shot_power
    + pa_striker_2016.acceleration * weight_acceleration
)
strikers = pa_striker_2016.withColumn("stiker_grade", weighted_sum / total_weight)

In [34]:
# Drop the three input attributes now that the grade has been computed, then list what is left.
strikers=strikers.drop('finishing','acceleration','shot_power')
strikers.columns

['player_api_id', 'stiker_grade']

In [35]:
# Keep the players graded above 70 and order them best-first, then preview the ranking.
strikers = (
    strikers
    .filter(strikers.stiker_grade > 70)
    .sort(strikers.stiker_grade.desc())
)

strikers.show()

+-------------+-----------------+
|player_api_id|     stiker_grade|
+-------------+-----------------+
|        20276|            89.25|
|        37412|             89.0|
|        38817|            88.75|
|        32118|            88.25|
|        31921|             87.0|
|        30834|            86.75|
|       303824|85.10714285714286|
|       129944|             85.0|
|       158263|            84.75|
|       150565|            84.75|
|        25759|84.66666666666667|
|       156726|             84.5|
|       169193|          84.4375|
|       286119|84.42857142857143|
|        30348|           84.375|
|        93447|            84.25|
|        46509|            84.25|
|        50047|            84.25|
|       181276|             84.0|
|       178812|             84.0|
+-------------+-----------------+
only showing top 20 rows



In [36]:
# Row counts of the striker ranking and of the players data, displayed as a tuple.
strikers.count(),players.count()

(1609, 11060)

In [37]:
# Join players to strikers with an explicit equality condition; joined this way,
# the result carries the player_api_id column from both sides.
striker_details=players.join(strikers,players.player_api_id==strikers.player_api_id)

In [38]:
# List the column names of the joined result.
striker_details.columns

['player_api_id',
 'player_name',
 'birthday',
 'height',
 'weight',
 'player_api_id',
 'stiker_grade']

In [39]:
# Row count of the joined result.
striker_details.count()

1609

In [40]:
# Join on the column name instead of an expression, so the result has a single player_api_id column.
striker_details=players.join(strikers,['player_api_id'])

In [41]:
# Preview the first 5 rows of the joined result.
striker_details.show(5)

+-------------+--------------+-------------------+------+------+------------+
|player_api_id|   player_name|           birthday|height|weight|stiker_grade|
+-------------+--------------+-------------------+------+------+------------+
|        20276|          Hulk|1986-07-25 00:00:00|180.34|   187|       89.25|
|        37412| Sergio Aguero|1988-06-02 00:00:00|172.72|   163|        89.0|
|        38817|  Carlos Tevez|1984-02-05 00:00:00|172.72|   157|       88.75|
|        32118|Lukas Podolski|1985-06-04 00:00:00|182.88|   183|       88.25|
|        31921|   Gareth Bale|1989-07-16 00:00:00|182.88|   163|        87.0|
+-------------+--------------+-------------------+------+------+------------+
only showing top 5 rows



In [42]:
from pyspark.sql.functions import broadcast

In [46]:
# Inner-join player names onto the striker ranking, marking the strikers side with a broadcast hint.
striker_details = (
    players
    .select("player_api_id", "player_name")
    .join(broadcast(strikers), on=['player_api_id'], how='inner')
)

In [48]:
# Order the joined rows by grade, highest first.
striker_details=striker_details.sort(striker_details.stiker_grade.desc())

In [49]:
# Preview the first 5 rows of the sorted result.
striker_details.show(5)

+-------------+--------------+------------+
|player_api_id|   player_name|stiker_grade|
+-------------+--------------+------------+
|        20276|          Hulk|       89.25|
|        37412| Sergio Aguero|        89.0|
|        38817|  Carlos Tevez|       88.75|
|        32118|Lukas Podolski|       88.25|
|        31921|   Gareth Bale|        87.0|
+-------------+--------------+------------+
only showing top 5 rows



In [60]:
# Row counts of both DataFrames as they currently stand, displayed as a tuple.
players.count(),players_attributes.count()

(11060, 183978)

In [63]:
# Attach each attribute row's heading_accuracy to the matching player record;
# the players side is marked with a broadcast hint. Joining on an equality
# expression keeps the player_api_id column from both sides.
players_heading_acc = (
    players_attributes
    .select('player_api_id', 'heading_accuracy')
    .join(
        broadcast(players),
        on=players_attributes.player_api_id == players.player_api_id,
    )
)

In [64]:
# List the column names of the joined result.
players_heading_acc.columns

['player_api_id',
 'heading_accuracy',
 'player_api_id',
 'player_name',
 'birthday',
 'height',
 'weight']

In [65]:
# One zero-initialised Spark accumulator per height band, created in this order:
# short, medium-low, medium-high, and the tallest band (named tail_count).
short_count, medium_low_count, medium_high_count, tail_count = (
    spark.sparkContext.accumulator(0) for _ in range(4)
)

In [66]:
def count_players_by_height(row):
    """Add 1 to the accumulator of the height band that ``row.height`` falls in.

    ``row.height`` is converted with ``float``. Bands:
    up to 175 -> ``short_count``; above 175 up to 183 -> ``medium_low_count``;
    above 183 up to 195 -> ``medium_high_count``; above 195 -> ``tail_count``.
    A value that matches no band (NaN) leaves every accumulator untouched.
    Returns None.
    """
    player_height = float(row.height)

    if player_height <= 175:
        short_count.add(1)
        return
    if 175 < player_height <= 183:
        medium_low_count.add(1)
        return
    if 183 < player_height <= 195:
        medium_high_count.add(1)
        return
    if player_height > 195:
        tail_count.add(1)

In [68]:
# Run the height-band counter on every joined row; the counts land in the accumulators.
players_heading_acc.foreach(count_players_by_height)

In [70]:
# Gather the four height-band accumulators in one list and display them.
all_players=[short_count,medium_low_count,medium_high_count,tail_count]
all_players

[Accumulator<id=0, value=19204>,
 Accumulator<id=1, value=98958>,
 Accumulator<id=2, value=62411>,
 Accumulator<id=3, value=3405>]

In [71]:
# Preview the first 2 rows of the 2016 subset.
pa_2016.show(2)

+---+------------------+-------------+--------------+---------+--------------+-------------------+-------------------+--------+---------+----------------+-------------+-------+---------+-----+------------------+------------+------------+------------+------------+-------+---------+-------+----------+-------+-------+--------+----------+----------+-------------+-----------+------+---------+-------+---------------+--------------+---------+-----------+----------+--------------+-----------+----+
| id|player_fifa_api_id|player_api_id|overall_rating|potential|preferred_foot|attacking_work_rate|defensive_work_rate|crossing|finishing|heading_accuracy|short_passing|volleys|dribbling|curve|free_kick_accuracy|long_passing|ball_control|acceleration|sprint_speed|agility|reactions|balance|shot_power|jumping|stamina|strength|long_shots|aggression|interceptions|positioning|vision|penalties|marking|standing_tackle|sliding_tackle|gk_diving|gk_handling|gk_kicking|gk_positioning|gk_reflexes|year|
+---+-----

In [72]:
# Export the 2016 overall ratings as CSV with a header row.
# coalesce(1) creates a single file from all partitions.
# The chain is wrapped in parentheses rather than joined with backslashes: a
# comment placed after a line-continuation backslash is a syntax error.
(
    pa_2016.select("player_api_id", "overall_rating")
    .coalesce(1)
    .write
    .option("header", "true")
    .csv("/home/output/player_overall.csv")
)

In [73]:
# Export the same two columns of the 2016 subset in JSON format.
(
    pa_2016
    .select("player_api_id", "overall_rating")
    .write
    .json("/home/output/player_overall.json")
)