In [13]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pyspark.sql.functions as f
from pyspark.sql import Row
from pyspark import StorageLevel
import sys
import numpy as np

In [2]:
sc = SparkContext("local[*]", "NFL")

In [3]:
rdd = sc.textFile('../in/PlayerTrackData.csv')

In [4]:
# show header
rdd.take(1)

['PlayKey,time,event,x,y,dir,dis,o,s']

In [5]:
# remove header
rdd = rdd.filter(lambda x: 'PlayKey' not in x)

In [6]:
# split the col
rdd = rdd.map(lambda x: x.split(','))

In [7]:
# remove row if missing some value
# rdd = rdd.filter(lambda x: x[5] != '' and x[7] != '')

In [8]:
# convert to dataframe object
sqlContext = SQLContext(sc)
df = rdd.toDF(['PlayKey','time','event','x','y','dir','dis','o','s'])

# if df want to use rdd function
# just df.rdd.xxx()

In [9]:
df.select('PlayKey','s').show()

+---------+----+
|  PlayKey|   s|
+---------+----+
|26624-1-1|0.13|
|26624-1-1|0.12|
|26624-1-1|0.12|
|26624-1-1| 0.1|
|26624-1-1|0.09|
|26624-1-1|0.07|
|26624-1-1|0.05|
|26624-1-1|0.02|
|26624-1-1|0.01|
|26624-1-1|0.01|
|26624-1-1| 0.0|
|26624-1-1|0.01|
|26624-1-1|0.01|
|26624-1-1|0.01|
|26624-1-1| 0.0|
|26624-1-1|0.01|
|26624-1-1| 0.0|
|26624-1-1| 0.0|
|26624-1-1| 0.0|
|26624-1-1| 0.0|
+---------+----+
only showing top 20 rows



In [10]:
# printSchema
df.printSchema()

root
 |-- PlayKey: string (nullable = true)
 |-- time: string (nullable = true)
 |-- event: string (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)
 |-- dir: string (nullable = true)
 |-- dis: string (nullable = true)
 |-- o: string (nullable = true)
 |-- s: string (nullable = true)



In [14]:
# get all ball_snap play record
ball_snap_tracks = df.select('*').where("event == 'ball_snap'")
# rename x,y,time for ball snap
drop_list = ['dir','dis','o','event','s']
ball_snap_tracks = ball_snap_tracks.withColumnRenamed('x','snap_x').withColumnRenamed('y','snap_y').withColumnRenamed('time','snap_time').drop(*drop_list)
# join two table (orign tracks and tracks only contained ball_snap event)
tracks = df.join(ball_snap_tracks, ['PlayKey'], "left_outer")
# remove any data for a play 0.1 second before snap
tracks = tracks.select('*').where('time >= (snap_time - 0.1)')
# tracks.persist(StorageLevel.MEMORY_ONLY)


DataFrame[PlayKey: string, time: string, event: string, x: string, y: string, dir: string, dis: string, o: string, s: string, snap_time: string, snap_x: string, snap_y: string]

In [34]:
l = tracks.count()
print(l)
tracks.show()

37087421
+-----------+----+---------+-----+-----+------+----+-----+----+---------+------+------+
|    PlayKey|time|    event|    x|    y|   dir| dis|    o|   s|snap_time|snap_x|snap_y|
+-----------+----+---------+-----+-----+------+----+-----+----+---------+------+------+
|26624-13-32|19.4|         |19.85| 23.9|267.19| 0.0|15.21|0.29|     19.5| 19.85|  23.9|
|26624-13-32|19.5|ball_snap|19.85| 23.9|266.62| 0.0|15.32|0.51|     19.5| 19.85|  23.9|
|26624-13-32|19.6|         |19.84| 23.9|266.04| 0.0|15.48|0.68|     19.5| 19.85|  23.9|
|26624-13-32|19.7|         |19.84| 23.9|265.88| 0.0|15.85| 0.8|     19.5| 19.85|  23.9|
|26624-13-32|19.8|         |19.84| 23.9|265.98| 0.0|16.76|0.89|     19.5| 19.85|  23.9|
|26624-13-32|19.9|         |19.83| 23.9|266.15|0.01|18.18|0.96|     19.5| 19.85|  23.9|
|26624-13-32|20.0|         |19.82| 23.9|266.43|0.01| 19.8|1.03|     19.5| 19.85|  23.9|
|26624-13-32|20.1|         | 19.8| 23.9| 266.2|0.02|21.42|1.11|     19.5| 19.85|  23.9|
|26624-13-32|20.2|     

In [None]:
# get last event and lasr time of each playkey
last_event_tracks = tracks.where('event != ""').groupBy('PlayKey').agg(f.max('time').alias('time'))

# join tracks record and last event tracks record
last_event_tracks = tracks.join(last_event_tracks, ['PlayKey','time'], "inner")
last_event_tracks = last_event_tracks[last_event_tracks.PlayKey,last_event_tracks.time,last_event_tracks.event]
last_event_tracks = last_event_tracks.withColumnRenamed('time','last_time').withColumnRenamed('event','last_event')
tracks = tracks.join(last_event_tracks, ['PlayKey'], 'left_outer')

# Remove any data for a play 0.1 second after last event
tracks = tracks.select('*').where('time <= (last_time + 0.1)')
tracks.persist(StorageLevel.MEMORY_ONLY)
tracks.show()
l = tracks.count()
print(l)






In [39]:
temp = last_event_tracks[last_event_tracks.PlayKey,last_event_tracks.time,last_event_tracks.event]

In [41]:
temp.show()

+-----------+----+--------------------+
|    PlayKey|time|               event|
+-----------+----+--------------------+
|  26624-1-6| 5.0|              tackle|
| 26624-1-70|24.7|              tackle|
|26624-11-18|19.3|       out_of_bounds|
| 26624-18-9| 8.8|           ball_snap|
|26624-23-24|18.3|pass_outcome_inco...|
|26624-26-28|24.9|pass_outcome_inco...|
| 26624-6-23| 5.7|              tackle|
|  26624-7-2|33.3|        penalty_flag|
| 26624-7-49| 8.8|           ball_snap|
| 27363-1-53|16.8|              tackle|
| 27363-1-62|18.6|pass_outcome_touc...|
|27363-10-63| 9.9|           ball_snap|
|27363-19-42|22.4|              tackle|
|27363-21-59|21.4|              tackle|
|27363-27-45|19.5|              tackle|
|  27363-3-6|16.8|              tackle|
|27363-30-51|51.2|        penalty_flag|
| 27363-4-62|18.4|              tackle|
| 27363-5-23| 9.1|             handoff|
|  27363-6-3|31.3|              tackle|
+-----------+----+--------------------+
only showing top 20 rows



In [100]:
# speed
# player_mean_speed_df = df.rdd.map(lambda x: [x[0].split('-')[0], float(x[-1])]).toDF(["playerKey","s"])
# # remove extreme small speed
# player_mean_speed_df = player_mean_speed_df.rdd.filter(lambda x:x.s >= 1).toDF(["playerKey","s"])
# # get average speed
# player_mean_speed = player_mean_speed_df.groupBy('playerKey').agg(f.mean("s").alias('mean'))
# result = player_mean_speed.collect()

In [88]:
# convert string to float
df = df.withColumn("s", df.s.cast('float').alias("s"))
df = df.withColumn("o", df.o.cast('float').alias("o"))
df = df.withColumn("dir", df.dir.cast('float').alias("dir"))

In [89]:
df.printSchema()

root
 |-- PlayKey: string (nullable = true)
 |-- time: string (nullable = true)
 |-- event: string (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)
 |-- dir: float (nullable = true)
 |-- dis: string (nullable = true)
 |-- o: float (nullable = true)
 |-- s: float (nullable = true)



In [90]:
# add new col.
# compute the difference between o and dir
def add_o_dir_diff(x):
    x = x.asDict()
    x['o_dir_diff'] = min([abs(x['o']-x['dir']),abs(x['o']-(x['dir']-360)),abs(x['o']-(x['dir']+360))])
    
    x['is_forward_move'] = False
    x['is_sideway_move'] = False
    x['is_backward_move'] = False
    
    if x['o_dir_diff'] <= 45:
        x['move_type'] = 'forward'
        x['is_forward_move'] = True
    elif x['o_dir_diff'] <= 135:
        x['move_type'] = 'sideway'
        x['is_sideway_move'] = True
    else:
        x['move_type'] = 'backward'
        x['is_backward_move'] = True
        
    return Row(**x)

rdd = df.rdd.map(add_o_dir_diff)

In [91]:
df = rdd.toDF()

In [99]:
# get the distinct event in player trace data
# res = df.select('event').distinct().collect()

In [101]:
# we only focus on ball-snap (the most common event in player trace data)
# rename some columns
_df = df.rdd.filter(lambda x: x.event == 'ball-snap').toDF()
_df = _df.withColumnRenamed('x','x_snap').withColumnRenamed('y','y_snap').withColumnRenamed('time','time_snap')

KeyboardInterrupt: 

In [None]:
# Remove any data for a play 0.1 second before snap
df = df.merge(_df, on='PlayKey', how='left')
df = df.rdd.filter(lambda x: x.time >= x.time_snap-0.1)

# Remove any data for a play 0.1 second after snap
# TODO