In [38]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pyspark.sql.functions as f
from pyspark.sql import Row
import sys
import numpy as np

In [3]:
# Start a Spark context on the local machine ("local[*]" master) under the application name "NFL".
sc = SparkContext(master="local[*]", appName="NFL")

In [78]:
# Read the raw player-tracking CSV as an RDD of text lines (one element per line).
track_data_path = '../in/PlayerTrackData.csv'
rdd = sc.textFile(track_data_path)

In [79]:
# Peek at the first line of the file to confirm it is the CSV header row.
rdd.take(1)

['PlayKey,time,event,x,y,dir,dis,o,s']

In [80]:
# Drop the header line: keep only the lines that do not contain the column name 'PlayKey'.
rdd = rdd.filter(lambda line: 'PlayKey' not in line)

In [81]:
# Turn each CSV line into a list of string fields by splitting on commas.
rdd = rdd.map(lambda line: line.split(','))

In [83]:
# Keep only rows whose `dir` (field 5) and `o` (field 7) values are both non-empty;
# the other fields are not checked here.
rdd = rdd.filter(lambda fields: all(fields[idx] != '' for idx in (5, 7)))

In [85]:
# Build the SQL entry point on the existing SparkContext, then name the parsed
# fields to obtain a DataFrame (only names are given, no column types).
sqlContext = SQLContext(sc)
column_names = ['PlayKey', 'time', 'event', 'x', 'y', 'dir', 'dis', 'o', 's']
df = rdd.toDF(column_names)

# RDD operations remain available on a DataFrame through its .rdd attribute,
# e.g. df.rdd.xxx()

In [86]:
# Preview the play identifier next to the `s` column (show() prints the top 20 rows).
df.select('PlayKey','s').show()

+---------+----+
|  PlayKey|   s|
+---------+----+
|26624-1-1|0.13|
|26624-1-1|0.12|
|26624-1-1|0.12|
|26624-1-1| 0.1|
|26624-1-1|0.09|
|26624-1-1|0.07|
|26624-1-1|0.05|
|26624-1-1|0.02|
|26624-1-1|0.01|
|26624-1-1|0.01|
|26624-1-1| 0.0|
|26624-1-1|0.01|
|26624-1-1|0.01|
|26624-1-1|0.01|
|26624-1-1| 0.0|
|26624-1-1|0.01|
|26624-1-1| 0.0|
|26624-1-1| 0.0|
|26624-1-1| 0.0|
|26624-1-1| 0.0|
+---------+----+
only showing top 20 rows



In [87]:
# Inspect the schema: toDF was given only column names, so every column is still a string here.
df.printSchema()

root
 |-- PlayKey: string (nullable = true)
 |-- time: string (nullable = true)
 |-- event: string (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)
 |-- dir: string (nullable = true)
 |-- dis: string (nullable = true)
 |-- o: string (nullable = true)
 |-- s: string (nullable = true)



In [100]:
# speed (disabled experiment): mean speed per player, ignoring samples with s < 1
# player_mean_speed_df = df.rdd.map(lambda x: [x[0].split('-')[0], float(x[-1])]).toDF(["playerKey","s"])
# # remove extreme small speed
# player_mean_speed_df = player_mean_speed_df.rdd.filter(lambda x:x.s >= 1).toDF(["playerKey","s"])
# # get average speed
# player_mean_speed = player_mean_speed_df.groupBy('playerKey').agg(f.mean("s").alias('mean'))
# result = player_mean_speed.collect()

In [88]:
# Cast the columns used numerically later on (s, o, dir) from string to float.
# withColumn replaces the existing column of the same name in place.
for col_name in ("s", "o", "dir"):
    df = df.withColumn(col_name, df[col_name].cast('float'))

In [89]:
# Re-check the schema to confirm that dir, o and s are now float columns.
df.printSchema()

root
 |-- PlayKey: string (nullable = true)
 |-- time: string (nullable = true)
 |-- event: string (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)
 |-- dir: float (nullable = true)
 |-- dis: string (nullable = true)
 |-- o: float (nullable = true)
 |-- s: float (nullable = true)



In [90]:
def add_o_dir_diff(x):
    """Extend a tracking row with the o/dir gap and a movement classification.

    Args:
        x: a Row-like object exposing ``asDict()`` with numeric ``o`` and
            ``dir`` entries (presumably angles in degrees, given the +/-360
            wrap-around handling -- confirm against the data description).

    Returns:
        A new Row holding every original field plus, in this order:
          * ``o_dir_diff``: the smallest of |o - dir|, |o - (dir - 360)| and
            |o - (dir + 360)|;
          * ``is_forward_move``, ``is_sideway_move``, ``is_backward_move``:
            booleans, exactly one of which is True;
          * ``move_type``: 'forward' when the gap is <= 45, 'sideway' when it
            is <= 135, otherwise 'backward'.
    """
    fields = x.asDict()
    orientation = fields['o']
    direction = fields['dir']

    # Compare against the direction itself and its two 360-shifted copies so
    # that values on either side of the wrap-around point count as close.
    gap = min(abs(orientation - (direction + shift)) for shift in (0, -360, 360))
    fields['o_dir_diff'] = gap

    if gap <= 45:
        label = 'forward'
    elif gap <= 135:
        label = 'sideway'
    else:
        label = 'backward'

    # Key insertion order matters for the resulting Row: flags first, label last.
    fields['is_forward_move'] = label == 'forward'
    fields['is_sideway_move'] = label == 'sideway'
    fields['is_backward_move'] = label == 'backward'
    fields['move_type'] = label

    return Row(**fields)

# Enrich every row with o_dir_diff, the three is_*_move flags and move_type.
# Note: this rebinds `rdd`, which until now referred to the parsed CSV rows.
rdd = df.rdd.map(add_o_dir_diff)

In [91]:
# Back to a DataFrame; no schema is passed, so column names and types come from the Rows themselves.
df = rdd.toDF()

In [99]:
# get the distinct events in the player trace data
# res = df.select('event').distinct().collect()

In [None]:
# We only focus on 'ball-snap' rows (the most common event in the player trace data).
# A column-expression filter keeps the work inside the DataFrame engine, preserves
# df's schema, and still yields a (possibly empty) DataFrame when no row matches;
# an RDD round trip through toDF() would have to re-infer the schema from the rows.
_df = df.filter(df.event == 'ball-snap')

# Suffix the position/time columns with `_snap` so they are distinguishable from the
# per-sample x, y and time columns (presumably for a later join -- not shown here).
_df = (
    _df
    .withColumnRenamed('x', 'x_snap')
    .withColumnRenamed('y', 'y_snap')
    .withColumnRenamed('time', 'time_snap')
)