In [38]:
import sys

import numpy as np
import pyspark.sql.functions as f
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.sql import SQLContext

In [3]:
# Create the local SparkContext for the "NFL" app, or reuse the one already running.
# Constructing SparkContext(...) directly fails when this cell is re-executed in a live
# kernel, because only one SparkContext may be active at a time.
# NOTE: if a context already exists, getOrCreate returns it as-is and the conf below is ignored.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]").setAppName("NFL"))

In [4]:
# load the player tracking CSV as an RDD of raw text lines (path is relative to the notebook)
rdd = sc.textFile('../in/PlayerTrackData.csv')

In [5]:
# peek at the first line of the file, which is the CSV header row
rdd.take(1)

['PlayKey,time,event,x,y,dir,dis,o,s']

In [6]:
# drop the header: keep only lines that do not contain the text 'PlayKey'
rdd = rdd.filter(lambda line: 'PlayKey' not in line)

In [7]:
# break every CSV line into a list of field strings
rdd = rdd.map(lambda line: line.split(','))

In [8]:
# convert the RDD of field lists to a DataFrame with named columns
# NOTE(review): sqlContext is not referenced again in the visible cells; presumably it is
# created because rdd.toDF() needs an active SQLContext/SparkSession -- confirm.
sqlContext = SQLContext(sc)
df = rdd.toDF(['PlayKey','time','event','x','y','dir','dis','o','s'])

# to use RDD functions on the DataFrame,
# go through df.rdd, e.g. df.rdd.xxx()

In [9]:
# preview the PlayKey and s columns (show() prints only the top 20 rows)
df.select('PlayKey','s').show()

+---------+----+
|  PlayKey|   s|
+---------+----+
|26624-1-1|0.13|
|26624-1-1|0.12|
|26624-1-1|0.12|
|26624-1-1| 0.1|
|26624-1-1|0.09|
|26624-1-1|0.07|
|26624-1-1|0.05|
|26624-1-1|0.02|
|26624-1-1|0.01|
|26624-1-1|0.01|
|26624-1-1| 0.0|
|26624-1-1|0.01|
|26624-1-1|0.01|
|26624-1-1|0.01|
|26624-1-1| 0.0|
|26624-1-1|0.01|
|26624-1-1| 0.0|
|26624-1-1| 0.0|
|26624-1-1| 0.0|
|26624-1-1| 0.0|
+---------+----+
only showing top 20 rows



In [10]:
# inspect the schema: every column is still a string at this point
df.printSchema()

root
 |-- PlayKey: string (nullable = true)
 |-- time: string (nullable = true)
 |-- event: string (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)
 |-- dir: string (nullable = true)
 |-- dis: string (nullable = true)
 |-- o: string (nullable = true)
 |-- s: string (nullable = true)



In [13]:
# player_mean_speed_df = df.rdd.map(lambda x: [x[0].split('-')[0], float(x[-1])]).toDF(["playerKey","s"])
# # remove extremely small speeds (keep s >= 1)
# player_mean_speed_df = player_mean_speed_df.rdd.filter(lambda x:x.s >= 1).toDF(["playerKey","s"])
# # get average speed
# player_mean_speed = player_mean_speed_df.groupBy('playerKey').agg(f.mean("s").alias('mean'))
# result = player_mean_speed.collect()

In [17]:
# cast the s, o and dir columns from string to float, one column at a time
# NOTE(review): 'float' is single precision, which is why 0.13 is later displayed as
# 0.12999999523162842 in the rdd.take(20) output; consider 'double' if that matters.
for col_name in ("s", "o", "dir"):
    df = df.withColumn(col_name, df[col_name].cast('float').alias(col_name))

In [18]:
# confirm that dir, o and s are now float columns
df.printSchema()

root
 |-- PlayKey: string (nullable = true)
 |-- time: string (nullable = true)
 |-- event: string (nullable = true)
 |-- x: string (nullable = true)
 |-- y: string (nullable = true)
 |-- dir: float (nullable = true)
 |-- dis: string (nullable = true)
 |-- o: float (nullable = true)
 |-- s: float (nullable = true)



In [45]:
# add new columns: o_dir_diff and move_type
# compute the smallest angular difference between o and dir (wrapping around at 360)
# df.withColumn('o_dir_diff',min([np.abs(df.o-df.dir),np.abs(df.o-(df.dir-360)),np.abs(df.o-(df.dir+360))]))


def add_o_dir_diff(x, forward_max=45, sideway_max=135):
    """Return a new Row equal to ``x`` plus ``o_dir_diff`` and ``move_type`` fields.

    ``o_dir_diff`` is the smallest absolute angular difference between the
    ``o`` and ``dir`` fields, so it always lies in [0, 180]. The angles are
    assumed to be in degrees (TODO confirm against the data dictionary).

    ``move_type`` is derived from ``o_dir_diff``:
      * ``'forward'``  when the difference is <= ``forward_max``
      * ``'sideway'``  when it is <= ``sideway_max``
      * ``'backward'`` otherwise

    Parameters
    ----------
    x : Row
        Record exposing ``asDict()`` with at least the ``o`` and ``dir`` fields.
    forward_max : float, optional
        Upper bound (inclusive) of the 'forward' band. Defaults to 45.
    sideway_max : float, optional
        Upper bound (inclusive) of the 'sideway' band. Defaults to 135.

    Returns
    -------
    Row
        All original fields plus ``o_dir_diff`` and ``move_type``. Both new
        fields are ``None`` when ``o`` or ``dir`` is ``None``.
    """
    fields = x.asDict()
    orientation = fields['o']
    direction = fields['dir']

    # o and dir are nullable float columns (a failed string -> float cast yields
    # null); subtracting None would raise TypeError and abort the whole Spark job,
    # so propagate the missing value instead.
    if orientation is None or direction is None:
        fields['o_dir_diff'] = None
        fields['move_type'] = None
        return Row(**fields)

    # Fold the raw difference into [0, 360), then take the shorter way around the
    # circle. For angles within one 360-degree span this equals the minimum of
    # |o-dir|, |o-(dir-360)| and |o-(dir+360)|; the modulo also covers inputs
    # that are further apart.
    raw_diff = abs(orientation - direction) % 360
    diff = min(raw_diff, 360 - raw_diff)
    fields['o_dir_diff'] = diff

    if diff <= forward_max:
        fields['move_type'] = 'forward'
    elif diff <= sideway_max:
        fields['move_type'] = 'sideway'
    else:
        fields['move_type'] = 'backward'

    return Row(**fields)

# apply add_o_dir_diff to every record; note that this rebinds `rdd`
# (previously the split text lines) to an RDD of enriched Row objects
rdd = df.rdd.map(add_o_dir_diff)

In [46]:
# inspect the first 20 enriched rows (o_dir_diff and move_type are now present)
rdd.take(20)

[Row(PlayKey='26624-1-1', dir=288.239990234375, dis='0.01', event='huddle_start_offense', move_type='forward', o=262.3299865722656, o_dir_diff=25.910003662109375, s=0.12999999523162842, time='0.0', x='87.46', y='28.93'),
 Row(PlayKey='26624-1-1', dir=283.9100036621094, dis='0.01', event='', move_type='forward', o=261.69000244140625, o_dir_diff=22.220001220703125, s=0.11999999731779099, time='0.1', x='87.45', y='28.92'),
 Row(PlayKey='26624-1-1', dir=280.3999938964844, dis='0.01', event='', move_type='forward', o=261.1700134277344, o_dir_diff=19.22998046875, s=0.11999999731779099, time='0.2', x='87.44', y='28.92'),
 Row(PlayKey='26624-1-1', dir=278.7900085449219, dis='0.01', event='', move_type='forward', o=260.6600036621094, o_dir_diff=18.1300048828125, s=0.10000000149011612, time='0.3', x='87.44', y='28.92'),
 Row(PlayKey='26624-1-1', dir=275.44000244140625, dis='0.01', event='', move_type='forward', o=260.2699890136719, o_dir_diff=15.170013427734375, s=0.09000000357627869, time='0.4'