In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn import model_selection

In [2]:
df = pd.read_csv('../input/data.csv')
df.columns

Index(['action_type', 'combined_shot_type', 'game_event_id', 'game_id', 'lat',
       'loc_x', 'loc_y', 'lon', 'minutes_remaining', 'period', 'playoffs',
       'season', 'seconds_remaining', 'shot_distance', 'shot_made_flag',
       'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range',
       'team_id', 'team_name', 'game_date', 'matchup', 'opponent', 'shot_id'],
      dtype='object')

In [3]:
# the data is not sorted. The shot with id=1 is not the first shot chronologicaly
# calculate seconds_to_end = number of seconds left before the end of the period
df['seconds_to_end'] = (df['minutes_remaining'] * 60) + df['seconds_remaining']

# sort the dataframe by date, period ascendant and seconds_to_end descendant
df = df.sort_values(by=['game_date','period','seconds_to_end'], ascending=[True,True,False] )

df[['game_event_id','game_date','period','minutes_remaining','seconds_remaining','seconds_to_end']].head(20)

Unnamed: 0,game_event_id,game_date,period,minutes_remaining,seconds_remaining,seconds_to_end
22901,102,1996-11-03,1,0,42,42
22902,127,1996-11-05,2,10,8,608
22903,124,1996-11-06,2,8,37,517
22904,144,1996-11-06,2,6,34,394
22905,151,1996-11-06,2,5,27,327
22906,157,1996-11-08,2,7,18,438
22907,226,1996-11-08,2,2,16,136
22908,321,1996-11-08,3,3,25,205
22909,334,1996-11-08,3,1,53,113
22910,337,1996-11-08,3,1,14,74


## Data Cleaning

we first remove the columns we don't want

In [4]:
df.columns

Index(['action_type', 'combined_shot_type', 'game_event_id', 'game_id', 'lat',
       'loc_x', 'loc_y', 'lon', 'minutes_remaining', 'period', 'playoffs',
       'season', 'seconds_remaining', 'shot_distance', 'shot_made_flag',
       'shot_type', 'shot_zone_area', 'shot_zone_basic', 'shot_zone_range',
       'team_id', 'team_name', 'game_date', 'matchup', 'opponent', 'shot_id',
       'seconds_to_end'],
      dtype='object')

In [5]:
# lat,lon,loc_x and loc_y are correlated
df.drop(['lat','lon'], axis=1, inplace=True)

# we don't need game_id and game_event_id
df.drop(['game_id','game_event_id'], axis=1, inplace=True)

# Kobe has always played with the LA Lakers
df.drop(['team_id','team_name'], axis=1, inplace=True)

# we added the seconds to end of period
df.drop(['minutes_remaining','seconds_remaining'], axis=1, inplace=True)

In [6]:
df.columns

Index(['action_type', 'combined_shot_type', 'loc_x', 'loc_y', 'period',
       'playoffs', 'season', 'shot_distance', 'shot_made_flag', 'shot_type',
       'shot_zone_area', 'shot_zone_basic', 'shot_zone_range', 'game_date',
       'matchup', 'opponent', 'shot_id', 'seconds_to_end'],
      dtype='object')

## Data Transformation
### Create new features

In [7]:
# make home/away feature from matchup
# if contains @, this is away game. home game otherwise
def home(row):
    try:
        row['matchup'].index('@')
        return 'away'
    except:
        return 'home'
        
df['venue'] = df.apply(home, axis = 1)

# we don't need matchup anymore because it's recurrent with opponent and tenure
df.drop('matchup', axis=1, inplace=True)

In [8]:
# add year and month and week day from game_date
df['year'] = pd.DatetimeIndex(df['game_date']).year
df['month'] = pd.DatetimeIndex(df['game_date']).month
df['weekday'] = pd.DatetimeIndex(df['game_date']).weekday
#df.drop('game_date', axis=1, inplace=True)

In [9]:
# add some moving averages
periods = [10,20,50,100,200]

for n in periods:
    col = 'MA' + str(n)
    df[col] = df['shot_made_flag'].fillna(0.45).rolling(window=n).mean()
    #df[col] = pd.rolling_mean(df['shot_made_flag'],n)    

In [10]:
df[['MA10','MA20','MA50','MA100','MA200']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30697 entries, 22901 to 22900
Data columns (total 5 columns):
MA10     30688 non-null float64
MA20     30678 non-null float64
MA50     30648 non-null float64
MA100    30598 non-null float64
MA200    30498 non-null float64
dtypes: float64(5)
memory usage: 1.4 MB


### Encode categorical variables

In [11]:
cat_cols = ['action_type','combined_shot_type','period','season','shot_type','shot_zone_area',
            'shot_zone_basic','shot_zone_range','year','month','weekday','opponent','venue']

for col in cat_cols:
    dummies = pd.get_dummies(df[col], prefix=col)
    df.drop(col, axis=1, inplace=True)
    df = df.join(dummies)

In [12]:
df.columns

Index(['loc_x', 'loc_y', 'playoffs', 'shot_distance', 'shot_made_flag',
       'game_date', 'shot_id', 'seconds_to_end', 'MA10', 'MA20',
       ...
       'opponent_POR', 'opponent_SAC', 'opponent_SAS', 'opponent_SEA',
       'opponent_TOR', 'opponent_UTA', 'opponent_VAN', 'opponent_WAS',
       'venue_away', 'venue_home'],
      dtype='object', length=195)

In [13]:
df.head()

Unnamed: 0,loc_x,loc_y,playoffs,shot_distance,shot_made_flag,game_date,shot_id,seconds_to_end,MA10,MA20,...,opponent_POR,opponent_SAC,opponent_SAS,opponent_SEA,opponent_TOR,opponent_UTA,opponent_VAN,opponent_WAS,venue_away,venue_home
22901,-140,116,0,18,0.0,1996-11-03,22902,42,,,...,0,0,0,0,0,0,0,0,0,1
22902,-131,97,0,16,0.0,1996-11-05,22903,608,,,...,0,0,0,0,0,0,0,0,1,0
22903,-142,181,0,23,1.0,1996-11-06,22904,517,,,...,0,0,0,0,0,0,0,0,1,0
22904,0,0,0,0,0.0,1996-11-06,22905,394,,,...,0,0,0,0,0,0,0,0,1,0
22905,-10,138,0,13,1.0,1996-11-06,22906,327,,,...,0,0,0,0,0,0,0,0,1,0


## Export the processed data

In [14]:
df.dropna(subset=['shot_made_flag']).to_pickle('../input/processed_train_data.pickle')
df[df['shot_made_flag'].isnull()].to_pickle('../input/processed_test_data.pickle')

# export not split data
df.to_pickle('../input/processed_data.pickle')