In [1]:
import numpy as np
import pandas as pd

In [7]:
def gen_data(week):
    # Reading the CSV files
    week1 = pd.read_csv(f"tracking_week_{week}.csv", dtype={"nflId": str, "playId": str})
    plays = pd.read_csv("plays.csv", dtype={"playId": str})
    tackles = pd.read_csv("tackles.csv", dtype={"nflId": str, "playId": str})

    # Transforming the tackles dataframe
    tackles["t_val"] = np.where(tackles["tackle"] == 1, 1,
                                np.where(tackles["assist"] == 1, 0.5, 0))
    tackles["event"] = "tackle"
    tackle_title = tackles[["gameId", "playId", "nflId", "event", "t_val"]]

    # Extracting the football data
    football = week1[week1["displayName"] == "football"]
    football = football[["gameId", "playId", "frameId", "x", "y", "s", "a"]]
    football["playId"] = football["gameId"].astype(str) + "_" + football["playId"]

    # Extracting possession team data
    poss_tm = plays[["gameId", "playId", "possessionTeam"]]
    poss_tm["playId"] = poss_tm["gameId"].astype(str) + "_" + poss_tm["playId"]

    # Start building the main dataframe df
    df = week1[week1["displayName"] != "football"]
    df["playId"] = df["gameId"].astype(str) + "_" + df["playId"]

    # Merging football data with df using left join
    df = df.merge(football, on=["playId", "frameId"], how="left", suffixes=("_p", "_f"))

    df = df.merge(poss_tm, on=["playId"], how="left")
    df["poss_tm"] = np.where(df["club"] == df["possessionTeam"], 1, 0)
    df = df.merge(tackle_title, on=["gameId", "playId", "nflId", "event"], how="left")
    df["t_val"].fillna(0, inplace=True)
    df["ball_dist"] = np.sqrt((df["x_f"] - df["x_p"])**2 + (df["y_f"] - df["y_p"])**2)
    df = df[["playId", "nflId", "frameId", "s_p", "a_p", "s_f", "a_f", "ball_dist", "poss_tm", "t_val"]]

    # Sorting by playId, nflId, and frameId
    # df = df.sort_values(by=["playId", "nflId", "frameId"])

    return df


In [8]:
df = gen_data('1')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  poss_tm["playId"] = poss_tm["gameId"].astype(str) + "_" + poss_tm["playId"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["playId"] = df["gameId"].astype(str) + "_" + df["playId"]


In [9]:
df.head

<bound method NDFrame.head of                   playId  nflId  frameId   s_p   a_p        s_f        a_f  \
0          2022090800_56  35472        1  1.62  1.15  22.209999  11.850000   
1          2022090800_56  35472        2  1.67  0.61  20.900000  13.820000   
2          2022090800_56  35472        3  1.57  0.49  19.000000  16.020000   
3          2022090800_56  35472        4  1.44  0.89  17.280001  15.400000   
4          2022090800_56  35472        5  1.29  1.24  13.360000  20.459999   
...                  ...    ...      ...   ...   ...        ...        ...   
1346241  2022091200_3826  54618       49  1.88  2.49   2.560000   1.250000   
1346242  2022091200_3826  54618       50  1.84  2.35   2.500000   1.140000   
1346243  2022091200_3826  54618       51  1.85  1.98   2.380000   1.700000   
1346244  2022091200_3826  54618       52  1.85  1.69   2.070000   2.830000   
1346245  2022091200_3826  54618       53  1.80  1.37   1.860000   3.000000   

         ball_dist  poss_tm  t_va

In [10]:
from sklearn.preprocessing import StandardScaler

# Assume df is the dataframe from gen_data
features = ['s_p', 'a_p', 's_f', 'a_f', 'ball_dist', 'poss_tm']
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])


In [11]:
def transform_to_sequences(df, sequence_length):
    sequences = []
    targets = []
    
    play_ids = df['playId'].unique()
    
    for play_id in play_ids:
        play_data = df[df['playId'] == play_id].sort_values(by='frameId')
        
        for i in range(0, play_data.shape[0] - sequence_length):
            sequences.append(play_data[features].iloc[i:i+sequence_length].values)
            targets.append(play_data['t_val'].iloc[i+sequence_length-1])
    
    return np.array(sequences), np.array(targets)

sequence_length = 10
X, y = transform_to_sequences(df, sequence_length)


In [None]:
import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(LSTMModel, self).__init__()
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Take the output from the last time step
        out = self.sigmoid(out)
        return out


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train[:, None], dtype=torch.float32)  # Adding an extra dimension for BCE loss
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val[:, None], dtype=torch.float32)


In [None]:
input_size = len(features)
hidden_size = 50
num_layers = 2
dropout = 0.2

model = LSTMModel(input_size, hidden_size, num_layers, dropout)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [None]:
num_epochs = 10
batch_size = 64

for epoch in range(num_epochs):
    for i in range(0, len(X_train), batch_size):
        X_batch = X_train_tensor[i:i+batch_size]
        y_batch = y_train_tensor[i:i+batch_size]
        
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    # Validation loss
    val_outputs = model(X_val_tensor)
    val_loss = criterion(val_outputs, y_val_tensor)
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')
