### Imports

In [14]:
import pandas as pd
import kagglehub
from sklearn.preprocessing import StandardScaler

# MERGING

In [15]:
# Download latest version
path = kagglehub.dataset_download("sujaykapadnis/nfl-stadium-attendance-dataset")

# Load the dataframe from the file
def load_data(name):
    return pd.read_csv(f"{path}/{name}")

attendance_df = load_data("attendance.csv")
standings_df = load_data("standings.csv")
games_df = load_data("games.csv")

attendance_weekly_df = attendance_df[['team', 'team_name', 'year', 'week', 'weekly_attendance']]

attendance_df = attendance_df.drop(columns=['weekly_attendance'])

### ATTENDANCE
# year was shitted by one year, so that last years standings will have influence on attendance
attendance_df['year'] = attendance_df['year'] + 1

attendance_df = attendance_df.merge(attendance_weekly_df, on=['team', 'team_name', 'year', 'week'], how='left')


### STANDINGS
# similar like above for standings
standings_df['year'] = standings_df['year'] + 1

attendance_standings_df = pd.merge(attendance_df, standings_df, on=['team', 'team_name', 'year'])

attendance_standings_df['team_name'] = attendance_standings_df['team'] + ' ' + attendance_standings_df['team_name']

### GAMES
games_df = games_df[games_df['week'].str.isnumeric()]
games_df['week'] = games_df['week'].astype(int)

### MERGE

df1 = attendance_standings_df.merge(games_df, left_on=['year', 'week', 'team_name'], right_on=['year', 'week', 'home_team'], how='inner')
df2 = attendance_standings_df.merge(games_df, left_on=['year', 'week', 'team_name'], right_on=['year', 'week', 'away_team'], how='inner')

df = df1.merge(df2, on=['year', 'week', 'home_team', 'away_team'], how='inner', suffixes=('_home', '_away'))

df = df.drop(columns=['home_team', 'away_team', 'weekly_attendance_away', 'team_away'])
df = df.rename(columns={'team_home': 'city'})



In [16]:
df.to_csv("merged_unprocessed.csv", index=False)

# PREPROCESS

In [17]:
duplicated_columns = ['winner', 'tie', 'day', 'date', 'time', 'pts_win', 'pts_loss', 'yds_win', 'turnovers_win', 'yds_loss', 'turnovers_loss', 'home_team_name', 'home_team_city', 'away_team_name', 'away_team_city', 'loss']
duplicated_columns_away = list(map(lambda x: x + '_away', duplicated_columns))
duplicated_columns_home = list(map(lambda x: x + '_home', duplicated_columns))

duplicated_columns_mapping = dict(zip(duplicated_columns_home, duplicated_columns))

df = df.drop(columns=duplicated_columns_away)
df = df.rename(columns=duplicated_columns_mapping)

columns_to_drop = ['total_home', 'away_home', 'date', 'time', 'city']

columns_to_rename = {
    'home_home': 'home_attendance_last_year_home',
    'away_home': 'away_attendance_last_year_home',
    'home_away': 'home_attendance_last_year_away',
    'away_away': 'away_attendance_last_year_away',
}

df = df.drop(columns=columns_to_drop)
df = df.rename(columns=columns_to_rename)
df = df.replace({ 'Playoffs': 1, 'No Playoffs': 0, 'Won Superbowl': 1, 'No Superbowl': 0 })

df[['playoffs_away', 'sb_winner_away', 'playoffs_home', 'sb_winner_home']] = df[['playoffs_away', 'sb_winner_away', 'playoffs_home', 'sb_winner_home']].astype(int)


season_stats_df = pd.DataFrame(columns=['year', 'week', 'team_name', 'points', 'yards', 'turnovers', 'win', 'loss', 'tie'], dtype=int)
season_stats_df['team_name'] = season_stats_df['team_name'].astype(str)

for index, row in df.iterrows():
    winning_team = row['winner']    
    losing_team = row['team_name_away'] if winning_team == row['team_name_home'] else row['team_name_home']

    for team in [winning_team, losing_team]:
        season_stats_df.loc[len(season_stats_df)] = {
            'year': row['year'],
            'week': row['week'],
            'team_name': team,
            'points': row['pts_win'] if team == winning_team else row['pts_loss'],
            'yards': row['yds_win'] if team == winning_team else row['yds_loss'],
            'turnovers': row['turnovers_win'] if team == winning_team else row['turnovers_loss'],
            'win': 1 if team == winning_team and pd.isna(row['tie']) else 0,
            'loss': 1 if team == losing_team and pd.isna(row['tie']) else 0,
            'tie': 1 if not pd.isna(row['tie'])  else 0
        }

min_year = season_stats_df['year'].min()
max_year = season_stats_df['year'].max()
teams = season_stats_df['team_name'].unique()

# Fill in missing weeks with 0s
for team in teams:
    for year in range(min_year, max_year + 1):
        for week in range(1, 18):
            if len(season_stats_df[(season_stats_df['team_name'] == team) & (season_stats_df['year'] == year) & (season_stats_df['week'] == week)]) == 0:
                season_stats_df.loc[len(season_stats_df)] = {
                    'year': year,
                    'week': week,
                    'team_name': team,
                    'points': 0,
                    'yards': 0,
                    'turnovers': 0,
                    'win': 0,
                    'loss': 0,
                    'tie': 0
                }


# Group by years, and calculate running totals
season_stats_df = season_stats_df.sort_values(by=['team_name', 'year', 'week'])
season_stats_df['points'] = season_stats_df['points'].astype(int)
season_stats_df['yards'] = season_stats_df['yards'].astype(int)
season_stats_df['turnovers'] = season_stats_df['turnovers'].astype(int)
season_stats_df['win'] = season_stats_df['win'].astype(int)
season_stats_df['loss'] = season_stats_df['loss'].astype(int)
season_stats_df['tie'] = season_stats_df['tie'].astype(int)

season_stats_df['points'] = season_stats_df.groupby(['team_name', 'year'])['points'].cumsum()
season_stats_df['yards'] = season_stats_df.groupby(['team_name', 'year'])['yards'].cumsum()
season_stats_df['turnovers'] = season_stats_df.groupby(['team_name', 'year'])['turnovers'].cumsum()
season_stats_df['win'] = season_stats_df.groupby(['team_name', 'year'])['win'].cumsum()
season_stats_df['loss'] = season_stats_df.groupby(['team_name', 'year'])['loss'].cumsum()
season_stats_df['tie'] = season_stats_df.groupby(['team_name', 'year'])['tie'].cumsum()


season_stats_df['week'] = season_stats_df['week'] + 1
# season_stats_df = season_stats_df[season_stats_df['week'] <= 17]

for index, row in season_stats_df[season_stats_df['week'] == 18].iterrows():
    season_stats_df.loc[index] = {
        'year': row['year'],
        'week': 1,
        'team_name': row['team_name'],
        'points': 0,
        'yards': 0,
        'turnovers': 0,
        'win': 0,
        'loss': 0,
        'tie': 0
    }

season_stats_df = season_stats_df.sort_values(by=['team_name', 'year', 'week'])
season_stats_df[(season_stats_df['team_name'] == 'Arizona Cardinals')][season_stats_df['year'] == 2019]
games_columns_to_delete = ['winner', 'tie', 'day', 'pts_win', 'pts_loss', 'yds_win', 'turnovers_win', 'yds_loss', 'turnovers_loss', 'home_team_name', 'home_team_city', 'away_team_name', 'away_team_city']
df = df.drop(columns=games_columns_to_delete)

season_stats_df = season_stats_df.rename(columns={ 'team_name': 'team_name_home' })
df = df.merge(season_stats_df, on=['team_name_home', 'year', 'week'], how='inner', suffixes=['', '_home'])

season_stats_df = season_stats_df.rename(columns={ 'team_name_home': 'team_name_away' })
df = df.merge(season_stats_df, on=['team_name_away', 'year', 'week'], how='inner', suffixes=['', '_away'])

  df = df.replace({ 'Playoffs': 1, 'No Playoffs': 0, 'Won Superbowl': 1, 'No Superbowl': 0 })
  season_stats_df[(season_stats_df['team_name'] == 'Arizona Cardinals')][season_stats_df['year'] == 2019]


### Dataset Description

The table below describes the columns resulting from the merging and preprocessing of NFL game data. These features will be used to train a Random Forest regressor to predict home game attendance.

| Column Name                    | Description                                                                 |
|-------------------------------|-----------------------------------------------------------------------------|
| `team_name_home`              | Name of the home team                                                       |
| `team_name_away`              | Name of the away team                                                       |
| `year`                        | Year the game was played                                                    |
| `week`                        | Week of the season (1–17 regular season)                                    |
| `weekly_attendance_home`      | **Target variable** – Number of attendees at the home game                  |
| `points_for_home`             | Total points scored by the home team throughout the season                  |
| `points_against_home`         | Total points allowed by the home team throughout the season                 |
| `points_differential_home`    | Point difference for the home team across the season                        |
| `simple_rating_home`          | Home team's overall performance rating, factoring in strength of schedule  |
| `margin_of_victory_home`      | Average margin of victory for the home team                                 |
| `wins_home`                   | Total wins by the home team                                                 |
| `offensive_ranking_home`      | Offensive performance rank of the home team                                 |
| `defensive_ranking_home`      | Defensive performance rank of the home team                                 |
| `points_for_away`             | Total points scored by the away team throughout the season                  |
| `points_against_away`         | Total points allowed by the away team throughout the season                 |
| `points_differential_away`    | Point difference for the away team across the season                        |
| `simple_rating_away`          | Away team's overall performance rating                                      |
| `wins_away`                   | Total wins by the away team                                                 |
| `offensive_ranking_away`      | Offensive performance rank of the away team                                 |
| `defensive_ranking_away`      | Defensive performance rank of the away team                                 |

**Note**: Points and performance stats are aggregated across the season for both home and away teams. These variables are useful in understanding the potential drivers of attendance trends.


## Creating Attendance IDs for Splitting the Data into Train, Validation and Test for DBRepo

In [18]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


df['attendance_id'] = df.index


cols = ['attendance_id'] + [col for col in df.columns if col != 'attendance_id']
df = df[cols]


In [19]:
df.to_csv('merged_preprocessed.csv', index=False)

In [20]:
from dbrepo.RestClient import RestClient
from dotenv import load_dotenv
import os

load_dotenv() 
password = os.getenv("DBREPO_PASSWORD")

client = RestClient(
    endpoint="https://test.dbrepo.tuwien.ac.at",
    username="emilp-tuwien",
    password=password
)
training = client.get_identifier_data(identifier_id="56171866-21d8-4b89-a44c-a3044bf2d43d")
validation = client.get_identifier_data(identifier_id="e336829d-fefb-4a09-95f9-0196ee3fc194")
test =  client.get_identifier_data(identifier_id="a8b213d1-c66b-417d-8fc0-aa2bb9393ad2")


In [21]:
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3365 entries, 0 to 3364
Data columns (total 45 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   attendance_id                   3365 non-null   object
 1   team_name_home                  3365 non-null   object
 2   year                            3365 non-null   object
 3   home_attendance_last_year_home  3365 non-null   object
 4   week                            3365 non-null   object
 5   weekly_attendance_home          3365 non-null   object
 6   wins_home                       3365 non-null   object
 7   loss                            3365 non-null   object
 8   points_for_home                 3365 non-null   object
 9   points_against_home             3365 non-null   object
 10  points_differential_home        3365 non-null   object
 11  margin_of_victory_home          3365 non-null   object
 12  strength_of_schedule_home       3365 non-null   

# Model

We can now begin building the Random Forest model, but first, the categorical features need to be one-hot encoded.

In [22]:

training = pd.get_dummies(training, columns=['team_name_home', 'team_name_away'])
validation = pd.get_dummies(validation, columns=['team_name_home', 'team_name_away'])
test = pd.get_dummies(test, columns=['team_name_home', 'team_name_away'])


Numerical values have to be scaled

In [23]:
target = 'weekly_attendance_home'

# Separate features and labels
X_train, y_train = training.drop(columns=[target]), training[target]
X_val, y_val = validation.drop(columns=[target]), validation[target]
X_test, y_test = test.drop(columns=[target]), test[target]

scaler = StandardScaler()
scaler.fit(X_train)

### SCALING X 

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


### SCALING Y 
target_scaler = StandardScaler()
target_scaler.fit(y_train.values.reshape(-1, 1))

y_train_scaled = target_scaler.transform(y_train.values.reshape(-1, 1)).flatten()
y_val_scaled = target_scaler.transform(y_val.values.reshape(-1, 1)).flatten()
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1)).flatten()

ValueError: could not convert string to float: 'false'