In [1]:
# Import package for data manipulation
import pandas as pd
import numpy as np

In [2]:
# Raw input: CSV file name and the folder that holds it.
# The file name keeps its leading backslash because it is appended directly to the folder string.
baseball_file = r'\baseball_2024.csv'
data_folder_loc = r'C:\Users\giova\Documents\School\Classes\Spring 2025\Capstone Project\Data'

# Project folder for this notebook
project_folder = r'C:\Users\giova\Documents\School\Classes\Spring 2025\Capstone Project\Baseball2024'

In [3]:
# Load the raw baseball CSV from the data folder
raw_csv_path = data_folder_loc + baseball_file
baseball_df = pd.read_csv(raw_csv_path)

In [4]:
# Columns to drop, grouped by the reason for dropping them

# Identifier columns.
# GRMCONTACTID is deliberately NOT listed: it is kept for mixed effects logistic regression.
id_columns = [
    'Unnamed: 0',
    'GRMCONTACTID_FANCOPY',
    'EVENTCODE',
    'DIMCUSTOMERID',
    'ETL_ROW_HASH',
    'ETL_UPDATED_DATE',
    'TENANTID',
    'CRM_PRIMARY_CRM_ID',
    'ETL_SYNC_DELTAHASHKEY',
    'Fan Status',
]

# Columns whose data is uniform
uniform_columns = [
    'SEASONNAME',
    'SEASONHEADERNAME',
    'SEASONYEAR',
    'ARENANAME',
    'ISSALEABLE',
    'ISSOLD',
    'TENANT',
    'TICKETING_RESOLD_SEASON_VALUE',
    'TICKETING_RESOLD_NET_GAIN',
    'TICKETING_RESOLD_LIFETIME_VALUE',
    'EMAIL_VALID_EMAILADDRESS',
]

# Columns that contain only nulls
null_columns = [
    'TICKETING_GAMES_FORWARDED',
    'TICKETING_MOBILE_SCANNED_GAMES',
    'TICKETING_INFERRED_BROKER_STATUS',
    'EMAIL_SUBSCRIPTIONS',
    'EMAIL_IS_OPTED_IN',
    'CRM_ACCOUNT_OWNER',
    'CRM_LAST_ACTIVITY_DATE',
    'CRM_HAS_OPEN_OPPORTUNITY',
    'CRM_OPPORTUNITY_LAST_MODIFIED',
]

# Special cases:
# - RECENCY_SCORE and FREQUENCY_SCORE are almost entirely uniform (only 2-3 rows differ).
# - RFM_SCORE and PREV_RFM_SCORE combine the recency, monetary, and frequency scores,
#   so they are not needed.
additional_drop_columns = [
    'RECENCY_SCORE',
    'FREQUENCY_SCORE',
    'RFM_SCORE',
    'PREV_RFM_SCORE',
    'FAN_CREATEDATE',
    'FAN_UPDATEDDATE',
]

# Date columns that do not have any individual meaning
no_individual_meaning = [
    'TICKETING_LAST_EVENT_ATTENDED',
    'TICKETING_NEXT_EVENT_PURCHASED',
]

# Flatten every group into the single list of columns to drop (group order preserved)
drop_columns = [
    *id_columns,
    *uniform_columns,
    *null_columns,
    *additional_drop_columns,
    *no_individual_meaning,
]

# Remove every listed column from the frame
baseball_df = baseball_df.drop(labels=drop_columns, axis=1)

In [5]:
# Impute NA values

# 1-3. Columns where a missing value is filled with 0:
#   ISMOBILE               - NA caused by not opening mobile
#   RESOLDTOTALAMOUNT      - NA caused by tickets not being resold
#   DONATION_CURRENT_DONOR - user is not a current donor
for zero_default_column in ('ISMOBILE', 'RESOLDTOTALAMOUNT', 'DONATION_CURRENT_DONOR'):
    baseball_df[zero_default_column] = baseball_df[zero_default_column].fillna(0)

# 4. Merchandise columns: NA means the user did not buy merchandise, so fill with 0
columns_to_fill = [
    'MERCH_QUANTITY_30DAYS',
    'MERCH_TOTALSPENT_30DAYS',
    'MERCH_QUANTITY_90DAYS',
    'MERCH_TOTALSPENT_90DAYS',
    'MERCH_QUANTITY_365DAYS',
    'MERCH_TOTALSPENT_365DAYS',
    'MERCH_QUANTITY_LIFETIME',
    'MERCH_TOTALSPENT_LIFETIME',
]
merch_filled = baseball_df[columns_to_fill].fillna(0)
baseball_df[columns_to_fill] = merch_filled

# 5. Previous fan stage columns: NA is replaced by the string 'None'
for prev_stage_column in ('PREV_FAN_JOURNEY_STAGE', 'PREV_FAN_PARENT_GROUP'):
    baseball_df[prev_stage_column] = baseball_df[prev_stage_column].fillna('None')

In [6]:
# Feature Engineering

# Reference timestamp, taken once so that both donation recency features below
# are measured against the same instant.
donation_reference_time = pd.to_datetime('today')

# Parse each donation date column once
first_donation_date = pd.to_datetime(baseball_df['DONATION_FIRST_DONATION'])
last_donation_date = pd.to_datetime(baseball_df['DONATION_LAST_DONATION'])

# 1a. Flag fans who HAVE donated: 1 when a first-donation date is present, 0 otherwise
baseball_df['HAS_DONATED'] = baseball_df['DONATION_FIRST_DONATION'].notnull().astype(int)

# 1b. Whole days since the first donation; -1 marks rows with no first-donation date
baseball_df['DAYS_SINCE_FIRST_DONATION'] = (
    (donation_reference_time - first_donation_date).dt.days.astype('Int64').fillna(-1)
)

# 1c. Whole days since the last donation; -1 marks rows with no last-donation date
# NOTE(review): an earlier comment said 1c was removed because it is highly correlated
# with 1b, but the column is still created here. Confirm whether it should be excluded
# before modeling.
baseball_df['DAYS_SINCE_LAST_DONATION'] = (
    (donation_reference_time - last_donation_date).dt.days.astype('Int64').fillna(-1)
)

# Drop the raw donation date columns now that the derived features exist
baseball_df = baseball_df.drop(columns=['DONATION_LAST_DONATION', 'DONATION_FIRST_DONATION'])

  (pd.to_datetime('today') - pd.to_datetime(baseball_df['DONATION_FIRST_DONATION'])).dt.days
  (pd.to_datetime('today') - pd.to_datetime(baseball_df['DONATION_LAST_DONATION'])).dt.days


In [7]:
# Parse each email date column a single time and reuse the result below
email_opened_at = pd.to_datetime(baseball_df['EMAIL_LAST_EMAIL_OPEN'])
email_sent_at = pd.to_datetime(baseball_df['EMAIL_LAST_EMAIL_SENT'])

# 2a. Response time of the last email sent, in whole days.
#     -1 stands for a missing value, and also for a negative gap
#     (the open is dated before the last send, i.e. the user opened a non-recent email).
open_gap_days = (email_opened_at - email_sent_at).dt.days
baseball_df['EMAIL_OPEN_TIME_DIFF'] = open_gap_days.fillna(-1).astype(int).clip(lower=-1)

# 2b. Response flag for the last email sent: 0 when the gap is the -1 placeholder, else 1
baseball_df['HAS_OPENED_EMAIL'] = baseball_df['EMAIL_OPEN_TIME_DIFF'].apply(
    lambda gap_days: 0 if gap_days == -1 else 1
)

# 2c. Seasonality of the last email sent; 0 is used when there is no sent date
baseball_df['EMAIL_SENT_MONTH'] = email_sent_at.dt.month.fillna(0)
baseball_df['EMAIL_SENT_QUARTER'] = email_sent_at.dt.quarter.fillna(0)

# The raw email date columns are no longer needed
email_date_columns = ['EMAIL_LAST_EMAIL_SENT', 'EMAIL_LAST_EMAIL_OPEN']
baseball_df = baseball_df.drop(columns=email_date_columns)

  pd.to_datetime(baseball_df['EMAIL_LAST_EMAIL_OPEN']) -
  pd.to_datetime(baseball_df['EMAIL_LAST_EMAIL_SENT'])
  baseball_df['EMAIL_SENT_MONTH'] = pd.to_datetime(baseball_df['EMAIL_LAST_EMAIL_SENT']).dt.month
  baseball_df['EMAIL_SENT_QUARTER'] = pd.to_datetime(baseball_df['EMAIL_LAST_EMAIL_SENT']).dt.quarter


In [8]:
# Parse the ticket purchase dates; values that cannot be parsed are coerced to NaT
first_ticket_purchase = pd.to_datetime(
    baseball_df['TICKETING_FIRST_KNOWN_TICKET_PURCHASE'], errors='coerce'
)
last_ticket_purchase = pd.to_datetime(
    baseball_df['TICKETING_LAST_TICKET_PURCHASE'], errors='coerce'
)

# 3b. Recency of first purchase in whole days (-1 when the date is missing)
elapsed_since_first = pd.to_datetime('today') - first_ticket_purchase
baseball_df['DAYS_SINCE_FIRST_PURCHASE'] = elapsed_since_first.dt.days.fillna(-1).astype(int)

# 3c. Recency of last purchase in whole days (-1 when the date is missing)
elapsed_since_last = pd.to_datetime('today') - last_ticket_purchase
baseball_df['DAYS_SINCE_LAST_PURCHASE'] = elapsed_since_last.dt.days.fillna(-1).astype(int)

# 3d. Time span between first and last purchase in whole days (-1 when either date is missing)
purchase_span = last_ticket_purchase - first_ticket_purchase
baseball_df['DAYS_BETWEEN_FIRSTLAST_PURCHASE'] = purchase_span.dt.days.fillna(-1).astype(int)

# The raw ticket date columns are no longer needed
ticket_date_columns = ['TICKETING_FIRST_KNOWN_TICKET_PURCHASE', 'TICKETING_LAST_TICKET_PURCHASE']
baseball_df = baseball_df.drop(columns=ticket_date_columns)

  baseball_df['TICKETING_FIRST_KNOWN_TICKET_PURCHASE'] = pd.to_datetime(baseball_df['TICKETING_FIRST_KNOWN_TICKET_PURCHASE'], errors='coerce')
  baseball_df['TICKETING_LAST_TICKET_PURCHASE'] = pd.to_datetime(baseball_df['TICKETING_LAST_TICKET_PURCHASE'], errors='coerce')


In [9]:
# Parse the merchandise purchase date once
merch_last_purchase = pd.to_datetime(baseball_df['MERCH_DATE_OF_LAST_PURCHASE'])

# 4a. Whole days since the last MERCHANDISE purchase (-1 when there is no purchase date).
#     This is stored in its own column. 'DAYS_SINCE_LAST_PURCHASE' already holds the
#     ticket-purchase recency, and writing the merchandise value under that same name
#     overwrote the ticket feature.
baseball_df['DAYS_SINCE_LAST_MERCH_PURCHASE'] = (
    (pd.to_datetime('today') - merch_last_purchase).dt.days.fillna(-1).astype(int)
)

# 4b. 1 when a merchandise purchase date exists, 0 otherwise.
#     Derived from the date itself rather than from the -1 placeholder, so a real
#     day difference of -1 cannot be mistaken for "no purchase".
baseball_df['HAS_MADE_PURCHASE'] = merch_last_purchase.notna().astype(int)

# Drop merch date column
baseball_df = baseball_df.drop(columns=['MERCH_DATE_OF_LAST_PURCHASE'])

  pd.to_datetime('today') - pd.to_datetime(baseball_df['MERCH_DATE_OF_LAST_PURCHASE'])


In [10]:
# 5. CREATED NEW COLUMN BASED ON SEATING
# Define seating categories based on the section name only (the seat row is not used)
def assign_seating_category(row):
    """Return the seating category for one record, based on its section name.

    The value under ``row['SECTIONNAME']`` is converted to a string and stripped
    of surrounding whitespace before matching.

    Matching rules, in order:
      1. Exact section names map to 'Cyan', 'Black', 'Garnet' or 'Gold'.
      2. Names starting with 'GA', 'OF' or 'BOX' map to 'Not Premium'.
      3. Names starting with 'UP', 'SUITE', 'PTAB', 'PPD', 'CLUB', 'CLB', 'CK'
         or 'CYNPOR' map to 'Premium'.
      4. Anything else maps to 'Other'.

    Parameters:
        row: mapping-like record that supports ``row['SECTIONNAME']``.

    Returns:
        str: the seating category label.
    """
    section = str(row['SECTIONNAME']).strip()

    # Exact-match sections, checked in the listed order
    sections_by_color = (
        ('Cyan', ('1',)),
        ('Black', ('2', '3', '21', '22', '23', '24', '25', '26', '27', '28')),
        ('Garnet', ('4', '5', '6', '7', '17', '18', '19', '20')),
        ('Gold', ('8', '9', '10', '11', '12', '13', '14', '15', '16')),
    )
    for color, color_sections in sections_by_color:
        if section in color_sections:
            return color

    # Prefix-based sections; str.startswith accepts a tuple of candidate prefixes
    if section.startswith(('GA', 'OF', 'BOX')):
        return 'Not Premium'
    if section.startswith(('UP', 'SUITE', 'PTAB', 'PPD', 'CLUB', 'CLB', 'CK', 'CYNPOR')):
        return 'Premium'

    return 'Other'

# Label every row with its seating category and store the labels in a new SEATING column
seating_labels = baseball_df.apply(assign_seating_category, axis=1)
baseball_df['SEATING'] = seating_labels

In [11]:
# 7. Created new engagement column
# Stage names paired with scores, listed from the highest score (5) down to the lowest (1)
engagement_mapping = dict(zip(
    ['Devoted', 'Committed', 'Affiliated', 'Invested', 'Observer'],
    range(5, 0, -1),
))

# Translate each FAN_JOURNEY_STAGE value into its numeric engagement score
fan_journey_stage = baseball_df['FAN_JOURNEY_STAGE']
baseball_df['ENGAGEMENT'] = fan_journey_stage.map(engagement_mapping)

In [12]:
# Count the null values in each column; the row display limit is lifted
# so that every column's count is shown
pd.set_option('display.max_rows', None)
baseball_df.isna().sum()

GRMCONTACTID                           0
EVENTNAME                              0
SECTIONNAME                            0
ROWNAME                                0
SEAT                                   0
REVENUETOTAL                           0
PLANCODE                               0
ISMOBILE                               0
RESOLDTOTALAMOUNT                      0
ISATTENDED                             0
ISRESOLD                               0
FAN_EMAIL_MARKETABLE                   0
FAN_PHONE_MARKETABLE                   0
FAN_POSTAL_MARKETABLE                  0
FAN_UNIQUE_SOURCESYSTEM_COUNT          0
FAN_INITIAL_LEAD_SOURCE                0
FAN_LAST_LEAD_SOURCE                   0
TICKETING_CURRENTYEARSTM               0
TICKETING_PREVSEASONSTM                0
TICKETING_STM_TENURE                   0
TICKETING_GAMES_SCANNED                0
TICKETING_TICKETS_SCANNED              0
TICKETING_GAMES_SOLD_SECONDARY         0
TICKETING_GAMES_PURCHASED_SECONDARY    0
TICKETING_ATTEND

In [13]:
# Drop every ROW that still has a null in any column
# (per the original note: the data is new and these values have not been calculated)
baseball_df = baseball_df.dropna(axis=0, how='any')

In [14]:
# Write the cleaned frame to the project folder, without the row index
cleaned_csv_path = project_folder + r'\cleaned_baseball_2024.csv'
baseball_df.to_csv(cleaned_csv_path, index=False)