In [1]:
# Import package for data manipulation
import pandas as pd
import numpy as np

In [2]:
# Folder locations for the project workspace and the raw data extract.
# NOTE(review): these are absolute paths on a single machine; anyone else
# running the notebook has to edit them first.
_capstone_root = r'C:\Users\giova\Documents\School\Classes\Spring 2025\Capstone Project'
project_folder = _capstone_root + r'\WomenBasketball2024'
data_folder_loc = _capstone_root + r'\Data'

# The file name carries its own leading backslash because it is joined to the
# folder by plain string concatenation.
women_basketball_file = r'\women_basketball_2024.csv'

In [3]:
# Load the raw extract: folder and file name are joined by string concatenation
raw_csv_path = data_folder_loc + women_basketball_file
women_basketball_df = pd.read_csv(raw_csv_path)

In [4]:
# Columns removed before modelling, grouped by the reason for removal.

# Identifier / bookkeeping columns.
# GRMCONTACTID is not in this list on purpose: it is kept for the mixed
# effects logistic regression. ('GRMContactid' below is a different,
# differently-cased label.)
id_columns = [
    'Unnamed: 0',
    'GRMContactid',
    'GRMCONTACTID_FANCOPY',
    'EVENTCODE',
    'DIMCUSTOMERID',
    'ETL_ROW_HASH',
    'ETL_UPDATED_DATE',
    'TENANTID',
    'CRM_PRIMARY_CRM_ID',
    'ETL_SYNC_DELTAHASHKEY',
    'Fan Status',
]

# Columns holding the same value on every row
uniform_columns = [
    'SEASONNAME',
    'SEASONHEADERNAME',
    'SEASONYEAR',
    'ARENANAME',
    'ISSALEABLE',
    'ISSOLD',
    'TENANT',
    'TICKETING_RESOLD_SEASON_VALUE',
    'TICKETING_RESOLD_NET_GAIN',
    'TICKETING_RESOLD_LIFETIME_VALUE',
    'EMAIL_VALID_EMAILADDRESS',
]

# Columns that are null on every row
null_columns = [
    'TICKETING_GAMES_FORWARDED',
    'TICKETING_MOBILE_SCANNED_GAMES',
    'TICKETING_INFERRED_BROKER_STATUS',
    'EMAIL_SUBSCRIPTIONS',
    'EMAIL_IS_OPTED_IN',
    'CRM_ACCOUNT_OWNER',
    'CRM_LAST_ACTIVITY_DATE',
    'CRM_HAS_OPEN_OPPORTUNITY',
    'CRM_OPPORTUNITY_LAST_MODIFIED',
]

# Special cases:
# - RECENCY_SCORE and FREQUENCY_SCORE are almost uniform (only 2-3 rows differ).
# - RFM_SCORE and PREV_RFM_SCORE are a conglomeration of the recency, monetary
#   and frequency scores, so they are not needed.
additional_drop_columns = [
    'RECENCY_SCORE',
    'FREQUENCY_SCORE',
    'RFM_SCORE',
    'PREV_RFM_SCORE',
    'FAN_CREATEDATE',
    'FAN_UPDATEDDATE',
]

# Date columns that carry no individual meaning
no_individual_meaning = [
    'TICKETING_LAST_EVENT_ATTENDED',
    'TICKETING_NEXT_EVENT_PURCHASED',
]

# Flatten every group into the single list handed to drop()
drop_columns = [
    *id_columns,
    *uniform_columns,
    *null_columns,
    *additional_drop_columns,
    *no_individual_meaning,
]

# Remove the columns (raises KeyError if any listed label is absent)
women_basketball_df = women_basketball_df.drop(labels=drop_columns, axis='columns')

In [5]:
# Impute missing values

# 1-3. Single columns where a missing value is replaced by 0:
#   ISMOBILE               - caused by not opening mobile
#   RESOLDTOTALAMOUNT      - caused by tickets not being resold
#   DONATION_CURRENT_DONOR - user is not a current donor
for zero_default_column in ('ISMOBILE', 'RESOLDTOTALAMOUNT', 'DONATION_CURRENT_DONOR'):
    women_basketball_df[zero_default_column] = (
        women_basketball_df[zero_default_column].fillna(0)
    )

# 4. Merchandise quantity / spend columns: missing means no merchandise bought
columns_to_fill = [
    'MERCH_QUANTITY_30DAYS',
    'MERCH_TOTALSPENT_30DAYS',
    'MERCH_QUANTITY_90DAYS',
    'MERCH_TOTALSPENT_90DAYS',
    'MERCH_QUANTITY_365DAYS',
    'MERCH_TOTALSPENT_365DAYS',
    'MERCH_QUANTITY_LIFETIME',
    'MERCH_TOTALSPENT_LIFETIME',
]
merch_filled = women_basketball_df[columns_to_fill].fillna(value=0)
women_basketball_df[columns_to_fill] = merch_filled

# 5. Previous fan stage / group: missing values become the string 'None'
for previous_stage_column in ('PREV_FAN_JOURNEY_STAGE', 'PREV_FAN_PARENT_GROUP'):
    women_basketball_df[previous_stage_column] = (
        women_basketball_df[previous_stage_column].fillna('None')
    )

In [6]:
# Feature engineering: donation history

# 1a. Flag donors: 1 when a first-donation date is present, 0 otherwise
first_donation_raw = women_basketball_df['DONATION_FIRST_DONATION']
women_basketball_df['HAS_DONATED'] = first_donation_raw.notna().astype(int)

# 1b / 1c. Whole days elapsed since the first and since the last donation.
# Rows without a date get the sentinel -1.
# NOTE(review): an earlier comment said 1c was removed because it is highly
# correlated with 1b, yet the feature is still created here.
# NOTE(review): 'today' makes these values depend on the day the cell is run.
for feature_name, date_column in (
    ('DAYS_SINCE_FIRST_DONATION', 'DONATION_FIRST_DONATION'),
    ('DAYS_SINCE_LAST_DONATION', 'DONATION_LAST_DONATION'),
):
    elapsed = pd.to_datetime('today') - pd.to_datetime(women_basketball_df[date_column])
    # Nullable Int64 keeps missing dates as <NA> until they are replaced by -1
    women_basketball_df[feature_name] = elapsed.dt.days.astype('Int64').fillna(-1)

# The raw donation dates are no longer needed once the features exist
women_basketball_df = women_basketball_df.drop(
    columns=['DONATION_LAST_DONATION', 'DONATION_FIRST_DONATION']
)

  (pd.to_datetime('today') - pd.to_datetime(women_basketball_df['DONATION_FIRST_DONATION'])).dt.days
  (pd.to_datetime('today') - pd.to_datetime(women_basketball_df['DONATION_LAST_DONATION'])).dt.days


In [7]:
# Feature engineering: email engagement

# Parse both email timestamps once; they are reused by every feature below
last_email_open = pd.to_datetime(women_basketball_df['EMAIL_LAST_EMAIL_OPEN'])
last_email_sent = pd.to_datetime(women_basketball_df['EMAIL_LAST_EMAIL_SENT'])

# 2a. Whole days between the last email sent and the last email opened.
# Missing values become -1. Negative gaps (last open earlier than last send,
# i.e. the open belongs to an older email) are also collapsed to -1.
open_delay_days = (last_email_open - last_email_sent).dt.days.fillna(-1).astype(int)
women_basketball_df['EMAIL_OPEN_TIME_DIFF'] = open_delay_days.clip(lower=-1)

# 2b. 1 when the last email sent was opened, 0 when the delay is the -1 sentinel
women_basketball_df['HAS_OPENED_EMAIL'] = (
    women_basketball_df['EMAIL_OPEN_TIME_DIFF'].ne(-1).astype('int64')
)

# 2c. Seasonality of the last send; 0 marks rows with no send date
women_basketball_df['EMAIL_SENT_MONTH'] = last_email_sent.dt.month.fillna(0)
women_basketball_df['EMAIL_SENT_QUARTER'] = last_email_sent.dt.quarter.fillna(0)

# The raw email dates are no longer needed
women_basketball_df = women_basketball_df.drop(
    columns=['EMAIL_LAST_EMAIL_SENT', 'EMAIL_LAST_EMAIL_OPEN']
)

  pd.to_datetime(women_basketball_df['EMAIL_LAST_EMAIL_OPEN']) -
  pd.to_datetime(women_basketball_df['EMAIL_LAST_EMAIL_SENT'])
  women_basketball_df['EMAIL_SENT_MONTH'] = pd.to_datetime(women_basketball_df['EMAIL_LAST_EMAIL_SENT']).dt.month
  women_basketball_df['EMAIL_SENT_QUARTER'] = pd.to_datetime(women_basketball_df['EMAIL_LAST_EMAIL_SENT']).dt.quarter


In [8]:
# Feature engineering: ticket purchase history

# Parse the purchase dates; values that cannot be parsed become NaT
first_ticket_purchase = pd.to_datetime(
    women_basketball_df['TICKETING_FIRST_KNOWN_TICKET_PURCHASE'], errors='coerce'
)
last_ticket_purchase = pd.to_datetime(
    women_basketball_df['TICKETING_LAST_TICKET_PURCHASE'], errors='coerce'
)

# 3a. No has-first / has-last purchase flags are created: almost every user
# has both dates (only 2 have no tickets purchased).

# 3b. days since first purchase, 3c. days since last purchase,
# 3d. days between first and last purchase.
ticket_day_features = {
    'DAYS_SINCE_FIRST_PURCHASE': pd.to_datetime('today') - first_ticket_purchase,
    'DAYS_SINCE_LAST_PURCHASE': pd.to_datetime('today') - last_ticket_purchase,
    'DAYS_BETWEEN_FIRSTLAST_PURCHASE': last_ticket_purchase - first_ticket_purchase,
}
for feature_name, elapsed in ticket_day_features.items():
    # Whole days; -1 marks rows where a date was missing
    women_basketball_df[feature_name] = elapsed.dt.days.fillna(-1).astype(int)

# The raw ticket dates are no longer needed
women_basketball_df = women_basketball_df.drop(
    columns=['TICKETING_FIRST_KNOWN_TICKET_PURCHASE', 'TICKETING_LAST_TICKET_PURCHASE']
)

  women_basketball_df['TICKETING_FIRST_KNOWN_TICKET_PURCHASE'] = pd.to_datetime(women_basketball_df['TICKETING_FIRST_KNOWN_TICKET_PURCHASE'], errors='coerce')
  women_basketball_df['TICKETING_LAST_TICKET_PURCHASE'] = pd.to_datetime(women_basketball_df['TICKETING_LAST_TICKET_PURCHASE'], errors='coerce')


In [9]:
# Feature engineering: merchandise purchase history

# Parse the merchandise date once; it feeds both features below
merch_last_purchase = pd.to_datetime(women_basketball_df['MERCH_DATE_OF_LAST_PURCHASE'])

# 4a. Whole days since the last merchandise purchase; -1 marks rows with no date.
# The value is stored under its own name. Step 3c already uses
# 'DAYS_SINCE_LAST_PURCHASE' for the ticket recency, and writing the
# merchandise value to that same label silently overwrote the ticket feature.
women_basketball_df['DAYS_SINCE_LAST_MERCH_PURCHASE'] = (
    (pd.to_datetime('today') - merch_last_purchase).dt.days.fillna(-1).astype(int)
)

# 4b. 1 when the user has a merchandise purchase date, 0 otherwise.
# Derived from the date itself rather than from the -1 sentinel, so a genuine
# day count of -1 can never be mistaken for "no purchase".
women_basketball_df['HAS_MADE_PURCHASE'] = merch_last_purchase.notna().astype(int)

# The raw merchandise date is no longer needed
women_basketball_df = women_basketball_df.drop(columns=['MERCH_DATE_OF_LAST_PURCHASE'])

  pd.to_datetime('today') - pd.to_datetime(women_basketball_df['MERCH_DATE_OF_LAST_PURCHASE'])


In [10]:
# 5. Created new column based on seating
# Define seating categories based on section and row
# Row-label groups shared by several seating rules (built once, not per row)
_ROWS_1_TO_27 = frozenset(str(number) for number in range(1, 28))
_ROWS_9_TO_25 = frozenset(str(number) for number in range(9, 26))
_ROWS_AA_TO_DD = frozenset(('AA', 'BB', 'CC', 'DD'))
_ROWS_BB_TO_DD = frozenset(('BB', 'CC', 'DD'))
_PREMIER_ROWS_105 = _ROWS_AA_TO_DD | frozenset(str(number) for number in range(1, 9))


def assign_seating_category(row):
    """Return the seating category label for one ticket row.

    Parameters
    ----------
    row : mapping
        Must provide 'SECTIONNAME' and 'ROWNAME'. Both are compared against
        string labels, and the section is also tested with ``str.startswith``.

    Returns
    -------
    str
        One of 'Courtside', 'Premier Seating', 'Lower Level 1 Seating',
        'Lower Level 2 Seating', 'Upper Reserved',
        'Upper Deck General Admission', 'Suite' or 'Wheelchair'.

    Notes
    -----
    Rules are checked top to bottom and the first match wins, so the order of
    the checks below is significant. Anything that matches no rule is labelled
    'Wheelchair'.
    """
    sec = row['SECTIONNAME']
    row_label = row['ROWNAME']

    # Courtside
    if row_label == 'FL' and sec in ('101', '109', '118'):
        return 'Courtside'
    if row_label == 'AA' and sec in ('113', '115', '102', '103', '107', '116', '117'):
        return 'Courtside'

    # Premier Seating
    if sec == '105' and row_label in _PREMIER_ROWS_105:
        return 'Premier Seating'
    if sec in ('113', '115') and row_label in _ROWS_BB_TO_DD:
        return 'Premier Seating'

    # Section 114: rows CC/DD are premier; every other row is treated as
    # Lower Level 1 (personally added: not in the PDF, but based on the chart)
    if sec == '114':
        if row_label in ('CC', 'DD'):
            return 'Premier Seating'
        return 'Lower Level 1 Seating'

    # Lower Level 1 Seating
    if sec in ('104', '106'):
        return 'Lower Level 1 Seating'
    if sec in ('113', '115') and (row_label in _ROWS_1_TO_27 or row_label in _ROWS_AA_TO_DD):
        return 'Lower Level 1 Seating'
    if sec == '105' and row_label in _ROWS_9_TO_25:
        return 'Lower Level 1 Seating'

    # Lower Level 2 Seating
    if sec in ('102', '103', '107', '108', '116', '117') and (
        row_label in _ROWS_1_TO_27 or row_label in _ROWS_BB_TO_DD
    ):
        return 'Lower Level 2 Seating'
    if sec in ('101', '118') and (row_label in _ROWS_1_TO_27 or row_label in _ROWS_AA_TO_DD):
        return 'Lower Level 2 Seating'

    # Upper Reserved
    if sec in ('206', '207', '208', '209', '210', '220', '221', '222', '223', '224'):
        return 'Upper Reserved'

    # Student section
    # NOTE(review): these sections are labelled 'Upper Reserved', not a
    # separate student category - confirm that this is intended.
    if sec in ('109', '110', '111', '112', '216', '217', '218'):
        return 'Upper Reserved'

    # Upper Deck General Admission
    if sec.startswith('G'):
        return 'Upper Deck General Admission'

    # Suite admission
    if sec.startswith('STE'):
        return 'Suite'

    # Catch-all for every section / row combination not matched above
    return 'Wheelchair'

# Label every ticket row with its seating category (evaluated row by row)
seating_labels = women_basketball_df.apply(assign_seating_category, axis=1)
women_basketball_df['SEATING'] = seating_labels

In [11]:
# Home game dates keyed by event name (month/day/two-digit year), listed in
# calendar order
_MATCH_DATE_BY_EVENT = {
    'vs. Coppin State': '11/14/24',
    'vs. East Carolina': '11/17/24',
    'vs. Duke': '12/5/24',
    'vs. South Florida': '12/15/24',
    'vs. Charleston Southern': '12/19/24',
    'vs. Wofford': '12/29/24',
    'vs. Texas A&M': '1/9/25',
    'vs. Texas': '1/12/25',
    'vs. Oklahoma': '1/19/25',
    'vs. LSU': '1/24/25',
    'vs. Auburn': '2/2/25',
    'vs. Florida': '2/13/25',
    'vs. UConn': '2/16/25',
    'vs. Arkansas': '2/20/25',
    'vs. Kentucky': '3/2/25',
}


def get_match_date(event_name):
    """Look up the game date for an event name.

    Returns the date as an 'm/d/yy' string, or the string 'Unknown Date' when
    the event name is not in the lookup table.
    """
    return _MATCH_DATE_BY_EVENT.get(event_name, 'Unknown Date')

# Tip-off times keyed by event name (24-hour 'HH:MM' strings), listed in the
# calendar order of the games
_MATCH_TIME_BY_EVENT = {
    'vs. Coppin State': '19:00',
    'vs. East Carolina': '14:00',
    'vs. Duke': '21:00',
    'vs. South Florida': '14:00',
    'vs. Charleston Southern': '12:00',
    'vs. Wofford': '14:00',
    'vs. Texas A&M': '17:00',
    'vs. Texas': '13:00',
    'vs. Oklahoma': '15:00',
    'vs. LSU': '17:00',
    'vs. Auburn': '12:00',
    'vs. Florida': '19:00',
    'vs. UConn': '13:00',
    'vs. Arkansas': '19:00',
    'vs. Kentucky': '14:00',
}


def get_match_time(event_name):
    """Look up the start time for an event name.

    Returns the time as an 'HH:MM' string. For an event name that is not in
    the lookup table the fallback is the string 'Unknown Date' (the same
    sentinel get_match_date uses, even though this is a time).
    """
    return _MATCH_TIME_BY_EVENT.get(event_name, 'Unknown Date')

# Derive game date, weekday name and start time from the event name.
# Event names without a known date yield 'Unknown Date', which errors='coerce'
# turns into NaT.
event_names = women_basketball_df['EVENTNAME']
parsed_match_dates = pd.to_datetime(
    event_names.apply(get_match_date), format='%m/%d/%y', errors='coerce'
)
women_basketball_df['MATCH_DATE'] = parsed_match_dates
women_basketball_df['MATCH_DAY'] = parsed_match_dates.dt.day_name()
women_basketball_df['MATCH_TIME'] = event_names.apply(get_match_time)

In [12]:
# 6a. Game order
# Home games in the order they are played; a game's 1-based position in this
# list is its match order.
_home_games_in_order = [
    'vs. Coppin State',
    'vs. East Carolina',
    'vs. Duke',
    'vs. South Florida',
    'vs. Charleston Southern',
    'vs. Wofford',
    'vs. Texas A&M',
    'vs. Texas',
    'vs. Oklahoma',
    'vs. LSU',
    'vs. Auburn',
    'vs. Florida',
    'vs. UConn',
    'vs. Arkansas',
    'vs. Kentucky',
]
match_order = {
    event: position for position, event in enumerate(_home_games_in_order, start=1)
}

# Attach the game number to every row; event names that are not keys of
# match_order map to NaN
event_name_column = women_basketball_df['EVENTNAME']
women_basketball_df['MATCH_ORDER'] = event_name_column.map(match_order)

# 6b. UofSC ranking before each game, keyed by event name and listed in the
# calendar order of the games
_SC_RANK_BY_EVENT = {
    'vs. Coppin State': 1,
    'vs. East Carolina': 1,
    'vs. Duke': 3,
    'vs. South Florida': 3,
    'vs. Charleston Southern': 2,
    'vs. Wofford': 2,
    'vs. Texas A&M': 2,
    'vs. Texas': 2,
    'vs. Oklahoma': 5,
    'vs. LSU': 2,
    'vs. Auburn': 2,
    'vs. Florida': 4,
    'vs. UConn': 4,
    'vs. Arkansas': 6,
    'vs. Kentucky': 6,
}


def get_uofsc_rank(event_name):
    """Look up UofSC's ranking going into the given game.

    Returns the rank as an int, or the string 'Unknown Rank' when the event
    name is not in the lookup table (so the result type is mixed in that case).
    """
    return _SC_RANK_BY_EVENT.get(event_name, 'Unknown Rank')

# UofSC's pre-game ranking for every row, looked up from the event name
sc_rank_by_row = women_basketball_df['EVENTNAME'].apply(get_uofsc_rank)
women_basketball_df['SC_RANK'] = sc_rank_by_row

# 6c. Whether the opponent was in the top 25 before the game
_TOP25_OPPONENT_EVENTS = frozenset((
    'vs. UConn',
    'vs. LSU',
    'vs. Duke',
    'vs. Texas',
    'vs. Oklahoma',
    'vs. Kentucky',
))
_UNRANKED_OPPONENT_EVENTS = frozenset((
    'vs. Auburn',
    'vs. Florida',
    'vs. Arkansas',
    'vs. Wofford',
    'vs. South Florida',
    'vs. East Carolina',
    'vs. Coppin State',
    'vs. Texas A&M',
    'vs. Charleston Southern',
))


def get_opp_rank(event_name):
    """Return 1 if the opponent was ranked in the top 25 before the game, else 0.

    Event names that appear in neither list give the string 'Unknown Rank'.
    """
    if event_name in _TOP25_OPPONENT_EVENTS:
        return 1
    if event_name in _UNRANKED_OPPONENT_EVENTS:
        return 0
    return 'Unknown Rank'

# Top-25 opponent flag for every row, looked up from the event name
opponent_ranked_by_row = women_basketball_df['EVENTNAME'].apply(get_opp_rank)
women_basketball_df['OPPONENT_RANKED'] = opponent_ranked_by_row

In [13]:
# 7. Engagement score
# Fan journey stages from most to least engaged, paired with scores 5 down to 1
_stages_most_to_least_engaged = ['Devoted', 'Committed', 'Affiliated', 'Invested', 'Observer']
engagement_mapping = dict(zip(_stages_most_to_least_engaged, range(5, 0, -1)))

# Translate FAN_JOURNEY_STAGE into its numeric engagement score; stages that
# are not keys of engagement_mapping map to NaN
fan_journey_stage = women_basketball_df['FAN_JOURNEY_STAGE']
women_basketball_df['ENGAGEMENT'] = fan_journey_stage.map(engagement_mapping)

In [14]:
# Count the missing values in each column.
# The display row limit is lifted (for the whole session) so that every
# column appears in the output.
pd.options.display.max_rows = None
women_basketball_df.isna().sum()

EVENTNAME                                0
SECTIONNAME                              0
ROWNAME                                  0
SEAT                                     0
REVENUETOTAL                             0
PLANCODE                                 0
ISMOBILE                                 0
RESOLDTOTALAMOUNT                        0
ISATTENDED                               0
ISRESOLD                                 0
GRMCONTACTID                             0
FAN_EMAIL_MARKETABLE                     0
FAN_PHONE_MARKETABLE                     0
FAN_POSTAL_MARKETABLE                    0
FAN_UNIQUE_SOURCESYSTEM_COUNT            0
FAN_INITIAL_LEAD_SOURCE                  0
FAN_LAST_LEAD_SOURCE                     0
TICKETING_CURRENTYEARSTM               107
TICKETING_PREVSEASONSTM                107
TICKETING_STM_TENURE                   107
TICKETING_GAMES_SCANNED                107
TICKETING_TICKETS_SCANNED              107
TICKETING_GAMES_SOLD_SECONDARY         107
TICKETING_G

In [15]:
# Drop every ROW that still has a missing value in any column (engineered
# columns included). Stated rationale: the data for these rows is new and the
# fields have not been calculated yet.
women_basketball_df = women_basketball_df.dropna(axis=0, how='any')

In [16]:
# Write the cleaned dataset into the project folder, without the index column
cleaned_csv_path = project_folder + r'\cleaned_women_basketball_2024.csv'
women_basketball_df.to_csv(cleaned_csv_path, index=False)