# Generating Marathon Data for SQL Practice

This is a Python Jupyter notebook for creating a full mock SQL database of marathon race results with thousands of records. This script simulates a realistic dataset, including both elite and recreational runners, with varied performance data across multiple marathon events. There are several steps to this:

1. **Generate 'Runners' Table** - Creates a list of 1000 runners, mixing famous elite and athletes and randomly generated recreational runners.
2. **Generate 'Events' Table** - Create data for different marathon events, spanning different years.
3. **Generate 'Results' Table** - Assign finishing times and positions to runners for each event.
4. **Generate 'Sponsors' Table** - Associates elite runners with sponsors.
5. **Generate 'Training Plans' Table** - Assign training plans to a subset of runners.

## Import packages

In [5]:
pip install faker




In [6]:
import random
from faker import Faker
import datetime
import pandas as pd

In [7]:
fake = Faker()

## Define the records for tables and helper functions

In [9]:
# Helper functions
def random_time():
    """Generate a random marathon finish time"""
    hours = random.randint(2, 5)
    minutes = random.randint(0, 59)
    seconds = random.randint(0, 59)
    return f'{hours:02}:{minutes:02}:{seconds:02}'

In [10]:
# Define the number of records
num_runners = 1000
num_events = 12 # 12 for the 6 marathons over 2 years (2012-2023)
num_results = num_runners * num_events
num_sponsors = 100
num_training_plans = 600

In [11]:
# Define the major marathons and their exact dates
marathons = [
    ("Boston Marathon", [
        "2012-04-16", "2013-04-15", "2014-04-21", "2015-04-20", "2016-04-18",
        "2017-04-17", "2018-04-16", "2019-04-15", "2020-10-11", "2021-10-11",
        "2022-04-18", "2023-04-17"
    ]),
    ("London Marathon", [
        "2012-04-22", "2013-04-21", "2014-04-21", "2015-04-26", "2016-04-24",
        "2017-04-23", "2018-04-22", "2019-04-28", "2020-10-04", "2021-10-03",
        "2022-10-02", "2023-04-23"
    ]),
    ("New York City Marathon", [
        "2012-11-04", "2013-11-03", "2014-11-02", "2015-11-01", "2016-11-06",
        "2017-11-05", "2018-11-04", "2019-11-03", "2020-12-13", "2021-11-07",
        "2022-11-06", "2023-11-05"
    ]),
    ("Berlin Marathon", [
        "2012-09-30", "2013-09-29", "2014-09-28", "2015-09-27", "2016-09-25",
        "2017-09-24", "2018-09-16", "2019-09-29", "2020-09-27", "2021-09-26",
        "2022-09-25", "2023-09-24"
    ]),
    ("Chicago Marathon", [
        "2012-10-07", "2013-10-13", "2014-10-12", "2015-10-11", "2016-10-09",
        "2017-10-08", "2018-10-07", "2019-10-13", "2020-10-11", "2021-10-10",
        "2022-10-09", "2023-10-08"
    ]),
    ("Tokyo Marathon", [
        "2012-02-26", "2013-02-24", "2014-02-23", "2015-02-22", "2016-02-28",
        "2017-02-26", "2018-02-25", "2019-03-03", "2020-03-01", "2021-03-07",
        "2022-03-06", "2023-03-05"
    ])
]

In [12]:
# Define predefined training plan descriptions
training_plan_descriptions = [
    "Beginner Marathon Plan: 16-week build-up",
    "Intermediate Marathon Plan: 12-week build-up",
    "Advanced Marathon Plan: 18-week build-up",
    "Elite Marathon Plan: 24-week build-up",
    "5K to Marathon Transition Plan: 20-week build-up",
    "Half-Marathon to Marathon Plan: 14-week build-up",
    "Strength and Conditioning Focus Plan: 10-week build-up",
    "Speed Work Focus Plan: 12-week build-up",
    "Long Distance Endurance Focus Plan: 22-week build-up",
    "Customized Marathon Plan: 16-week build-up"
]

## Generate Tables

In [77]:
# Generate Runners Table
runners = []
for i in range(1, num_runners + 1):
    # Randomly select a category for each runner
    category = random.choice(['Male', 'Female'])
    
    runners.append((
        i,
        fake.first_name(),
        fake.last_name(),
        fake.date_of_birth(minimum_age=18, maximum_age=90),
        category,  # Assign the randomly chosen category
        fake.country()
    ))

In [79]:
unique_countries = set(runner[5] for runner in runners)
print("Unique countries:", unique_countries)

Unique countries: {'Algeria', 'Armenia', 'Vanuatu', 'Comoros', 'American Samoa', 'Dominican Republic', 'Pitcairn Islands', 'Colombia', 'Benin', 'Reunion', 'Libyan Arab Jamahiriya', 'Suriname', 'Andorra', 'Western Sahara', 'Mali', 'Spain', 'Belize', 'Australia', 'Croatia', 'Saudi Arabia', 'Mexico', 'Austria', 'Japan', 'Latvia', 'Sweden', 'Indonesia', 'Tokelau', 'French Southern Territories', 'Antigua and Barbuda', 'Turkmenistan', 'Thailand', 'Saint Pierre and Miquelon', 'Zambia', 'Liberia', 'Hungary', 'Micronesia', 'Nauru', 'Slovakia (Slovak Republic)', 'Israel', 'Tunisia', 'Eritrea', 'Sao Tome and Principe', 'Mongolia', 'Equatorial Guinea', 'Cook Islands', 'Venezuela', 'Vietnam', 'French Polynesia', 'Tuvalu', 'Uruguay', 'Brazil', 'Cuba', 'Norfolk Island', 'Serbia', 'Philippines', 'Gabon', 'Kenya', 'Slovenia', 'Guernsey', 'Georgia', 'Palestinian Territory', 'British Indian Ocean Territory (Chagos Archipelago)', 'Bahamas', 'Belgium', 'Martinique', 'Solomon Islands', 'Guadeloupe', 'Denmar

In [81]:
print(runners)

[(1, 'Jon', 'Garrison', datetime.date(1968, 12, 26), 'Female Wheelchair', 'Egypt'), (2, 'Anita', 'Gross', datetime.date(1952, 3, 5), 'Male', 'South Africa'), (3, 'Michael', 'Hernandez', datetime.date(1992, 9, 20), 'Female Wheelchair', 'Ireland'), (4, 'Timothy', 'Hill', datetime.date(1950, 12, 30), 'Male Wheelchair', 'Cape Verde'), (5, 'Robert', 'Brady', datetime.date(1942, 11, 27), 'Male', 'Qatar'), (6, 'Kenneth', 'Turner', datetime.date(1936, 10, 5), 'Male', 'Mauritania'), (7, 'Matthew', 'Cook', datetime.date(1989, 2, 10), 'Female', 'United Arab Emirates'), (8, 'Derrick', 'Barnes', datetime.date(2000, 4, 17), 'Female', 'Lithuania'), (9, 'Rebecca', 'Wagner', datetime.date(1940, 5, 6), 'Male', 'Zambia'), (10, 'Daniel', 'Olson', datetime.date(1968, 6, 2), 'Female', 'Norway'), (11, 'Carl', 'Collins', datetime.date(1988, 8, 14), 'Female', 'Lebanon'), (12, 'Lisa', 'Fischer', datetime.date(1969, 10, 4), 'Male', 'United Arab Emirates'), (13, 'Daniel', 'Graham', datetime.date(1939, 11, 8), 'Fe

In [65]:
runners_countries = runners[5]
print(runners_countries)

(6, 'James', 'Alvarado', datetime.date(1994, 6, 9), 'American Samoa')


In [16]:
# Generate Events Table
events = []
event_id = 1
for marathon_name, dates in marathons:
    for event_date in dates:
        events.append((
            event_id,
            marathon_name,
            datetime.datetime.strptime(event_date, "%Y-%m-%d").date()  # Ensure event_date is a datetime.date object 
        ))
        event_id += 1

In [17]:
print(events)

[(1, 'Boston Marathon', datetime.date(2012, 4, 16)), (2, 'Boston Marathon', datetime.date(2013, 4, 15)), (3, 'Boston Marathon', datetime.date(2014, 4, 21)), (4, 'Boston Marathon', datetime.date(2015, 4, 20)), (5, 'Boston Marathon', datetime.date(2016, 4, 18)), (6, 'Boston Marathon', datetime.date(2017, 4, 17)), (7, 'Boston Marathon', datetime.date(2018, 4, 16)), (8, 'Boston Marathon', datetime.date(2019, 4, 15)), (9, 'Boston Marathon', datetime.date(2020, 10, 11)), (10, 'Boston Marathon', datetime.date(2021, 10, 11)), (11, 'Boston Marathon', datetime.date(2022, 4, 18)), (12, 'Boston Marathon', datetime.date(2023, 4, 17)), (13, 'London Marathon', datetime.date(2012, 4, 22)), (14, 'London Marathon', datetime.date(2013, 4, 21)), (15, 'London Marathon', datetime.date(2014, 4, 21)), (16, 'London Marathon', datetime.date(2015, 4, 26)), (17, 'London Marathon', datetime.date(2016, 4, 24)), (18, 'London Marathon', datetime.date(2017, 4, 23)), (19, 'London Marathon', datetime.date(2018, 4, 22)),

In [18]:
# Generate Results Table
results = []
for i in range(1, num_results + 1):
    runner_id = random.randint(1, num_runners)
    event_id = ((i - 1) % num_events) + 1
    runner = next(r for r in runners if r[0] == runner_id)
    finish_time = random_time()
    position = random.randint(1, 5000)
    category = random.choice(['Male', 'Female'])
    is_elite = False
    
    # Determine if the runner's finish time is elite
    if category == 'Male' and float(finish_time[:2]) < 2.15:
        is_elite = True
    elif category == 'Female' and float(finish_time[:2]) < 2.30:
        is_elite = True
    elif category == 'Wheelchair' and float(finish_time[:2]) < 1.20:
        is_elite = True

    results.append((
        i,
        event_id,
        runner_id,
        finish_time,
        position,
        category,
        is_elite
    ))

In [19]:
print(results)

[(1, 1, 410, '04:07:38', 1619, 'Male', False), (2, 2, 658, '05:51:08', 2629, 'Male', False), (3, 3, 355, '03:59:15', 3614, 'Male', False), (4, 4, 765, '03:12:22', 536, 'Wheelchair', False), (5, 5, 125, '03:33:58', 1245, 'Male', False), (6, 6, 49, '03:21:24', 1875, 'Female', False), (7, 7, 558, '05:18:12', 2833, 'Wheelchair', False), (8, 8, 91, '02:58:52', 2841, 'Male', True), (9, 9, 290, '04:47:15', 4710, 'Wheelchair', False), (10, 10, 460, '05:16:58', 1640, 'Wheelchair', False), (11, 11, 190, '02:50:56', 1857, 'Male', True), (12, 12, 333, '02:07:38', 1567, 'Male', True), (13, 1, 42, '05:29:33', 825, 'Female', False), (14, 2, 95, '05:02:36', 4642, 'Wheelchair', False), (15, 3, 818, '02:52:29', 1838, 'Female', True), (16, 4, 441, '04:17:10', 4311, 'Male', False), (17, 5, 385, '05:13:49', 1524, 'Female', False), (18, 6, 70, '03:17:30', 2094, 'Male', False), (19, 7, 429, '04:39:25', 1149, 'Female', False), (20, 8, 162, '02:18:19', 1461, 'Female', True), (21, 9, 725, '04:16:49', 605, 'Fema

In [20]:
# Identify elite runners based on the 'results' table
elite_runners = [runner_id for runner_id, _, _, _, _, _, is_elite in results if is_elite]

In [21]:
print(elite_runners)

[8, 11, 12, 15, 20, 23, 40, 42, 48, 54, 59, 66, 77, 94, 99, 101, 108, 121, 126, 130, 131, 132, 140, 141, 142, 143, 147, 157, 158, 161, 163, 164, 169, 174, 175, 177, 193, 202, 206, 208, 211, 231, 238, 240, 241, 242, 243, 259, 260, 276, 279, 280, 284, 285, 288, 291, 309, 312, 314, 316, 318, 319, 321, 333, 339, 343, 347, 360, 364, 365, 369, 370, 376, 377, 384, 392, 404, 416, 421, 428, 441, 443, 444, 445, 456, 466, 474, 494, 496, 505, 515, 524, 527, 538, 545, 556, 557, 560, 562, 563, 568, 569, 570, 575, 594, 595, 597, 604, 613, 620, 621, 628, 631, 641, 644, 651, 652, 659, 660, 662, 664, 682, 683, 692, 696, 700, 705, 710, 715, 718, 720, 722, 732, 736, 737, 742, 744, 745, 746, 767, 768, 772, 773, 783, 792, 799, 814, 830, 835, 841, 847, 849, 856, 857, 861, 862, 867, 880, 889, 903, 905, 913, 914, 918, 919, 953, 957, 962, 966, 968, 971, 974, 988, 989, 999, 1003, 1011, 1012, 1022, 1045, 1061, 1062, 1064, 1072, 1078, 1079, 1089, 1096, 1098, 1100, 1102, 1111, 1119, 1121, 1122, 1129, 1130, 1132, 11

In [22]:
# Generate Sponsors Table
sponsors = [
    "Nike",
    "Adidas",
    "Asics",
    "Saucony",
    "Hoka One One",
    "Brooks",
    "New Balance",
    "Puma",
    "Under Armour",
    "Tracksmith"
]

# Select 50% of elite runners to have sponsors
sponsored_runners = random.sample(elite_runners, len(elite_runners) // 2)




# Generate Sponsors Table
sponsors_data = []
for i, runner_id in enumerate(sponsored_runners, start=1):
    sponsor_name = random.choice(sponsors)
    sponsors_data.append((
        i,
        sponsor_name,
        runner_id
    ))

In [23]:
print(sponsors_data)

[(1, 'Nike', 3065), (2, 'Asics', 6584), (3, 'Nike', 5965), (4, 'Hoka One One', 5440), (5, 'Asics', 4315), (6, 'Puma', 7592), (7, 'Hoka One One', 5758), (8, 'Nike', 5854), (9, 'Hoka One One', 1130), (10, 'Under Armour', 8420), (11, 'Saucony', 5060), (12, 'New Balance', 3615), (13, 'Hoka One One', 141), (14, 'Saucony', 9580), (15, 'Adidas', 7034), (16, 'Adidas', 5501), (17, 'Adidas', 4311), (18, 'Saucony', 8417), (19, 'New Balance', 5872), (20, 'Adidas', 6611), (21, 'Puma', 9971), (22, 'Adidas', 3935), (23, 'Under Armour', 6955), (24, 'Tracksmith', 6872), (25, 'Hoka One One', 1619), (26, 'Under Armour', 3485), (27, 'Brooks', 157), (28, 'Under Armour', 1610), (29, 'Brooks', 5661), (30, 'New Balance', 5479), (31, 'Tracksmith', 8715), (32, 'Nike', 10152), (33, 'Puma', 745), (34, 'Hoka One One', 9943), (35, 'Under Armour', 6808), (36, 'Hoka One One', 4063), (37, 'Under Armour', 7731), (38, 'Saucony', 767), (39, 'Adidas', 2724), (40, 'Asics', 1423), (41, 'Under Armour', 8632), (42, 'Asics', 8

In [24]:
# Predefined list of training plan descriptions
training_plan_durations = {
    "Beginner Marathon Plan: 16-week build-up": 16,
    "Intermediate Marathon Plan: 12-week build-up": 12,
    "Advanced Marathon Plan: 18-week build-up": 18,
    "Elite Marathon Plan: 24-week build-up": 24,
    "5K to Marathon Transition Plan: 20-week build-up": 20,
    "Half-Marathon to Marathon Plan: 14-week build-up": 14,
    "Strength and Conditioning Focus Plan: 10-week build-up": 10,
    "Speed Work Focus Plan: 12-week build-up": 12,
    "Long Distance Endurance Focus Plan: 22-week build-up": 22,
    "Customized Marathon Plan: 16-week build-up": 16
}

In [25]:
# Select 60% of runners to have training plans
all_runner_ids = [runner[0] for runner in runners]
selected_runner_ids = random.sample(all_runner_ids, int(len(all_runner_ids) * 0.60))

In [45]:
# Generate Training Plans Table
training_plans = []
plan_id = 1


for event in events:
    event_id = event[0]
    event_name = event[1]
    event_date = event[2]
    
    for runner_id in selected_runner_ids:
        # Randomly select a training plan description
        plan_description = random.choice(list(training_plan_durations.keys()))
        
        # Get the duration of the training plan in weeks from the dictionary
        plan_length_weeks = training_plan_durations[plan_description]
        
        # Calculate the start date
        start_date = event_date - datetime.timedelta(weeks=plan_length_weeks)
        
        # Ensure we generate enough training plans
        if plan_id <= num_training_plans:
            training_plans.append((
                plan_id,
                runner_id,
                start_date,
                event_date,
                plan_description
            ))
            plan_id += 1


In [47]:
print(training_plans)

[(1, 179, datetime.date(2012, 2, 6), datetime.date(2012, 4, 16), 'Strength and Conditioning Focus Plan: 10-week build-up'), (2, 706, datetime.date(2011, 12, 26), datetime.date(2012, 4, 16), 'Beginner Marathon Plan: 16-week build-up'), (3, 658, datetime.date(2011, 12, 26), datetime.date(2012, 4, 16), 'Beginner Marathon Plan: 16-week build-up'), (4, 703, datetime.date(2012, 1, 23), datetime.date(2012, 4, 16), 'Intermediate Marathon Plan: 12-week build-up'), (5, 608, datetime.date(2011, 10, 31), datetime.date(2012, 4, 16), 'Elite Marathon Plan: 24-week build-up'), (6, 166, datetime.date(2011, 11, 28), datetime.date(2012, 4, 16), '5K to Marathon Transition Plan: 20-week build-up'), (7, 676, datetime.date(2012, 1, 23), datetime.date(2012, 4, 16), 'Intermediate Marathon Plan: 12-week build-up'), (8, 19, datetime.date(2011, 11, 28), datetime.date(2012, 4, 16), '5K to Marathon Transition Plan: 20-week build-up'), (9, 331, datetime.date(2011, 10, 31), datetime.date(2012, 4, 16), 'Elite Marathon

## Making a function to generate SQL insert statements, generating SQL statements, and combining all SQL scripts

In [49]:
# Function to generate SQL insert statements
def generate_sql_inserts(table_name, columns, data):
    sql = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES\n"
    values = []
    for row in data:
        formatted_values = ', '.join([f"'{str(v)}'" if isinstance(v, str) or isinstance(v, datetime.date) else str(v) for v in row])
        values.append(f"({formatted_values})")
    sql += ',\n'.join(values) + ";\n"
    return sql

In [51]:
# Generate SQL statements
runner_columns = ['runner_id', 'first_name', 'last_name', 'date_of_birth', 'country', 'elite_status']
event_columns = ['event_id', 'event_year', 'event_date', 'location']
result_columns = ['result_id', 'event_id', 'runner_id', 'finish_time', 'position', 'category']
sponsor_columns = ['sponsor_id', 'sponsor_name', 'runner_id']
training_plan_columns = ['plan_id', 'runner_id', 'start_date', 'end_date', 'description']

sql_runners = generate_sql_inserts('Runners', runner_columns, runners)
sql_events = generate_sql_inserts('Events', event_columns, events)
sql_results = generate_sql_inserts('Results', result_columns, results)
sql_sponsors = generate_sql_inserts('Sponsors', sponsor_columns, sponsors)
sql_training_plans = generate_sql_inserts('TrainingPlans', training_plan_columns, training_plans)


In [53]:
print(sql_runners)

INSERT INTO Runners (runner_id, first_name, last_name, date_of_birth, country, elite_status) VALUES
(1, 'Angelica', 'Middleton', '2006-01-04', 'Senegal'),
(2, 'Amy', 'White', '1990-08-05', 'Burundi'),
(3, 'Christy', 'Beard', '1976-02-20', 'Czech Republic'),
(4, 'Nicole', 'Ramirez', '1985-05-18', 'Aruba'),
(5, 'Nicole', 'Hernandez', '2001-05-11', 'United Kingdom'),
(6, 'James', 'Alvarado', '1994-06-09', 'American Samoa'),
(7, 'Andrew', 'Reed', '1941-04-23', 'Nauru'),
(8, 'Colleen', 'Bryant', '1949-08-23', 'Isle of Man'),
(9, 'Debbie', 'Steele', '1962-07-03', 'Slovakia (Slovak Republic)'),
(10, 'Tyler', 'Rodriguez', '2002-04-26', 'Honduras'),
(11, 'Ronald', 'Gibson', '1946-12-15', 'Costa Rica'),
(12, 'Denise', 'Long', '1948-03-23', 'Brunei Darussalam'),
(13, 'Douglas', 'Jackson', '1988-04-07', 'Bahamas'),
(14, 'Lisa', 'Irwin', '1977-01-09', 'Azerbaijan'),
(15, 'Juan', 'Elliott', '1964-01-28', 'British Indian Ocean Territory (Chagos Archipelago)'),
(16, 'Mark', 'Dawson', '1941-03-19', 'La

In [55]:
# Combine all SQL scripts
sql_script = sql_runners + sql_events + sql_results + sql_sponsors + sql_training_plans


In [57]:
print(sql_script)

INSERT INTO Runners (runner_id, first_name, last_name, date_of_birth, country, elite_status) VALUES
(1, 'Angelica', 'Middleton', '2006-01-04', 'Senegal'),
(2, 'Amy', 'White', '1990-08-05', 'Burundi'),
(3, 'Christy', 'Beard', '1976-02-20', 'Czech Republic'),
(4, 'Nicole', 'Ramirez', '1985-05-18', 'Aruba'),
(5, 'Nicole', 'Hernandez', '2001-05-11', 'United Kingdom'),
(6, 'James', 'Alvarado', '1994-06-09', 'American Samoa'),
(7, 'Andrew', 'Reed', '1941-04-23', 'Nauru'),
(8, 'Colleen', 'Bryant', '1949-08-23', 'Isle of Man'),
(9, 'Debbie', 'Steele', '1962-07-03', 'Slovakia (Slovak Republic)'),
(10, 'Tyler', 'Rodriguez', '2002-04-26', 'Honduras'),
(11, 'Ronald', 'Gibson', '1946-12-15', 'Costa Rica'),
(12, 'Denise', 'Long', '1948-03-23', 'Brunei Darussalam'),
(13, 'Douglas', 'Jackson', '1988-04-07', 'Bahamas'),
(14, 'Lisa', 'Irwin', '1977-01-09', 'Azerbaijan'),
(15, 'Juan', 'Elliott', '1964-01-28', 'British Indian Ocean Territory (Chagos Archipelago)'),
(16, 'Mark', 'Dawson', '1941-03-19', 'La