# Generating Marathon Data for SQL Practice

This is a Python Jupyter notebook for creating a full mock SQL database of marathon race results with thousands of records. This script simulates a realistic dataset, including both elite and recreational runners, with varied performance data across multiple marathon events. There are several steps to this:

1. **Generate 'Runners' Table** - Creates a list of 1000 runners, mixing famous elite and athletes and randomly generated recreational runners.
2. **Generate 'Events' Table** - Create data for different marathon events, spanning different years.
3. **Generate 'Results' Table** - Assign finishing times and positions to runners for each event.
4. **Generate 'Sponsors' Table** - Associates elite runners with sponsors.
5. **Generate 'Training Plans' Table** - Assign training plans to a subset of runners.

## Import packages

In [5]:
pip install faker

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install unidecode

Note: you may need to restart the kernel to use updated packages.


In [7]:
import random
from faker import Faker
import datetime
import pandas as pd

In [8]:
fake = Faker()

## Define the records for tables and helper functions

In [10]:
# Helper functions
def random_time():
    """Generate a random marathon finish time"""
    hours = random.randint(2, 5)
    minutes = random.randint(0, 59)
    seconds = random.randint(0, 59)
    return f'{hours:02}:{minutes:02}:{seconds:02}'

In [11]:
# Define the number of records
num_runners = 1000
num_events = 12 # 12 for the 6 marathons over 2 years (2012-2023)
num_results = num_runners * num_events
num_sponsors = 100
num_training_plans = 600

In [12]:
# Define the major marathons and their exact dates
marathons = [
    ("Boston Marathon", [
        "2012-04-16", "2013-04-15", "2014-04-21", "2015-04-20", "2016-04-18",
        "2017-04-17", "2018-04-16", "2019-04-15", "2020-10-11", "2021-10-11",
        "2022-04-18", "2023-04-17"
    ]),
    ("London Marathon", [
        "2012-04-22", "2013-04-21", "2014-04-21", "2015-04-26", "2016-04-24",
        "2017-04-23", "2018-04-22", "2019-04-28", "2020-10-04", "2021-10-03",
        "2022-10-02", "2023-04-23"
    ]),
    ("New York City Marathon", [
        "2012-11-04", "2013-11-03", "2014-11-02", "2015-11-01", "2016-11-06",
        "2017-11-05", "2018-11-04", "2019-11-03", "2020-12-13", "2021-11-07",
        "2022-11-06", "2023-11-05"
    ]),
    ("Berlin Marathon", [
        "2012-09-30", "2013-09-29", "2014-09-28", "2015-09-27", "2016-09-25",
        "2017-09-24", "2018-09-16", "2019-09-29", "2020-09-27", "2021-09-26",
        "2022-09-25", "2023-09-24"
    ]),
    ("Chicago Marathon", [
        "2012-10-07", "2013-10-13", "2014-10-12", "2015-10-11", "2016-10-09",
        "2017-10-08", "2018-10-07", "2019-10-13", "2020-10-11", "2021-10-10",
        "2022-10-09", "2023-10-08"
    ]),
    ("Tokyo Marathon", [
        "2012-02-26", "2013-02-24", "2014-02-23", "2015-02-22", "2016-02-28",
        "2017-02-26", "2018-02-25", "2019-03-03", "2020-03-01", "2021-03-07",
        "2022-03-06", "2023-03-05"
    ])
]

In [13]:
# Define predefined training plan descriptions
training_plan_descriptions = [
    "Beginner Marathon Plan: 16-week build-up",
    "Intermediate Marathon Plan: 12-week build-up",
    "Advanced Marathon Plan: 18-week build-up",
    "Elite Marathon Plan: 24-week build-up",
    "5K to Marathon Transition Plan: 20-week build-up",
    "Half-Marathon to Marathon Plan: 14-week build-up",
    "Strength and Conditioning Focus Plan: 10-week build-up",
    "Speed Work Focus Plan: 12-week build-up",
    "Long Distance Endurance Focus Plan: 22-week build-up",
    "Customized Marathon Plan: 16-week build-up"
]

## Generate Tables

In [15]:
import random
from faker import Faker
from unidecode import unidecode  # Import the unidecode function

# Locale-country map dictionary
locale_country_map = {
    'ar_AA': 'Arabic',
    'ar_AE': 'United Arab Emirates',
    'ar_BH': 'Bahrain',
    'ar_EG': 'Egypt',
    'ar_JO': 'Jordan',
    'ar_PS': 'Palestine',
    'ar_SA': 'Saudi Arabia',
    'az_AZ': 'Azerbaijan',
    'bg_BG': 'Bulgaria',
    'bn_BD': 'Bangladesh',
    'bs_BA': 'Bosnia and Herzegovina',
    'cs_CZ': 'Czech Republic',
    'da_DK': 'Denmark',
    'de_AT': 'Austria',
    'de_CH': 'Switzerland',
    'de_DE': 'Germany',
    'el_CY': 'Cyprus',
    'el_GR': 'Greece',
    'en_AU': 'Australia',
    'en_CA': 'Canada',
    'en_GB': 'United Kingdom',
    'en_IE': 'Ireland',
    'en_IN': 'India',
    'en_NZ': 'New Zealand',
    'en_PH': 'Philippines',
    'en_US': 'United States',
    'es_AR': 'Argentina',
    'es_CL': 'Chile',
    'es_CO': 'Colombia',
    'es_ES': 'Spain',
    'es_MX': 'Mexico',
    'et_EE': 'Estonia',
    'fa_IR': 'Iran',
    'fi_FI': 'Finland',
    'fr_BE': 'Belgium',
    'fr_CA': 'Canada',
    'fr_CH': 'Switzerland',
    'fr_FR': 'France',
    'ga_IE': 'Ireland',
    'he_IL': 'Israel',
    'hi_IN': 'India',
    'hr_HR': 'Croatia',
    'hu_HU': 'Hungary',
    'hy_AM': 'Armenia',
    'id_ID': 'Indonesia',
    'it_CH': 'Switzerland',
    'it_IT': 'Italy',
    'ja_JP': 'Japan',
    'ka_GE': 'Georgia',
    'ko_KR': 'South Korea',
    'lb_LU': 'Luxembourg',
    'lt_LT': 'Lithuania',
    'lv_LV': 'Latvia',
    'ne_NP': 'Nepal',
    'nl_BE': 'Belgium',
    'nl_NL': 'Netherlands',
    'no_NO': 'Norway',
    'pl_PL': 'Poland',
    'pt_BR': 'Brazil',
    'pt_PT': 'Portugal',
    'ro_RO': 'Romania',
    'ru_RU': 'Russia',
    'sk_SK': 'Slovakia',
    'sl_SI': 'Slovenia',
    'sq_AL': 'Albania',
    'sv_SE': 'Sweden',
    'ta_IN': 'India',
    'th_TH': 'Thailand',
    'tr_TR': 'Turkey',
    'uk_UA': 'Ukraine',
    'vi_VN': 'Vietnam',
    'zh_CN': 'China',
    'zh_TW': 'Taiwan',
    'zu_ZA': 'South Africa'
}

# Number of runners to generate
num_runners = 100  # Example number

# Generate Runners Table
runners = []
for i in range(1, num_runners + 1):
    # Randomly select a category for each runner
    category = random.choice(['Male', 'Female'])
    
    # Randomly select a locale corresponding to a country
    locale, country = random.choice(list(locale_country_map.items()))
    
    # Create a Faker instance with the selected locale
    localized_fake = Faker(locale)
    
    # Generate the runner's name and transliterate it to standard ASCII characters
    first_name = localized_fake.first_name_male() if category == 'Male' else localized_fake.first_name_female()
    last_name = localized_fake.last_name()

    # Transliterate names to ASCII
    first_name_ascii = unidecode(first_name)
    last_name_ascii = unidecode(last_name)
    
    # Generate the runner data
    runners.append((
        i,
        first_name_ascii,
        last_name_ascii,
        localized_fake.date_of_birth(minimum_age=18, maximum_age=90),
        category,  # Assign the randomly chosen category
        country
    ))

# Example output
for runner in runners[:50]:  # Print first 50 runners
    print(runner)



(1, 'Filip', 'Schepens', datetime.date(1950, 7, 20), 'Male', 'Belgium')
(2, 'Aurelie', 'Deleze', datetime.date(1985, 7, 18), 'Female', 'Switzerland')
(3, 'Lisa', 'Mckee', datetime.date(1980, 3, 23), 'Female', 'Canada')
(4, 'Matija', 'Lucic', datetime.date(1963, 4, 25), 'Male', 'Croatia')
(5, 'Saksham', 'Chada', datetime.date(1943, 7, 13), 'Female', 'India')
(6, 'Zahra', 'Pangestu', datetime.date(1939, 2, 17), 'Female', 'Indonesia')
(7, 'Elena', 'Rhowbinyan', datetime.date(1935, 8, 17), 'Female', 'Armenia')
(8, 'Pascal', 'Roht', datetime.date(1957, 4, 4), 'Male', 'Germany')
(9, 'Vusumuzi', 'Mhlophe', datetime.date(1963, 6, 14), 'Male', 'South Africa')
(10, 'Bogdanna', 'Shablii', datetime.date(1940, 9, 12), 'Female', 'Ukraine')
(11, 'lbyA', 'SHqd', datetime.date(1970, 12, 27), 'Male', 'Israel')
(12, 'Danielle', 'Foster', datetime.date(1996, 10, 6), 'Female', 'Switzerland')
(13, 'Franjo', 'Turk', datetime.date(1979, 1, 29), 'Male', 'Slovenia')
(14, 'Eleonore', 'Racine', datetime.date(1999

In [16]:
unique_countries = set(runner[5] for runner in runners)
print("Unique countries:", unique_countries)

Unique countries: {'United States', 'Turkey', 'Taiwan', 'New Zealand', 'India', 'Croatia', 'Czech Republic', 'Hungary', 'Greece', 'Bosnia and Herzegovina', 'Lithuania', 'Luxembourg', 'Palestine', 'China', 'Armenia', 'Cyprus', 'Israel', 'Ukraine', 'Switzerland', 'Nepal', 'Estonia', 'South Korea', 'United Arab Emirates', 'Brazil', 'South Africa', 'Australia', 'Norway', 'Slovenia', 'Egypt', 'Canada', 'Germany', 'Arabic', 'Ireland', 'Saudi Arabia', 'Italy', 'Russia', 'Bulgaria', 'Belgium', 'Iran', 'Jordan', 'Latvia', 'Finland', 'Philippines', 'Thailand', 'Japan', 'Indonesia'}


In [17]:
print(runners)

[(1, 'Filip', 'Schepens', datetime.date(1950, 7, 20), 'Male', 'Belgium'), (2, 'Aurelie', 'Deleze', datetime.date(1985, 7, 18), 'Female', 'Switzerland'), (3, 'Lisa', 'Mckee', datetime.date(1980, 3, 23), 'Female', 'Canada'), (4, 'Matija', 'Lucic', datetime.date(1963, 4, 25), 'Male', 'Croatia'), (5, 'Saksham', 'Chada', datetime.date(1943, 7, 13), 'Female', 'India'), (6, 'Zahra', 'Pangestu', datetime.date(1939, 2, 17), 'Female', 'Indonesia'), (7, 'Elena', 'Rhowbinyan', datetime.date(1935, 8, 17), 'Female', 'Armenia'), (8, 'Pascal', 'Roht', datetime.date(1957, 4, 4), 'Male', 'Germany'), (9, 'Vusumuzi', 'Mhlophe', datetime.date(1963, 6, 14), 'Male', 'South Africa'), (10, 'Bogdanna', 'Shablii', datetime.date(1940, 9, 12), 'Female', 'Ukraine'), (11, 'lbyA', 'SHqd', datetime.date(1970, 12, 27), 'Male', 'Israel'), (12, 'Danielle', 'Foster', datetime.date(1996, 10, 6), 'Female', 'Switzerland'), (13, 'Franjo', 'Turk', datetime.date(1979, 1, 29), 'Male', 'Slovenia'), (14, 'Eleonore', 'Racine', date

In [18]:
runners_countries = runners[5]
print(runners_countries)

(6, 'Zahra', 'Pangestu', datetime.date(1939, 2, 17), 'Female', 'Indonesia')


In [19]:
# Generate Events Table
events = []
event_id = 1
for marathon_name, dates in marathons:
    for event_date in dates:
        events.append((
            event_id,
            marathon_name,
            datetime.datetime.strptime(event_date, "%Y-%m-%d").date()  # Ensure event_date is a datetime.date object 
        ))
        event_id += 1

In [20]:
print(events)

[(1, 'Boston Marathon', datetime.date(2012, 4, 16)), (2, 'Boston Marathon', datetime.date(2013, 4, 15)), (3, 'Boston Marathon', datetime.date(2014, 4, 21)), (4, 'Boston Marathon', datetime.date(2015, 4, 20)), (5, 'Boston Marathon', datetime.date(2016, 4, 18)), (6, 'Boston Marathon', datetime.date(2017, 4, 17)), (7, 'Boston Marathon', datetime.date(2018, 4, 16)), (8, 'Boston Marathon', datetime.date(2019, 4, 15)), (9, 'Boston Marathon', datetime.date(2020, 10, 11)), (10, 'Boston Marathon', datetime.date(2021, 10, 11)), (11, 'Boston Marathon', datetime.date(2022, 4, 18)), (12, 'Boston Marathon', datetime.date(2023, 4, 17)), (13, 'London Marathon', datetime.date(2012, 4, 22)), (14, 'London Marathon', datetime.date(2013, 4, 21)), (15, 'London Marathon', datetime.date(2014, 4, 21)), (16, 'London Marathon', datetime.date(2015, 4, 26)), (17, 'London Marathon', datetime.date(2016, 4, 24)), (18, 'London Marathon', datetime.date(2017, 4, 23)), (19, 'London Marathon', datetime.date(2018, 4, 22)),

In [21]:
# Generate Results Table
results = []
assigned_pairs = set()  # Set to keep track of assigned (runner_id, event_id) pairs

In [37]:
assigned_pairs = set()

for i in range(1, num_results + 1):
    # Limit the number of attempts to find a unique (runner_id, event_id) pair
    attempts = 0
    max_attempts = 100  # Adjust this number as needed
    unique_pair_found = False
    
    while attempts < max_attempts:
        runner_id = random.randint(1, num_runners)
        event_id = ((i - 1) % num_events) + 1
        
        if (runner_id, event_id) not in assigned_pairs:
            assigned_pairs.add((runner_id, event_id))  # Record the (runner_id, event_id) pair
            unique_pair_found = True
            break
        
        attempts += 1
    
    if not unique_pair_found:
        print(f"Could not find a unique (runner_id, event_id) pair after {max_attempts} attempts. Exiting loop.")
        break
    
    # Get the runner details
    runner = next(r for r in runners if r[0] == runner_id)
    finish_time = random_time()
    position = random.randint(1, 5000)
    category = runner[4]  # Use the runner's category from the runners table
    is_elite = False
    
    # Determine if the runner's finish time is elite
    hours = float(finish_time[:2])  # Assumes 'HH:MM:SS' format
    
    if category == 'Male' and hours < 2.15:
        is_elite = True
    elif category == 'Female' and hours < 2.30:
        is_elite = True
    elif 'Wheelchair' in category and hours < 1.20:
        is_elite = True
    
    results.append((
        i,
        event_id,
        runner_id,
        finish_time,
        position,
        category,
        is_elite
    ))

# Check results
for result in results[:100]:
    print(result)


Could not find a unique (runner_id, event_id) pair after 100 attempts. Exiting loop.
(1, 1, 100, '03:10:35', 4383, 'Female', False)
(2, 2, 61, '03:39:41', 3244, 'Male', False)
(3, 3, 6, '03:56:15', 2969, 'Female', False)
(4, 4, 84, '05:30:37', 3517, 'Male', False)
(5, 5, 56, '05:59:22', 4155, 'Male', False)
(6, 6, 52, '03:02:37', 4709, 'Female', False)
(7, 7, 30, '04:02:38', 2025, 'Female', False)
(8, 8, 37, '02:30:55', 4543, 'Female', True)
(9, 9, 30, '04:09:12', 384, 'Female', False)
(10, 10, 13, '02:22:01', 4908, 'Male', True)


In [39]:
# Verify that no runner has participated in the same event more than once
duplicates = [(runner_id, event_id) for (runner_id, event_id) in assigned_pairs]
if len(duplicates) != len(assigned_pairs):
    print("Error: A runner has participated in the same event more than once.")
else:
    print("Verification passed: No runner has participated in the same event more than once.")


Verification passed: No runner has participated in the same event more than once.


In [41]:
print(results)

[(1, 1, 100, '03:10:35', 4383, 'Female', False), (2, 2, 61, '03:39:41', 3244, 'Male', False), (3, 3, 6, '03:56:15', 2969, 'Female', False), (4, 4, 84, '05:30:37', 3517, 'Male', False), (5, 5, 56, '05:59:22', 4155, 'Male', False), (6, 6, 52, '03:02:37', 4709, 'Female', False), (7, 7, 30, '04:02:38', 2025, 'Female', False), (8, 8, 37, '02:30:55', 4543, 'Female', True), (9, 9, 30, '04:09:12', 384, 'Female', False), (10, 10, 13, '02:22:01', 4908, 'Male', True), (11, 11, 64, '04:12:48', 3698, 'Male', False), (12, 12, 91, '02:21:21', 4566, 'Female', True), (13, 1, 20, '05:04:59', 648, 'Male', False), (14, 2, 97, '04:20:03', 3311, 'Female', False), (15, 3, 58, '02:39:37', 214, 'Male', True), (16, 4, 60, '04:35:58', 4431, 'Male', False), (17, 5, 8, '05:55:52', 4531, 'Male', False), (18, 6, 30, '05:55:57', 1104, 'Female', False), (19, 7, 51, '02:42:30', 4166, 'Female', True), (20, 8, 32, '03:19:37', 647, 'Male', False), (21, 9, 36, '05:57:07', 1966, 'Female', False), (22, 10, 86, '05:25:36', 19

In [43]:
# Identify elite runners based on the 'results' table
elite_runners = [runner_id for runner_id, _, _, _, _, _, is_elite in results if is_elite]

In [45]:
print(elite_runners)

[8, 10, 12, 15, 19, 26, 30, 33, 34, 38, 43, 47, 49, 53, 56, 64, 65, 68, 76, 77, 78, 79, 82, 84, 90, 94, 95, 109, 113, 124, 128, 134, 135, 137, 141, 143, 144, 146, 150, 154, 163, 166, 168, 174, 181, 184, 188, 190, 205, 207, 216, 217, 219, 222, 229, 232, 233, 238, 251, 254, 258, 268, 270, 271, 272, 278, 285, 286, 288, 291, 294, 295, 296, 299, 304, 305, 306, 307, 311, 323, 324, 326, 327, 330, 331, 345, 347, 352, 361, 365, 366, 369, 370, 374, 381, 387, 388, 390, 394, 398, 400, 401, 403, 410, 412, 417, 421, 423, 441, 444, 445, 449, 450, 454, 457, 461, 467, 478, 480, 484, 488, 497, 503, 508, 518, 520, 525, 526, 527, 530, 532, 535, 539, 545, 547, 548, 554, 556, 560, 561, 566, 572, 575, 577, 579, 587, 589, 594, 596, 599, 603, 607, 608, 617, 623, 627, 628, 631, 636, 638, 647, 648, 653, 655, 660, 666, 671, 672, 674, 678, 684, 687, 697, 701, 708, 712, 713, 715, 718, 721, 727, 728, 729, 732, 737, 740, 746, 756, 761, 762, 766, 772, 774, 780, 785, 786, 789, 792, 794, 801, 804, 805, 815, 819, 821, 82

In [47]:
# Generate Sponsors Table 
sponsors = [
    "Nike", "Adidas", "Asics", "Saucony", "Hoka",
    "Brooks", "New Balance", "Puma", "Under Armour", "Tracksmith"
]

# Select 50% of elite runners to have sponsors
sponsored_runners = random.sample(elite_runners, len(elite_runners) // 2)

# Generate Sponsors Athletes Table
sponsored_athletes = []
used_sponsors = set()  # To keep track of assigned sponsors

for i, runner_id in enumerate(sponsored_runners, start=1):
    sponsor_name = random.choice(sponsors)
    
    # Ensure the runner doesn't already have a sponsor
    while runner_id in used_sponsors:
        runner_id = random.choice(sponsored_runners)
    
    sponsored_athletes.append((
        i,
        sponsor_name,
        runner_id
    ))
    
    # Mark this runner as having a sponsor
    used_sponsors.add(runner_id)

# Verification code to check if any runner appears more than once
runner_ids = [data[2] for data in sponsored_athletes]
duplicate_runners = set([runner_id for runner_id in runner_ids if runner_ids.count(runner_id) > 1])

if len(duplicate_runners) == 0:
    print("Verification Passed: No runner appears more than once in the sponsors table.")
else:
    print(f"Verification Failed: The following runners have multiple sponsors: {duplicate_runners}")


Verification Passed: No runner appears more than once in the sponsors table.


In [49]:
print(sponsored_athletes)

[(1, 'Tracksmith', 47), (2, 'Adidas', 721), (3, 'New Balance', 286), (4, 'Nike', 146), (5, 'Brooks', 766), (6, 'Adidas', 1133), (7, 'Brooks', 547), (8, 'Adidas', 1065), (9, 'Adidas', 967), (10, 'Hoka', 311), (11, 'Under Armour', 631), (12, 'New Balance', 457), (13, 'Puma', 1095), (14, 'Puma', 623), (15, 'Nike', 836), (16, 'Hoka', 628), (17, 'Adidas', 805), (18, 'Asics', 135), (19, 'Adidas', 38), (20, 'New Balance', 444), (21, 'Asics', 740), (22, 'Saucony', 1057), (23, 'Adidas', 94), (24, 'Saucony', 831), (25, 'Under Armour', 370), (26, 'Hoka', 756), (27, 'Adidas', 251), (28, 'Tracksmith', 981), (29, 'Puma', 554), (30, 'Puma', 878), (31, 'Asics', 713), (32, 'Tracksmith', 113), (33, 'Tracksmith', 855), (34, 'Saucony', 1007), (35, 'Under Armour', 525), (36, 'Puma', 1029), (37, 'Nike', 712), (38, 'Hoka', 30), (39, 'Under Armour', 82), (40, 'Adidas', 660), (41, 'Under Armour', 137), (42, 'Under Armour', 478), (43, 'Puma', 687), (44, 'Brooks', 548), (45, 'Saucony', 1085), (46, 'Saucony', 100

In [51]:
# Generate Sponsors Table
sponsors = [
    (1, "Nike"),
    (2, "Adidas"),
    (3, "Asics"),
    (4, "Saucony"),
    (5, "Hoka"),
    (6, "Brooks"),
    (7, "New Balance"),
    (8, "Puma"),
    (9, "Under Armour"),
    (10, "Tracksmith")
]
print(sponsors)

[(1, 'Nike'), (2, 'Adidas'), (3, 'Asics'), (4, 'Saucony'), (5, 'Hoka'), (6, 'Brooks'), (7, 'New Balance'), (8, 'Puma'), (9, 'Under Armour'), (10, 'Tracksmith')]


In [53]:
training_plan_durations = {
    "Beginner Marathon Plan: 16-week build-up": 16,
    "Intermediate Marathon Plan: 12-week build-up": 12,
    "Advanced Marathon Plan: 18-week build-up": 18,
    "Elite Marathon Plan: 24-week build-up": 24,
    "5K to Marathon Transition Plan: 20-week build-up": 20,
    "Half-Marathon to Marathon Plan: 14-week build-up": 14,
    "Strength and Conditioning Focus Plan: 10-week build-up": 10,
    "Speed Work Focus Plan: 12-week build-up": 12,
    "Long Distance Endurance Focus Plan: 22-week build-up": 22,
    "Customized Marathon Plan: 16-week build-up": 16
}

In [55]:
# Generate Training Plan Descriptions Table
training_plan_descriptions_table = []
plan_id = 1

for plan_description, plan_length_weeks in training_plan_durations.items():
    training_plan_descriptions_table.append((
        plan_id,  # Primary Key
        plan_description,
        plan_length_weeks
    ))
    plan_id += 1


print(training_plan_descriptions_table)

[(1, 'Beginner Marathon Plan: 16-week build-up', 16), (2, 'Intermediate Marathon Plan: 12-week build-up', 12), (3, 'Advanced Marathon Plan: 18-week build-up', 18), (4, 'Elite Marathon Plan: 24-week build-up', 24), (5, '5K to Marathon Transition Plan: 20-week build-up', 20), (6, 'Half-Marathon to Marathon Plan: 14-week build-up', 14), (7, 'Strength and Conditioning Focus Plan: 10-week build-up', 10), (8, 'Speed Work Focus Plan: 12-week build-up', 12), (9, 'Long Distance Endurance Focus Plan: 22-week build-up', 22), (10, 'Customized Marathon Plan: 16-week build-up', 16)]


In [57]:
# Select 60% of runners to have training plans
all_runner_ids = [runner[0] for runner in runners]
selected_runner_ids = random.sample(all_runner_ids, int(len(all_runner_ids) * 0.60))

In [59]:
# Generate Training Plans Table
training_plans = []
plan_id = 1

# Create a lookup dictionary to get the ID based on the description
description_to_id = {description: plan_id for plan_id, description, _ in training_plan_descriptions_table}

for event in events:
    event_id = event[0]
    event_name = event[1]
    event_date = event[2]  # event_date is already a datetime.date object
    
    for runner_id in selected_runner_ids:
        # Randomly select a training plan description
        plan_description = random.choice(list(training_plan_durations.keys()))
        
        # Get the duration of the training plan in weeks
        plan_length_weeks = training_plan_durations[plan_description]
        
        # Calculate the start date
        start_date = event_date - datetime.timedelta(weeks=plan_length_weeks)
        
        # Get the plan_id from the description_to_id dictionary
        plan_id_ref = description_to_id[plan_description]
        
        # Ensure we generate enough training plans
        if plan_id <= num_training_plans:
            training_plans.append((
                plan_id,  # Primary Key
                runner_id,
                start_date,
                event_date,
                plan_id_ref  # Reference the plan by ID
            ))
            plan_id += 1


In [61]:
print(training_plans)

[(1, 81, datetime.date(2011, 11, 28), datetime.date(2012, 4, 16), 5), (2, 72, datetime.date(2012, 2, 6), datetime.date(2012, 4, 16), 7), (3, 10, datetime.date(2012, 2, 6), datetime.date(2012, 4, 16), 7), (4, 88, datetime.date(2011, 11, 14), datetime.date(2012, 4, 16), 9), (5, 96, datetime.date(2012, 1, 9), datetime.date(2012, 4, 16), 6), (6, 20, datetime.date(2011, 12, 26), datetime.date(2012, 4, 16), 1), (7, 55, datetime.date(2011, 11, 28), datetime.date(2012, 4, 16), 5), (8, 29, datetime.date(2011, 12, 26), datetime.date(2012, 4, 16), 10), (9, 76, datetime.date(2011, 10, 31), datetime.date(2012, 4, 16), 4), (10, 50, datetime.date(2011, 11, 28), datetime.date(2012, 4, 16), 5), (11, 35, datetime.date(2012, 1, 9), datetime.date(2012, 4, 16), 6), (12, 68, datetime.date(2012, 1, 9), datetime.date(2012, 4, 16), 6), (13, 28, datetime.date(2012, 1, 9), datetime.date(2012, 4, 16), 6), (14, 14, datetime.date(2011, 12, 12), datetime.date(2012, 4, 16), 3), (15, 18, datetime.date(2011, 12, 12), d

## Making a function to generate SQL insert statements, generating SQL statements, and combining all SQL scripts

In [64]:
# Function to generate SQL insert statements
def generate_sql_inserts(table_name, columns, data):
    sql = f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES\n"
    values = []
    for row in data:
        formatted_values = ', '.join([f"'{str(v)}'" if isinstance(v, str) or isinstance(v, datetime.date) else str(v) for v in row])
        values.append(f"({formatted_values})")
    sql += ',\n'.join(values) + ";\n"
    return sql

In [66]:
# Generate SQL statements
runner_columns = ['runner_id', 'first_name', 'last_name', 'date_of_birth', 'country', 'elite_status']
event_columns = ['event_id', 'event_year', 'event_date', 'location']
result_columns = ['result_id', 'event_id', 'runner_id', 'finish_time', 'position', 'category']
sponsor_columns = ['sponsor_id', 'sponsor_name']
sponsored_athletes_columns = ['sponsor_id', 'sponsor_name', 'runner_id']
training_plan_columns = ['plan_id', 'runner_id', 'start_date', 'end_date', 'description']
training_plan_descriptions_columns = ['plan_id','plan_description','plan_length_weeks']


sql_runners = generate_sql_inserts('Runners', runner_columns, runners)
sql_events = generate_sql_inserts('Events', event_columns, events)
sql_results = generate_sql_inserts('Results', result_columns, results)
sql_sponsors = generate_sql_inserts('Sponsors', sponsor_columns, sponsors)
sql_sponsored_athletes = generate_sql_inserts('Sponsored_Athletes', sponsored_athletes_columns, sponsored_athletes)
sql_training_plans = generate_sql_inserts('Training_Plans', training_plan_columns, training_plans)
sql_training_plan_descriptions_table = generate_sql_inserts('Training_Plans_Descriptions', training_plan_descriptions_columns,training_plan_descriptions_table)


In [68]:
print(sql_runners)

INSERT INTO Runners (runner_id, first_name, last_name, date_of_birth, country, elite_status) VALUES
(1, 'Filip', 'Schepens', '1950-07-20', 'Male', 'Belgium'),
(2, 'Aurelie', 'Deleze', '1985-07-18', 'Female', 'Switzerland'),
(3, 'Lisa', 'Mckee', '1980-03-23', 'Female', 'Canada'),
(4, 'Matija', 'Lucic', '1963-04-25', 'Male', 'Croatia'),
(5, 'Saksham', 'Chada', '1943-07-13', 'Female', 'India'),
(6, 'Zahra', 'Pangestu', '1939-02-17', 'Female', 'Indonesia'),
(7, 'Elena', 'Rhowbinyan', '1935-08-17', 'Female', 'Armenia'),
(8, 'Pascal', 'Roht', '1957-04-04', 'Male', 'Germany'),
(9, 'Vusumuzi', 'Mhlophe', '1963-06-14', 'Male', 'South Africa'),
(10, 'Bogdanna', 'Shablii', '1940-09-12', 'Female', 'Ukraine'),
(11, 'lbyA', 'SHqd', '1970-12-27', 'Male', 'Israel'),
(12, 'Danielle', 'Foster', '1996-10-06', 'Female', 'Switzerland'),
(13, 'Franjo', 'Turk', '1979-01-29', 'Male', 'Slovenia'),
(14, 'Eleonore', 'Racine', '1999-10-22', 'Female', 'Canada'),
(15, 'ltyn', 'bkr bn `bd mn@', '2003-09-30', 'Female

In [70]:
# Combine all SQL scripts
sql_script = sql_runners + sql_events + sql_results + sql_sponsors + sql_training_plans + sql_sponsored_athletes + sql_training_plan_descriptions_table


In [72]:
print(sql_script)

INSERT INTO Runners (runner_id, first_name, last_name, date_of_birth, country, elite_status) VALUES
(1, 'Filip', 'Schepens', '1950-07-20', 'Male', 'Belgium'),
(2, 'Aurelie', 'Deleze', '1985-07-18', 'Female', 'Switzerland'),
(3, 'Lisa', 'Mckee', '1980-03-23', 'Female', 'Canada'),
(4, 'Matija', 'Lucic', '1963-04-25', 'Male', 'Croatia'),
(5, 'Saksham', 'Chada', '1943-07-13', 'Female', 'India'),
(6, 'Zahra', 'Pangestu', '1939-02-17', 'Female', 'Indonesia'),
(7, 'Elena', 'Rhowbinyan', '1935-08-17', 'Female', 'Armenia'),
(8, 'Pascal', 'Roht', '1957-04-04', 'Male', 'Germany'),
(9, 'Vusumuzi', 'Mhlophe', '1963-06-14', 'Male', 'South Africa'),
(10, 'Bogdanna', 'Shablii', '1940-09-12', 'Female', 'Ukraine'),
(11, 'lbyA', 'SHqd', '1970-12-27', 'Male', 'Israel'),
(12, 'Danielle', 'Foster', '1996-10-06', 'Female', 'Switzerland'),
(13, 'Franjo', 'Turk', '1979-01-29', 'Male', 'Slovenia'),
(14, 'Eleonore', 'Racine', '1999-10-22', 'Female', 'Canada'),
(15, 'ltyn', 'bkr bn `bd mn@', '2003-09-30', 'Female

In [74]:
with open("create_Marathon_results_database.sql", "w") as file:
    file.write(sql_script)