# Roller Coaster: Second Dataset

In [1]:
# All the necessary libraries for this project
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import re
import sklearn
import itertools
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.pipeline import Pipeline

In [2]:
# Helper function drawing a nicely formatted heatmap
def heatmap(data, row_labels, col_labels, figsize = (20,12), cmap = "YlGn",
            cbar_kw=None, cbarlabel="", valfmt="{x:.2f}",
            textcolors=("black", "white"), threshold=None):
    """
    Create an annotated heatmap from a numpy array and two lists of labels.

    A new figure is created on every call and the heatmap is drawn on its
    axes.  Nothing is returned.

    Parameters
    ----------
    data
        A 2D numpy array of shape (M, N).
    row_labels
        A list or array of length M with the labels for the rows.
    col_labels
        A list or array of length N with the labels for the columns.
    figsize
        The (width, height) passed to `plt.figure` for the new figure.
        Optional.
    cmap
        A string that specifies the colormap to use. Look at matplotlib docs for information.
        Optional.
    cbar_kw
        A dictionary with arguments to `matplotlib.Figure.colorbar`.  Optional.
    cbarlabel
        The label for the colorbar.  Optional.
    valfmt
        The format of the annotations inside the heatmap.  This should either
        use the string format method, e.g. "$ {x:.2f}", or be a
        `matplotlib.ticker.Formatter`.  Optional.
    textcolors
        A pair of colors.  The first is used for values below a threshold,
        the second for those above.  Optional.
    threshold
        Value in data units according to which the colors from textcolors are
        applied.  If None (the default) uses the middle of the colormap as
        the separation point.
    """
    # Avoid sharing one dict between calls (mutable default argument).
    if cbar_kw is None:
        cbar_kw = {}

    plt.figure(figsize = figsize)
    ax = plt.gca()

    # Plot the heatmap
    im = ax.imshow(data,cmap=cmap)

    # Create colorbar
    cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw)
    cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom")

    # Show all ticks and label them with the respective list entries.
    ax.set_xticks(np.arange(data.shape[1]))
    ax.set_xticklabels(col_labels)

    ax.set_yticks(np.arange(data.shape[0]))
    ax.set_yticklabels(row_labels)

    # Let the horizontal axes labeling appear on top.
    ax.tick_params(top=True, bottom=False,
                   labeltop=True, labelbottom=False)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=-30, ha="right",
             rotation_mode="anchor")

    # Turn spines off and create white grid.
    ax.spines[:].set_visible(False)

    # Minor ticks sit on the cell borders so the white grid separates cells.
    ax.set_xticks(np.arange(data.shape[1]+1)-.5, minor=True)
    ax.set_yticks(np.arange(data.shape[0]+1)-.5, minor=True)
    ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
    ax.tick_params(which="minor", bottom=False, left=False)

    # Normalize the threshold to the images color range.
    if threshold is not None:
        threshold = im.norm(threshold)
    else:
        # nanmax instead of max: a single NaN cell would otherwise turn the
        # threshold into NaN and force every annotation to textcolors[0].
        threshold = im.norm(np.nanmax(data))/2.

    # Annotations are centered inside their cell.
    kw = dict(horizontalalignment="center",
              verticalalignment="center")

    # Get the formatter in case a string is supplied
    if isinstance(valfmt, str):
        valfmt = matplotlib.ticker.StrMethodFormatter(valfmt)

    # Loop over the data and create a `Text` for each "pixel".
    # Change the text's color depending on the data.
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            kw.update(color=textcolors[int(im.norm(data[i, j]) > threshold)])
            ax.text(j, i, valfmt(data[i, j], None), **kw)

In [3]:
# Helper function that allows you to draw nicely formatted confusion matrices
def draw_confusion_matrix(y, yhat, classes):
    """
    Plot the confusion matrix of the targets `y` against the predictions `yhat`.

    `classes` supplies the tick labels for both axes. The current axes and
    figure are cleared first, and the finished plot is shown with `plt.show()`.
    Adapted from scikit-learn and discussion example.
    """
    # Wipe whatever was drawn on the current axes/figure before plotting.
    plt.cla()
    plt.clf()

    counts = confusion_matrix(y, yhat)
    plt.imshow(counts, interpolation='nearest', cmap=plt.cm.YlOrBr)
    plt.title("Confusion Matrix")
    plt.colorbar()

    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=90)
    plt.yticks(tick_positions, classes)

    # Cells above half of the largest count get white text, the rest black.
    half_max = counts.max() / 2.
    n_rows, n_cols = counts.shape
    for row in range(n_rows):
        for col in range(n_cols):
            cell = counts[row, col]
            text_color = "white" if cell > half_max else "black"
            plt.text(col, row, format(cell, 'd'),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

In [4]:
# Load the roller coaster dataset from the CSV file and preview the first rows
df = pd.read_csv('coaster_db.csv')
df.head()

Unnamed: 0,coaster_name,Length,Speed,Location,Status,Opening date,Type,Manufacturer,Height restriction,Model,...,speed1,speed2,speed1_value,speed1_unit,speed_mph,height_value,height_unit,height_ft,Inversions_clean,Gforce_clean
0,Switchback Railway,600 ft (180 m),6 mph (9.7 km/h),Coney Island,Removed,"June 16, 1884",Wood,LaMarcus Adna Thompson,,Lift Packed,...,6 mph,9.7 km/h,6.0,mph,6.0,50.0,ft,,0,2.9
1,Flip Flap Railway,,,Sea Lion Park,Removed,1895,Wood,Lina Beecher,,,...,,,,,,,,,1,12.0
2,Switchback Railway (Euclid Beach Park),,,"Cleveland, Ohio, United States",Closed,,Other,,,,...,,,,,,,,,0,
3,Loop the Loop (Coney Island),,,Other,Removed,1901,Steel,Edwin Prescott,,,...,,,,,,,,,1,
4,Loop the Loop (Young's Pier),,,Other,Removed,1901,Steel,Edwin Prescott,,,...,,,,,,,,,1,


In [5]:
# Summary statistics. By default describe() only reports the numeric columns,
# so the variables listed here are the ones that are already quantitative.
df.describe()

Unnamed: 0,Inversions,year_introduced,latitude,longitude,speed1_value,speed_mph,height_value,height_ft,Inversions_clean,Gforce_clean
count,932.0,1087.0,812.0,812.0,937.0,937.0,965.0,171.0,1087.0,362.0
mean,1.54721,1994.986201,38.373484,-41.595373,53.850374,48.617289,89.575171,101.996491,1.326587,3.824006
std,2.114073,23.475248,15.516596,72.285227,23.385518,16.678031,136.246444,67.329092,2.030854,0.989998
min,0.0,1884.0,-48.2617,-123.0357,5.0,5.0,4.0,13.1,0.0,0.8
25%,0.0,1989.0,35.03105,-84.5522,40.0,37.3,44.0,51.8,0.0,3.4
50%,0.0,2000.0,40.2898,-76.6536,50.0,49.7,79.0,91.2,0.0,4.0
75%,3.0,2010.0,44.7996,2.7781,63.0,58.0,113.0,131.2,2.0,4.5
max,14.0,2022.0,63.2309,153.4265,240.0,149.1,3937.0,377.3,14.0,12.0


In [6]:
# Column dtypes and non-null counts. Take note of the 'object' dtypes:
# these will have to be dropped, converted to numeric, or encoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 56 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   coaster_name                   1087 non-null   object 
 1   Length                         953 non-null    object 
 2   Speed                          937 non-null    object 
 3   Location                       1087 non-null   object 
 4   Status                         874 non-null    object 
 5   Opening date                   837 non-null    object 
 6   Type                           1087 non-null   object 
 7   Manufacturer                   1028 non-null   object 
 8   Height restriction             831 non-null    object 
 9   Model                          744 non-null    object 
 10  Height                         965 non-null    object 
 11  Inversions                     932 non-null    float64
 12  Lift/launch system             795 non-null    o

**Some of these columns should be dropped, and others need to be cleaned up before they can be used properly.**

- Speed is stored several times in different units and formats ('Speed', 'speed1', 'speed2', ...). These columns do not carry different information, just the same speed in different units, so we keep only 'speed_mph', which is already in the cleanest format, and drop the rest. Other columns with the same kind of redundancy are treated the same way.

- Some columns, like 'Flash Pass available', simply don't seem useful for our purposes, so we can just get rid of them.

- Columns like 'Length' store the number and its unit together in a single string. We only want the number, not the unit, so we clean up the formatting to extract the numeric value, and do the same for similarly formatted columns.

In [7]:
# Columns that are not carried forward: the same measurement repeated in other
# units/formats (e.g. the speed* and height_* helpers), and park metadata that
# is not used in this notebook.
columns_to_drop = [
    'Speed', 'Type', 'Height', 'Opening date', 'Park section', 'Soft opening date',
    'Fast Lane available', 'Replaced', 'Fastrack available', 'Soft opening date.1',
    'Closing date', 'Opened', 'Replaced by', 'Website',
    'Flash Pass available', 'Flash Pass Available', 'Must transfer from wheelchair',
    'Single rider line available',
    'speed1', 'speed2', 'speed1_value', 'speed1_unit', 'height_value',
    'height_unit', 'Name', 'G-force',
]
# Reassign rather than drop in place, and ignore labels that are already gone,
# so re-running this cell does not raise a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')
df.head()

Unnamed: 0,coaster_name,Length,Location,Status,Manufacturer,Height restriction,Model,Inversions,Lift/launch system,Cost,...,Restraints,year_introduced,latitude,longitude,Type_Main,opening_date_clean,speed_mph,height_ft,Inversions_clean,Gforce_clean
0,Switchback Railway,600 ft (180 m),Coney Island,Removed,LaMarcus Adna Thompson,,Lift Packed,,gravity,,...,,1884,40.574,-73.978,Wood,1884-06-16,6.0,,0,2.9
1,Flip Flap Railway,,Sea Lion Park,Removed,Lina Beecher,,,1.0,,,...,,1895,40.578,-73.979,Wood,1895-01-01,,,1,12.0
2,Switchback Railway (Euclid Beach Park),,"Cleveland, Ohio, United States",Closed,,,,,,,...,,1896,41.58,-81.57,Other,,,,0,
3,Loop the Loop (Coney Island),,Other,Removed,Edwin Prescott,,,1.0,,,...,,1901,40.5745,-73.978,Steel,1901-01-01,,,1,
4,Loop the Loop (Young's Pier),,Other,Removed,Edwin Prescott,,,1.0,,,...,,1901,39.3538,-74.4342,Steel,1901-01-01,,,1,


In [8]:
# Re-check the remaining columns, their dtypes and non-null counts after the drop
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 30 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   coaster_name        1087 non-null   object 
 1   Length              953 non-null    object 
 2   Location            1087 non-null   object 
 3   Status              874 non-null    object 
 4   Manufacturer        1028 non-null   object 
 5   Height restriction  831 non-null    object 
 6   Model               744 non-null    object 
 7   Inversions          932 non-null    float64
 8   Lift/launch system  795 non-null    object 
 9   Cost                382 non-null    object 
 10  Trains              718 non-null    object 
 11  Duration            765 non-null    object 
 12  Capacity            575 non-null    object 
 13  Designer            578 non-null    object 
 14  Max vertical angle  357 non-null    object 
 15  Drop                494 non-null    object 
 16  Track 

In [9]:
# Look at the first 50 distinct raw 'Height restriction' strings to see which formats must be parsed
df['Height restriction'].unique()[:50]

array([nan, '46\xa0in (117\xa0cm)', '100\xa0cm (3\xa0ft 3\xa0in)',
       '48\xa0in (122\xa0cm)', '42\xa0in (107\xa0cm)',
       '50\xa0in (127\xa0cm)', '52\xa0in (132\xa0cm)',
       '54\xa0in (137\xa0cm)', '140\xa0cm (4\xa0ft 7\xa0in)',
       '36\xa0in (91\xa0cm)', '40\xa0in (102\xa0cm)',
       '44\xa0in (112\xa0cm)', '42 or 48\xa0in (107 or 122\xa0cm)',
       '20\xa0in (51\xa0cm)', '47.3\xa0in (120\xa0cm)',
       '122\xa0cm (4\xa0ft 0\xa0in)', '102\xa0cm (3\xa0ft 4\xa0in)',
       '120\xa0cm (3\xa0ft 11\xa0in)', '130\xa0cm (4\xa0ft 3\xa0in)',
       '90\xa0cm (2\xa0ft 11\xa0in)',
       '120–205\xa0cm (3\xa0ft 11\xa0in–6\xa0ft 9\xa0in)',
       '59–77\xa0in (150–196\xa0cm)', '34\xa0in (86\xa0cm)',
       'Must be able to straddle seat with feet on floor.',
       '47.24\xa0in (120\xa0cm)', '54–76\xa0in (137–193\xa0cm)',
       '43.3\xa0in (110\xa0cm)', '51\xa0in (130\xa0cm)',
       '54–78[1]\xa0in (137–198\xa0cm)', '35\xa0in (89\xa0cm)',
       '110\xa0cm (3\xa0ft 7\xa0in)',
  

In [10]:
# Clean length: pull out the figure written directly before "ft",
# strip its thousands separators and store it as a float
length_feet_text = df['Length'].str.extract(r'([\d,\.]+)\s*ft', expand=False)
df['Length_ft'] = length_feet_text.str.replace(',', '').astype(float)

# Clean height restriction, need function
def extract_inches(value):
    """
    Convert a raw 'Height restriction' string to a height in inches.

    The unit that appears first in the text is treated as the primary
    measurement, so "46 in (117 cm)" gives 46.0 while "100 cm (3 ft 3 in)"
    gives 100 / 2.54 (the parenthesised "3 in" is only the remainder of the
    feet-and-inches conversion and must not be used on its own).

    A measurement written as "N ft M in" is returned as 12 * N + M.
    For ranges or alternatives ("59–77 in", "42 or 48 in") the number written
    directly before the unit is used, i.e. the last value of the range.

    Parameters
    ----------
    value
        The raw cell value; anything that is not a string (e.g. NaN) is
        treated as missing.

    Returns
    -------
    float
        The height in inches, or NaN when no inch or cm figure is found.
    """
    if not isinstance(value, str):
        return np.nan

    # A well-formed decimal number; avoids float() failing on tokens like "1.2.3".
    number = r"(\d+(?:\.\d*)?|\.\d+)"

    # Drop citation markers such as "[1]" so that "78[1] in" is still recognised.
    text = re.sub(r"\[[^\]]*\]", "", value)

    # \s also matches the non-breaking spaces (\xa0) used in the raw data.
    inches_match = re.search(rf"(?:{number}\s*ft\s*)?{number}\s*in", text, flags=re.IGNORECASE)
    cm_match = re.search(rf"{number}\s*cm", text, flags=re.IGNORECASE)

    # Whichever unit is written first is the primary measurement.
    inches_first = inches_match is not None and (
        cm_match is None or inches_match.start() < cm_match.start()
    )

    if inches_first:
        feet, inches = inches_match.groups()
        total = float(inches)
        if feet is not None:
            total += 12 * float(feet)
        return total

    if cm_match is not None:
        return float(cm_match.group(1)) / 2.54  # convert cm → inches

    return np.nan

# Parse every raw height restriction into inches
height_restriction_raw = df['Height restriction']
df['Height_restriction_in'] = height_restriction_raw.apply(extract_inches)

# The raw text columns are no longer needed once their numeric versions exist
df = df.drop(columns=['Length', 'Height restriction'])
df.head()

Unnamed: 0,coaster_name,Location,Status,Manufacturer,Model,Inversions,Lift/launch system,Cost,Trains,Duration,...,latitude,longitude,Type_Main,opening_date_clean,speed_mph,height_ft,Inversions_clean,Gforce_clean,Length_ft,Height_restriction_in
0,Switchback Railway,Coney Island,Removed,LaMarcus Adna Thompson,Lift Packed,,gravity,,,1:00,...,40.574,-73.978,Wood,1884-06-16,6.0,,0,2.9,600.0,
1,Flip Flap Railway,Sea Lion Park,Removed,Lina Beecher,,1.0,,,a single car. Riders are arranged 1 across in ...,,...,40.578,-73.979,Wood,1895-01-01,,,1,12.0,,
2,Switchback Railway (Euclid Beach Park),"Cleveland, Ohio, United States",Closed,,,,,,,,...,41.58,-81.57,Other,,,,0,,,
3,Loop the Loop (Coney Island),Other,Removed,Edwin Prescott,,1.0,,,a single car. Riders are arranged 2 across in ...,,...,40.5745,-73.978,Steel,1901-01-01,,,1,,,
4,Loop the Loop (Young's Pier),Other,Removed,Edwin Prescott,,1.0,,,,,...,39.3538,-74.4342,Steel,1901-01-01,,,1,,,


In [11]:
# Check the dtypes and non-null counts again after replacing the raw length / height restriction columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   coaster_name           1087 non-null   object 
 1   Location               1087 non-null   object 
 2   Status                 874 non-null    object 
 3   Manufacturer           1028 non-null   object 
 4   Model                  744 non-null    object 
 5   Inversions             932 non-null    float64
 6   Lift/launch system     795 non-null    object 
 7   Cost                   382 non-null    object 
 8   Trains                 718 non-null    object 
 9   Duration               765 non-null    object 
 10  Capacity               575 non-null    object 
 11  Designer               578 non-null    object 
 12  Max vertical angle     357 non-null    object 
 13  Drop                   494 non-null    object 
 14  Track layout           335 non-null    object 
 15  Them

In [12]:
# Look at the first 50 distinct raw 'Cost' strings to see which currencies and number formats occur
df['Cost'].unique()[:50]

array([nan, '$50,000', '$75,000', '$140,000', '$50,000 USD',
       '£25,000 (1922)', 'USD$50,000', '$125,000', 'less than $20,000',
       '$176,000 CDN', '$250,000', '$175,000', '$200,000',
       '48 million FIM (1951);[1]1.62 million EUR (in 2021 euros, inflation-adjusted)[2]',
       '$1,000,000', '$1,200,000 USD', '$1,200,000', '$1,750,000 USD',
       '$500,000', '$400,000',
       'US$2 million($9.1\xa0million in 2020 dollars[1])', '$2,000,000',
       '$3,000,000 (1976)', '$1,600,000', '$1.35 million', '$3 million',
       '£813,000', '$2.3 million', '$3,400,000', '$5,000,000[1]',
       '$2.8 million', '$3.2 million', '£1,000,000', '$2,100,000',
       'US$2.5 million – US$3 million', '$10 million', '$3,800,000',
       '$1.2M est.', '€ 9,4 million', '$2.7M est.', '1.2M',
       'A$3.3 million', '$6.5 million', '2.5 Million', 'US$10 million',
       'CA$9 million[1]', 'USD $150,000', '$6,000,000',
       '£1 million Rebuild', '3,000,000 USD'], dtype=object)

In [13]:
# Clean cost

# Approximate conversion rates to USD, kept exactly as originally chosen.
# NOTE(review): the FIM entry is reproduced unchanged; confirm it is the intended FIM → USD rate.
_COST_USD_RATES = {
    'USD': 1.0,
    'GBP': 1.25,
    'EUR': 1.1,
    'CAD': 0.73,
    'AUD': 0.65,
    'FIM': 0.19 / 1.1,
}

# A number, optionally followed by a magnitude word ("million", "m", "bn", ...).
# The separator lives inside the optional group so that the "-" of a range
# such as "2-3 million" is not swallowed by the first number.
_COST_NUMBER = re.compile(
    r'(\.?\d[\d.,]*)(?:[\s\-]*(millions?|mil|mio|mn|m|billions?|bn)\b)?'
)

# Text between two numbers that marks them as the bounds of a range.
_COST_RANGE_GAP = re.compile(r'[–—−\-]|\bto\b')


def _parse_cost_number(token):
    """Turn a numeric token such as '50,000', '1.000.000' or '9,4' into a float (None if impossible)."""
    token = token.rstrip('.,')
    has_comma = ',' in token
    has_dot = '.' in token

    if has_comma and has_dot:
        # Both separators present: the right-most one is the decimal mark.
        if token.rfind(',') > token.rfind('.'):
            token = token.replace('.', '').replace(',', '.')
        else:
            token = token.replace(',', '')
    elif has_comma:
        tail = token.rpartition(',')[2]
        if token.count(',') == 1 and len(tail) != 3:
            # Decimal comma, e.g. "9,4"
            token = token.replace(',', '.')
        else:
            # Thousands separators, e.g. "50,000" or "1,000,000"
            token = token.replace(',', '')
    elif token.count('.') > 1:
        # Dots used as thousands separators, e.g. "1.000.000"
        token = token.replace('.', '')

    try:
        return float(token)
    except ValueError:
        return None


def _cost_multiplier(suffix):
    """Map a magnitude word captured by _COST_NUMBER to its numeric factor."""
    if not suffix:
        return 1
    return 1_000_000_000 if suffix.startswith('b') else 1_000_000


def _detect_cost_currency(text):
    """Guess the currency code from lower-cased cost text; defaults to USD."""
    if '£' in text or 'gbp' in text:
        return 'GBP'
    if '€' in text or 'eur' in text:
        return 'EUR'
    # 'ca$' has to be tested before 'a$', which is a substring of it.
    if 'ca$' in text or 'cdn' in text or 'cad' in text:
        return 'CAD'
    if 'a$' in text or 'aud' in text:
        return 'AUD'
    if 'fim' in text:
        return 'FIM'
    return 'USD'


def clean_cost(value):
    """
    Convert a raw 'Cost' string into an approximate amount in US dollars.

    Steps:
      * citations ("[1]") and parenthesised notes ("(1922)", inflation-adjusted
        figures, ...) are removed;
      * when several figures are separated by ";" only the first part that
        contains a number is used, so a later converted figure is not mixed
        with the original one;
      * thousands separators, decimal commas and magnitude words
        ("million", "M", "billion", ...) are interpreted;
      * two numbers joined by a dash or "to" are treated as a range and
        averaged;
      * the currency is detected from the text and converted with the fixed
        approximate rates in `_COST_USD_RATES`. Currencies that are not
        recognised are treated as USD.

    Parameters
    ----------
    value
        The raw cell value (string, number or missing).

    Returns
    -------
    float
        The cost in USD, or NaN when the value is missing or holds no number.
    """
    if pd.isna(value):
        return np.nan

    text = str(value).lower()
    # Remove citation markers first so a "]" inside parentheses cannot confuse the next step.
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'\(.*?\)', ' ', text)

    # Use the first ";"-separated part that actually contains a number.
    segment, matches = '', []
    for segment in text.split(';'):
        matches = list(_COST_NUMBER.finditer(segment))
        if matches:
            break
    if not matches:
        return np.nan

    first = matches[0]
    base = _parse_cost_number(first.group(1))
    if base is None:
        return np.nan
    factor = _cost_multiplier(first.group(2))
    lower = base * factor
    amount = lower

    if len(matches) > 1:
        second = matches[1]
        gap = segment[first.end():second.start()]
        upper_base = _parse_cost_number(second.group(1))
        if upper_base is not None and _COST_RANGE_GAP.search(gap):
            upper_factor = _cost_multiplier(second.group(2))
            if factor == 1 and upper_factor > 1 and base < upper_base:
                # "2.5 – 3 million": the magnitude word applies to both bounds.
                lower = base * upper_factor
            upper = upper_base * upper_factor
            # Only average genuine ranges; a smaller trailing number (e.g. a year) is ignored.
            if lower <= upper:
                amount = (lower + upper) / 2

    rate = _COST_USD_RATES.get(_detect_cost_currency(segment), 1.0)
    return amount * rate

# Convert every raw cost string to USD, then retire the raw text column
cost_raw = df['Cost']
df['Cost_USD'] = cost_raw.apply(clean_cost)
df = df.drop(columns=['Cost'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   coaster_name           1087 non-null   object 
 1   Location               1087 non-null   object 
 2   Status                 874 non-null    object 
 3   Manufacturer           1028 non-null   object 
 4   Model                  744 non-null    object 
 5   Inversions             932 non-null    float64
 6   Lift/launch system     795 non-null    object 
 7   Trains                 718 non-null    object 
 8   Duration               765 non-null    object 
 9   Capacity               575 non-null    object 
 10  Designer               578 non-null    object 
 11  Max vertical angle     357 non-null    object 
 12  Drop                   494 non-null    object 
 13  Track layout           335 non-null    object 
 14  Theme                  44 non-null     object 
 15  Rest

In [14]:
# Look at the first 50 distinct raw 'Acceleration' strings to see how they are formatted
df['Acceleration'].unique()[:50]

array([nan, '0 to 55 mph (0 to 89 km/h) in 3 seconds',
       '0 to 87 km/h (0 to 54 mph) in 4 seconds', '4',
       '0 to 54 mph (0 to 87 km/h) in 4 seconds', '5 mrsn',
       '+2.75g (25 m/s)', '0 to 104 mph in 7 seconds',
       '0 to 160.9 km/h (0 to 100 mph) in 7 seconds',
       '0 to 65\xa0mph (105\xa0km/h) in 4 seconds',
       '0 to 70 mph (0 to 113 km/h) in 3.8 seconds',
       '0 to 40 mph (0 to 64 km/h) in 2 seconds', '0 - 60 in 3 seconds',
       '0 to 60 mph (0 to 97 km/h) in 3.5 seconds',
       '0–53 mph in 2.8 seconds',
       '0 to 57 mph (0 to 92 km/h) in 2.8 seconds', '0 to 45 to 70mph',
       '0 − 55.9 mph in 2.8 seconds',
       '0 to 55 mph (0 to 89 km/h) in 4.5 seconds',
       '0 to 180 km/h (0 to 112 mph) in 1.6 seconds',
       '0-80 in 1.8 seconds', '0-60 mph in 4.0 seconds',
       '0 − 82\xa0mph (132\xa0km/h) in 2.3 seconds',
       '0 to 120 mph (0 to 193 km/h) in 4 seconds',
       '"0 to 46.6 mph (0 to 75 km/h) in 3 seconds".',
       '0 to 72 mph (0 t