# Exploratory Data Analysis of coaster_db.csv

This notebook provides a step-by-step exploratory data analysis (EDA) of the roller coaster dataset.

In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Optional: for geographical visualization
# import folium

In [2]:
# Load the Dataset
coaster_df = pd.read_csv('../001 - Data Analysis/Data/coaster_db.csv')

In [13]:
# Inspect Data Structure
coaster_df.head()
coaster_df.tail()

Unnamed: 0,coaster_name,Length,Speed,Location,Status,Opening date,Type,Manufacturer,Height restriction,Model,...,speed1,speed2,speed1_value,speed1_unit,speed_mph,height_value,height_unit,height_ft,Inversions_clean,Gforce_clean
1082,American Dreier Looping,"3,444 ft (1,050 m)",53 mph (85 km/h),Other,,,Steel,Anton Schwarzkopf,55 in (140 cm),,...,53 mph,85 km/h,53.0,mph,53.0,111.0,ft,,3,4.7
1083,Pantheon (roller coaster),"3,328 ft (1,014 m)",73 mph (117 km/h),Busch Gardens Williamsburg,Under construction,2022,Steel – Launched,Intamin,,Blitz Coaster,...,73 mph,117 km/h,73.0,mph,73.0,178.0,ft,,2,
1084,Tron Lightcycle Power Run,"3,169.3 ft (966.0 m)",59.3[1] mph (95.4 km/h),Other,,"June 16, 2016",Steel – Launched,Vekoma,4[2] ft (122 cm),Motorbike roller coaster,...,59.3 mph,95.4 km/h,59.3,mph,59.3,78.1,ft,,0,4.0
1085,Tumbili,770 ft (230 m),34 mph (55 km/h),Kings Dominion,Under construction,,Steel – 4th Dimension – Wing Coaster,S&S – Sansei Technologies,,4D Free Spin,...,34 mph,55 km/h,34.0,mph,34.0,112.0,ft,,0,
1086,Wonder Woman Flight of Courage,"3,300 ft (1,000 m)",58 mph (93 km/h),Six Flags Magic Mountain,Under construction,2022,Steel – Single-rail,Rocky Mountain Construction,,Raptor – Custom,...,58 mph,93 km/h,58.0,mph,58.0,131.0,ft,,3,


In [5]:
coaster_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 56 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   coaster_name                   1087 non-null   object 
 1   Length                         953 non-null    object 
 2   Speed                          937 non-null    object 
 3   Location                       1087 non-null   object 
 4   Status                         874 non-null    object 
 5   Opening date                   837 non-null    object 
 6   Type                           1087 non-null   object 
 7   Manufacturer                   1028 non-null   object 
 8   Height restriction             831 non-null    object 
 9   Model                          744 non-null    object 
 10  Height                         965 non-null    object 
 11  Inversions                     932 non-null    float64
 12  Lift/launch system             795 non-null    o

In [6]:
coaster_df.describe()

Unnamed: 0,Inversions,year_introduced,latitude,longitude,speed1_value,speed_mph,height_value,height_ft,Inversions_clean,Gforce_clean
count,932.0,1087.0,812.0,812.0,937.0,937.0,965.0,171.0,1087.0,362.0
mean,1.54721,1994.986201,38.373484,-41.595373,53.850374,48.617289,89.575171,101.996491,1.326587,3.824006
std,2.114073,23.475248,15.516596,72.285227,23.385518,16.678031,136.246444,67.329092,2.030854,0.989998
min,0.0,1884.0,-48.2617,-123.0357,5.0,5.0,4.0,13.1,0.0,0.8
25%,0.0,1989.0,35.03105,-84.5522,40.0,37.3,44.0,51.8,0.0,3.4
50%,0.0,2000.0,40.2898,-76.6536,50.0,49.7,79.0,91.2,0.0,4.0
75%,3.0,2010.0,44.7996,2.7781,63.0,58.0,113.0,131.2,2.0,4.5
max,14.0,2022.0,63.2309,153.4265,240.0,149.1,3937.0,377.3,14.0,12.0


In [7]:
# DataFrame shape, columns, and data types
print('Shape:', coaster_df.shape)
print('Columns:', coaster_df.columns.tolist())
print('Data Types:')
print(coaster_df.dtypes)

Shape: (1087, 56)
Columns: ['coaster_name', 'Length', 'Speed', 'Location', 'Status', 'Opening date', 'Type', 'Manufacturer', 'Height restriction', 'Model', 'Height', 'Inversions', 'Lift/launch system', 'Cost', 'Trains', 'Park section', 'Duration', 'Capacity', 'G-force', 'Designer', 'Max vertical angle', 'Drop', 'Soft opening date', 'Fast Lane available', 'Replaced', 'Track layout', 'Fastrack available', 'Soft opening date.1', 'Closing date', 'Opened', 'Replaced by', 'Website', 'Flash Pass Available', 'Must transfer from wheelchair', 'Theme', 'Single rider line available', 'Restraint Style', 'Flash Pass available', 'Acceleration', 'Restraints', 'Name', 'year_introduced', 'latitude', 'longitude', 'Type_Main', 'opening_date_clean', 'speed1', 'speed2', 'speed1_value', 'speed1_unit', 'speed_mph', 'height_value', 'height_unit', 'height_ft', 'Inversions_clean', 'Gforce_clean']
Data Types:
coaster_name                      object
Length                            object
Speed                  

In [8]:
# Handle Missing Values
coaster_df.isnull().sum()

coaster_name                        0
Length                            134
Speed                             150
Location                            0
Status                            213
Opening date                      250
Type                                0
Manufacturer                       59
Height restriction                256
Model                             343
Height                            122
Inversions                        155
Lift/launch system                292
Cost                              705
Trains                            369
Park section                      600
Duration                          322
Capacity                          512
G-force                           725
Designer                          509
Max vertical angle                730
Drop                              593
Soft opening date                 991
Fast Lane available              1018
Replaced                          914
Track layout                      752
Fastrack ava

In [9]:
# Find columns containing 'speed' and show their values
speed_cols = [col for col in coaster_df.columns if 'speed' in col.lower()]
print('Columns containing "speed":', speed_cols)
coaster_df[speed_cols].head()

Columns containing "speed": ['Speed', 'speed1', 'speed2', 'speed1_value', 'speed1_unit', 'speed_mph']


Unnamed: 0,Speed,speed1,speed2,speed1_value,speed1_unit,speed_mph
0,6 mph (9.7 km/h),6 mph,9.7 km/h,6.0,mph,6.0
1,,,,,,
2,,,,,,
3,,,,,,
4,,,,,,


In [10]:
import folium
from IPython.display import display

# Check if latitude and longitude columns exist
if 'latitude' in coaster_df.columns and 'longitude' in coaster_df.columns:
    m = folium.Map(location=[coaster_df['latitude'].mean(), coaster_df['longitude'].mean()], zoom_start=2)
    for idx, row in coaster_df.dropna(subset=['latitude', 'longitude']).iterrows():
        folium.Marker([row['latitude'], row['longitude']], popup=row.get('Name', '')).add_to(m)
    display(m)
else:
    print('Latitude and longitude columns not found in the dataset.')

In [11]:
# Display roller coaster locations (latitude and longitude)
if 'latitude' in coaster_df.columns and 'longitude' in coaster_df.columns:
    locations = coaster_df[['Name', 'latitude', 'longitude']].dropna(subset=['latitude', 'longitude'])
    print(locations)
else:
    print('Latitude and longitude columns not found in the dataset.')

     Name  latitude  longitude
0     NaN   40.5740   -73.9780
1     NaN   40.5780   -73.9790
2     NaN   41.5800   -81.5700
3     NaN   40.5745   -73.9780
4     NaN   39.3538   -74.4342
...   ...       ...        ...
1076  NaN   32.7640  -117.2224
1079  NaN   28.4088   -81.4633
1080  NaN   28.0339   -82.4231
1081  NaN  -27.9574   153.4263
1083  NaN   37.2339   -76.6426

[812 rows x 3 columns]
