In [2]:
import pandas as pd
import plotly.express as px

# Read the coaster dataset into a DataFrame.
# The path is relative; point csv_path elsewhere if the CSV lives in another location.
csv_path = '../coaster_db.csv'
df = pd.read_csv(csv_path)

In [3]:
# Quick structural overview of the raw dataset.
# df.info() prints its report itself and returns None, so it is called bare;
# wrapping it in display() would only render an extra "None" below the report.
df.info()

# First rows and summary statistics of the numeric columns, rendered richly.
display(df.head())
display(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 56 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   coaster_name                   1087 non-null   object 
 1   Length                         953 non-null    object 
 2   Speed                          937 non-null    object 
 3   Location                       1087 non-null   object 
 4   Status                         874 non-null    object 
 5   Opening date                   837 non-null    object 
 6   Type                           1087 non-null   object 
 7   Manufacturer                   1028 non-null   object 
 8   Height restriction             831 non-null    object 
 9   Model                          744 non-null    object 
 10  Height                         965 non-null    object 
 11  Inversions                     932 non-null    float64
 12  Lift/launch system             795 non-null    o

None

Unnamed: 0,coaster_name,Length,Speed,Location,Status,Opening date,Type,Manufacturer,Height restriction,Model,...,speed1,speed2,speed1_value,speed1_unit,speed_mph,height_value,height_unit,height_ft,Inversions_clean,Gforce_clean
0,Switchback Railway,600 ft (180 m),6 mph (9.7 km/h),Coney Island,Removed,"June 16, 1884",Wood,LaMarcus Adna Thompson,,Lift Packed,...,6 mph,9.7 km/h,6.0,mph,6.0,50.0,ft,,0,2.9
1,Flip Flap Railway,,,Sea Lion Park,Removed,1895,Wood,Lina Beecher,,,...,,,,,,,,,1,12.0
2,Switchback Railway (Euclid Beach Park),,,"Cleveland, Ohio, United States",Closed,,Other,,,,...,,,,,,,,,0,
3,Loop the Loop (Coney Island),,,Other,Removed,1901,Steel,Edwin Prescott,,,...,,,,,,,,,1,
4,Loop the Loop (Young's Pier),,,Other,Removed,1901,Steel,Edwin Prescott,,,...,,,,,,,,,1,


Unnamed: 0,Inversions,year_introduced,latitude,longitude,speed1_value,speed_mph,height_value,height_ft,Inversions_clean,Gforce_clean
count,932.0,1087.0,812.0,812.0,937.0,937.0,965.0,171.0,1087.0,362.0
mean,1.54721,1994.986201,38.373484,-41.595373,53.850374,48.617289,89.575171,101.996491,1.326587,3.824006
std,2.114073,23.475248,15.516596,72.285227,23.385518,16.678031,136.246444,67.329092,2.030854,0.989998
min,0.0,1884.0,-48.2617,-123.0357,5.0,5.0,4.0,13.1,0.0,0.8
25%,0.0,1989.0,35.03105,-84.5522,40.0,37.3,44.0,51.8,0.0,3.4
50%,0.0,2000.0,40.2898,-76.6536,50.0,49.7,79.0,91.2,0.0,4.0
75%,3.0,2010.0,44.7996,2.7781,63.0,58.0,113.0,131.2,2.0,4.5
max,14.0,2022.0,63.2309,153.4265,240.0,149.1,3937.0,377.3,14.0,12.0


In [4]:
# Data subsetting: keep only the columns listed below, in this order, and take
# an explicit copy so later column assignments operate on an independent frame.
#
# Columns deliberately excluded (kept here for reference):
#   'Length', 'Speed', 'Opening date', 'Type', 'Height restriction', 'Model',
#   'Height', 'Inversions', 'Lift/launch system', 'Cost', 'Trains',
#   'Park section', 'Duration', 'Capacity', 'G-force', 'Designer',
#   'Max vertical angle', 'Drop', 'Soft opening date', 'Fast Lane available',
#   'Replaced', 'Track layout', 'Fastrack available', 'Soft opening date.1',
#   'Closing date', 'Opened', 'Replaced by', 'Website',
#   'Flash Pass Available', 'Must transfer from wheelchair', 'Theme',
#   'Single rider line available', 'Restraint Style', 'Flash Pass available',
#   'Acceleration', 'Restraints', 'Name',
#   'speed1', 'speed2', 'speed1_value', 'speed1_unit',
#   'height_value', 'height_unit'
columns_to_keep = [
    'coaster_name',
    'Location',
    'Status',
    'Manufacturer',
    'year_introduced',
    'latitude',
    'longitude',
    'Type_Main',
    'opening_date_clean',
    'speed_mph',
    'height_ft',
    'Inversions_clean',
    'Gforce_clean',
]
df = df[columns_to_keep].copy()

In [5]:
# Parse the cleaned opening-date column into pandas datetime values and store
# the result back under the same column name.
opening_dates = pd.to_datetime(df['opening_date_clean'])
df['opening_date_clean'] = opening_dates

In [6]:
# Give the retained columns consistent, presentation-friendly names.
# Columns not listed in the mapping keep their current names.
column_renames = dict(
    coaster_name='Coaster_Name',
    year_introduced='Year_Introduced',
    opening_date_clean='Opening_Date',
    speed_mph='Speed_mph',
    height_ft='Height_ft',
    Inversions_clean='Inversions',
    Gforce_clean='Gforce',
)
df = df.rename(columns=column_renames)

In [7]:
# Missing values: count the null entries in each column.
missing_per_column = df.isna().sum()
missing_per_column

Coaster_Name         0
Location             0
Status             213
Manufacturer        59
Year_Introduced      0
latitude           275
longitude          275
Type_Main            0
Opening_Date       250
Speed_mph          150
Height_ft          916
Inversions           0
Gforce             725
dtype: int64

In [8]:

# Tally coasters per introduction year and keep the ten most frequent years.
# value_counts() orders by count (descending), so head(10) is the top ten.
year_counts = (
    df['Year_Introduced']
    .value_counts()
    .head(10)
    .rename_axis('Year Introduced')   # label for the year column
    .reset_index(name='Count')        # label for the tally column
)

# Bar chart (one bar per year) of those counts using Plotly Express.
fig = px.bar(
    year_counts,
    x='Year Introduced',
    y='Count',
    title='Top Years Coasters Introduced',
)
fig.update_layout(xaxis_title='Year Introduced', yaxis_title='Count')
fig.show()

In [9]:
# Histogram of coaster speeds to get a clear view of their distribution.
# update_layout() returns the figure, so the axis titles are set in the same
# expression that builds it.
fig = px.histogram(
    df,
    x='Speed_mph',
    nbins=20,
    title='Coasters Speed (mph)',
).update_layout(
    xaxis_title='Speed (mph)',
    yaxis_title='Count',
)
fig.show()

In [10]:
# Scatter plot relating coaster speed to coaster height.
fig = px.scatter(df, x='Speed_mph', y='Height_ft', title='Coaster Speed vs. Height')
# Use readable axis titles (instead of the raw column names) so this figure
# matches the labelling of the other charts in the notebook.
fig.update_layout(xaxis_title='Speed (mph)', yaxis_title='Height (ft)')
fig.show()