In [None]:
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Apply matplotlib's 'ggplot' style sheet to the figures drawn below.
plt.style.use('ggplot')
# Show up to 200 columns when displaying wide frames. The fully qualified key
# is required: the bare 'max_columns' pattern also matches
# 'styler.render.max_columns' on newer pandas and raises OptionError.
pd.set_option('display.max_columns', 200)

In [ ]:
# Location of the coaster dataset. Set the COASTER_DB_CSV environment variable
# to point at your own copy of the file; when it is unset the notebook falls
# back to the path it was originally written against.
DATA_PATH = os.environ.get(
    'COASTER_DB_CSV',
    '/Users/boczagaba/PycharmProjects/data_analysis/coaster_db.csv',
)
df = pd.read_csv(DATA_PATH)

Looking at the Data

In [ ]:
# Dimensions of the freshly loaded frame as a (rows, columns) tuple.
df.shape

In [ ]:
# Preview the first five rows (five is head()'s default).
df.head()

In [ ]:
# Column labels present in the loaded file.
df.columns

In [ ]:
# Data type pandas inferred for each column.
df.dtypes

In [ ]:
# Summary statistics (count, mean, std, quartiles, ...) for the frame's columns.
df.describe()

Cleaning the Data

In [ ]:
# Keep only the columns used in the rest of the analysis. The explicit copy
# detaches the result from the originally loaded frame.
df = df.loc[:, [
    'coaster_name',
    'Location',
    'Status',
    'Manufacturer',
    'year_introduced',
    'latitude',
    'longitude',
    'Type_Main',
    'opening_date_clean',
    'speed_mph',
    'height_ft',
    'Inversions_clean',
    'Gforce_clean',
]].copy()

In [ ]:
# Parse the opening date column into pandas datetimes.
df = df.assign(
    opening_date_clean=pd.to_datetime(df['opening_date_clean'])
)

In [ ]:
# Give the lower-case / "_clean" columns the same capitalised naming used by
# the columns that are kept as-is (Location, Status, ...).
df = df.rename(
    mapper={
        'coaster_name': 'Coaster_Name',
        'year_introduced': 'Year_Introduced',
        'opening_date_clean': 'Opening_Date',
        'speed_mph': 'Speed_mph',
        'height_ft': 'Height_ft',
        'Inversions_clean': 'Inversions',
        'Gforce_clean': 'Gforce',
    },
    axis='columns',
)

In [ ]:
# Number of missing values in each column.
df.isnull().sum()

In [ ]:
# Rows that repeat an earlier row in every column.
df[df.duplicated()]

In [ ]:
# First five rows whose Coaster_Name already appeared in an earlier row.
df[df.duplicated(subset='Coaster_Name')].head()

In [ ]:
# Show every row for a single coaster name.
# NOTE(review): presumably one of the repeated names surfaced by the duplicate
# check on Coaster_Name — confirm against that cell's output.
df.query('Coaster_Name == "Crystal Beach Cyclone"')

In [ ]:
# Column labels of the working frame at this point in the cleaning stage.
df.columns

In [ ]:
# Keep the first row for each (Coaster_Name, Location, Opening_Date)
# combination and renumber the index from zero.
df = (
    df.drop_duplicates(subset=['Coaster_Name', 'Location', 'Opening_Date'])
    .reset_index(drop=True)
)

Univariate Analysis

In [ ]:
# Number of rows per introduction year, most frequent year first.
df['Year_Introduced'].value_counts()

In [ ]:
# Bar chart of the ten most frequent introduction years. value_counts() sorts
# by count in descending order, so head(10) keeps the ten most common years.
ax = (
    df['Year_Introduced']
    .value_counts()
    .head(10)
    .plot(kind='bar', title='Top 10 Years Coasters Introduced')
)
ax.set_xlabel('Year Introduced')
ax.set_ylabel('Count')
# Render the figure explicitly so the cell does not echo the Text repr
# returned by set_ylabel (matches the later plotting cells).
plt.show()

In [ ]:
# Histogram of coaster speed using 20 bins.
ax = df['Speed_mph'].plot(
    kind='hist',
    bins=20,
    title='Coaster Speed (mph)',
)
ax.set_xlabel('Speed (mph)')
# Render the figure explicitly so the cell does not echo the Text repr
# returned by set_xlabel (matches the later plotting cells).
plt.show()

In [ ]:
# Kernel density estimate of coaster speed: a smoothed view of the same
# Speed_mph column.
ax = df['Speed_mph'].plot(
    kind='kde',
    title='Coaster Speed (mph)',
)
ax.set_xlabel('Speed (mph)')
# Render the figure explicitly so the cell does not echo the Text repr
# returned by set_xlabel (matches the later plotting cells).
plt.show()

In [ ]:
# Number of rows per value of Type_Main, most frequent value first.
df['Type_Main'].value_counts()

Getting insight from relationships between variables

In [ ]:
# Scatter plot of speed (x axis) against height (y axis) using the pandas
# plotting accessor.
df.plot.scatter(
    x='Speed_mph',
    y='Height_ft',
    title='Coaster Speed vs. Height',
)
plt.show()

In [ ]:
# Same speed/height scatter drawn with seaborn, with points coloured by the
# year the coaster was introduced.
ax = sns.scatterplot(
    data=df,
    x='Speed_mph',
    y='Height_ft',
    hue='Year_Introduced',
)
ax.set_title('Coaster Speed vs. Height')
plt.show()

In [ ]:
# Pairwise scatter/distribution grid over the numeric features, coloured by
# the coaster's main type.
sns.pairplot(
    data=df,
    hue='Type_Main',
    vars=[
        'Year_Introduced',
        'Speed_mph',
        'Height_ft',
        'Inversions',
        'Gforce',
    ],
)
plt.show()

In [ ]:
# Correlation matrix of the numeric features. dropna() runs first, so only
# rows with a value in all five columns contribute to the correlations.
df_corr = (
    df.loc[:, ['Year_Introduced', 'Speed_mph',
               'Height_ft', 'Inversions', 'Gforce']]
    .dropna()
    .corr()
)
df_corr

In [ ]:
# Heatmap of the correlation matrix with each coefficient written in its cell.
ax = sns.heatmap(df_corr, annot=True)
# Title the figure so it stands on its own, and render it explicitly so the
# cell does not echo the Axes repr (matches the other plotting cells).
ax.set_title('Correlation Between Coaster Features')
plt.show()

Asking questions

Which locations have the fastest roller coasters on average (considering only locations with at least 10 coasters)?

In [ ]:
# Average coaster speed per location, drawn as a horizontal bar chart.
# - Rows whose Location is the catch-all value "Other" are excluded.
# - 'count' is the number of non-null Speed_mph values in each location; only
#   locations with at least 10 of them are kept.
# - Sorting ascending by mean puts the fastest location at the top of the
#   horizontal bar chart.
ax = (
    df.query('Location != "Other"')
    .groupby('Location')['Speed_mph']
    .agg(['mean', 'count'])
    .query('count >= 10')
    .sort_values('mean')['mean']
    .plot(kind='barh', figsize=(12, 5),
          title='Average Coaster Speed by Location')  # was misspelled "Coast"
)
ax.set_xlabel('Average Coaster Speed')
plt.show()