In [1]:
import pandas as pd
import plotly.express as px


In [2]:
# Read the passenger table; the file's first column becomes the row index
titanic = pd.read_csv(
    '../data/titanic.csv',
    index_col=0,
)
# Preview the first five rows
titanic.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Summarise each column's dtype and non-null count to spot columns with missing values
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


Fare vs. Age Scatter Plot: A scatter plot illustrates the relationship between the fare paid and the age of the passengers, with points colored by survival status.

In [4]:
# Create a scatter plot for Age vs. Fare, coloured by survival outcome.
# `Survived` is stored as an integer (0/1). Passed to Plotly Express as a number
# it is treated as a continuous quantity and drawn with a gradient colour bar,
# which is misleading for a two-valued outcome. Casting it to strings on a
# temporary copy yields one discrete colour and legend entry per outcome and
# leaves `titanic` itself unchanged.
scatter_data = titanic.assign(Survived=titanic['Survived'].astype(str))

fig = px.scatter(scatter_data, x='Age', y='Fare', color='Survived',
                 title='Age vs. Fare Paid Scatter Plot',
                 labels={'Fare': 'Fare ($)', 'Age': 'Age (years)'},
                 hover_data=['Name', 'Pclass', 'Sex'],
                 # Fix the legend order so 0 is always listed before 1
                 category_orders={'Survived': ['0', '1']})

# Show the plot
fig.show()

Fare Paid Distribution:
A histogram or boxplot could represent the distribution of fares paid by passengers, which can also be segmented by class or survival status.

In [18]:
# Overlaid, semi-transparent fare histograms: one trace per passenger class
fig = px.histogram(
    titanic,
    x='Fare',
    color='Pclass',
    nbins=30,           # raise or lower for finer / coarser bins
    barmode='overlay',  # draw the classes on top of each other for comparison
    opacity=0.75,       # keep overlapping bars visible
    labels={'Fare': 'Fare ($)', 'Pclass': 'Passenger Class'},
    title='Fare Distribution by Passenger Class',
)

# Render the figure
fig.show()

Create a boxplot to visualize the distribution of fares, segmented by survival status

In [19]:

# Box plot of fares, one box per survival outcome
fig = px.box(
    titanic,
    x='Survived',
    y='Fare',
    color='Survived',  # separate colour for each survival outcome
    notched=True,      # notches mark a confidence interval around the median
    labels={'Fare': 'Fare ($)', 'Survived': 'Survival Status'},
    title='Fare Distribution by Survival Status',
)

# Render the figure
fig.show()

Visualize the survival rate by passenger class (Pclass) using Plotly

In [5]:
# Calculate the survival rate by Pclass (the mean of the 0/1 `Survived` flag)
survival_rate = titanic.groupby('Pclass')['Survived'].mean().reset_index()

# Create a bar chart
fig = px.bar(survival_rate, x='Pclass', y='Survived',
             text='Survived',
             title='Survival Rate by Passenger Class',
             labels={'Survived': 'Survival Rate', 'Pclass': 'Passenger Class'},
             category_orders={"Pclass": [1, 2, 3]}) # Ensuring the passenger classes are in order

# Format the bar labels as percentages; without this each bar shows the raw,
# unrounded mean, which does not match the percentage axis
fig.update_traces(texttemplate='%{y:.1%}')

# Update the y-axis to show percentages
fig.update_yaxes(tickformat=".1%")

# Show the plot
fig.show()


Survival Rate by Gender:
A bar chart could show the difference in survival rates between males and females.

In [11]:
# Calculate the survival rate by gender (the mean of the 0/1 `Survived` flag)
survival_rate_gender = titanic.groupby('Sex')['Survived'].mean().reset_index()

# Create a bar chart
fig = px.bar(survival_rate_gender, x='Sex', y='Survived',
             text='Survived',
             title='Survival Rate by Gender',
             labels={'Survived': 'Survival Rate', 'Sex': 'Gender'})

# Format the bar labels as percentages; without this each bar shows the raw,
# unrounded mean, which does not match the percentage axis
fig.update_traces(texttemplate='%{y:.1%}')

# Update the y-axis to show percentages
fig.update_yaxes(tickformat=".1%")

# Show the plot
fig.show()


Survival Rate by Age Group:
A bar chart could compare the survival rates of different age groups (children, teens, adults, seniors).

In [12]:
# Define age groups. Because `right=False` is passed to `pd.cut`, every bin is
# closed on the left and open on the right, so the edges below mean:
#   [0, 12) Child, [12, 18) Teen, [18, 60) Adult, [60, 120) Senior
# i.e. a 12-year-old is a Teen, an 18-year-old an Adult, a 60-year-old a Senior.
bins = [0, 12, 18, 60, 120]
labels = ['Child', 'Teen', 'Adult', 'Senior']

# Attach the group via `assign`, which returns a new frame instead of writing
# into the existing one, so the cell can be re-run safely. Passengers with a
# missing age get a missing AgeGroup.
titanic = titanic.assign(
    AgeGroup=pd.cut(titanic['Age'], bins=bins, labels=labels, right=False)
)

# Calculate the survival rate for each age group. `observed=False` is spelled
# out so that all four categories are kept and the result does not depend on
# the changing pandas default for categorical groupers. Rows with a missing
# AgeGroup are left out of the groups.
survival_rate_age_group = (
    titanic.groupby('AgeGroup', observed=False)['Survived'].mean().reset_index()
)

# Create a bar chart
fig = px.bar(survival_rate_age_group, x='AgeGroup', y='Survived',
             text='Survived',
             title='Survival Rate by Age Group',
             labels={'Survived': 'Survival Rate', 'AgeGroup': 'Age Group'})

# Format the bar labels as percentages; without this each bar shows the raw,
# unrounded mean, which does not match the percentage axis
fig.update_traces(texttemplate='%{y:.1%}')

# Update the y-axis to show percentages
fig.update_yaxes(tickformat=".1%")

# Show the plot
fig.show()

Creating a histogram to visualize the age distribution of passengers aboard the Titanic

In [6]:
# Keep only the passengers whose age is recorded, under a separate name.
# Rebinding `titanic` here would silently remove the age-less passengers from
# every cell that runs afterwards (for example the embarkation counts) and
# would make the results depend on the order in which cells are executed.
titanic_age_known = titanic.dropna(subset=['Age'])


In [7]:
# Create a histogram to visualize the age distribution.
# Rows without a recorded age are dropped right here, so this cell produces the
# same figure whether or not an earlier cell has already filtered `titanic`.
fig = px.histogram(titanic.dropna(subset=['Age']), x='Age',
                   title='Age Distribution of Passengers',
                   labels={'Age': 'Age (years)'},
                   nbins=30, # Adjust this value based on your data distribution and preference
                   marginal='box', # Adds a boxplot to the top of the histogram
                   )

# Show the plot
fig.show()

Embarkation Point Distribution: A pie chart to show the proportion of passengers embarking from different points.

In [8]:
# Tally passengers per embarkation code and give the two columns fixed names
embark_counts = (
    titanic['Embarked']
    .value_counts()
    .reset_index()
)
embark_counts.columns = ['Embarked', 'Count']

# Pie chart: one slice per embarkation code, sized by passenger count
fig = px.pie(
    embark_counts,
    names='Embarked',
    values='Count',
    labels={'Embarked': 'Embarkation Point', 'Count': 'Number of Passengers'},
    color_discrete_sequence=px.colors.sequential.RdBu,
    title='Embarkation Point Distribution',
)

# Render the figure
fig.show()

In [9]:
# Fill missing embarkation points with 'Unknown'.
# The replacement is done with a column-scoped, non-in-place `fillna` and the
# result is bound back to `titanic`. Calling `fillna(..., inplace=True)` on the
# selected column (`titanic['Embarked']`) is a chained in-place update: recent
# pandas versions warn about it, and with Copy-on-Write enabled it no longer
# changes the original frame.
titanic = titanic.fillna({'Embarked': 'Unknown'})


In [10]:
# Recount passengers per embarkation code and give the two columns fixed names
embark_counts = (
    titanic['Embarked']
    .value_counts()
    .reset_index()
)
embark_counts.columns = ['Embarked', 'Count']

# Pie chart: one slice per embarkation code, sized by passenger count
fig = px.pie(
    embark_counts,
    names='Embarked',
    values='Count',
    labels={'Embarked': 'Embarkation Point', 'Count': 'Number of Passengers'},
    color_discrete_sequence=px.colors.sequential.RdBu,
    title='Embarkation Point Distribution',
)

# Render the figure
fig.show()