In [7]:
import pandas as pd
import plotly.express as px

# Read the passenger survey from disk
data = pd.read_csv('Data/Airline_Passenger_Satisfaction.csv')

# Tally how many passengers gave each satisfaction answer
satisfaction_counts = data['satisfaction'].value_counts()

# Turn the tally into a two-column frame (answer, count) for Plotly Express
df_satisfaction = (
    satisfaction_counts
    .rename_axis('Satisfaction')
    .reset_index(name='Counts')
)

# Pie chart: one slice per satisfaction answer, sized by its count
fig = px.pie(
    data_frame=df_satisfaction,
    names='Satisfaction',
    values='Counts',
    title='Customer Satisfaction Levels',
    hover_data={'Counts': True},  # Shows count on hover
    color_discrete_sequence=px.colors.sequential.RdBu,
)

# Label slices inside the pie and show answer, count and percentage on hover
fig.update_traces(
    hovertemplate='Satisfaction: %{label}<br>Count: %{value}<br>Percentage: %{percent}',
    textinfo='percent+label',
    textposition='inside',
)

# Render the figure
fig.show()


In [8]:
import pandas as pd

# Read the passenger survey from disk
data = pd.read_csv('Data/Airline_Passenger_Satisfaction.csv')

# Most frequent satisfaction answer; mode() returns a Series, so ties would all be listed
mode_satisfaction = data['satisfaction'].mode()

# Number of passengers per satisfaction answer
frequency_distribution = data['satisfaction'].value_counts()

# Report each result under its own heading
print("Mode of Satisfaction:", mode_satisfaction, sep="\n")
print("\nFrequency Distribution of Satisfaction:", frequency_distribution, sep="\n")


Mode of Satisfaction:
0    neutral or dissatisfied
Name: satisfaction, dtype: object

Frequency Distribution of Satisfaction:
satisfaction
neutral or dissatisfied    14573
satisfied                  11403
Name: count, dtype: int64


In [10]:
# Percentage of each satisfaction answer within every flight class
# (normalize='index' scales each row to 1 before the conversion to percent)
class_satisfaction = pd.crosstab(
    index=data['Class'],
    columns=data['satisfaction'],
    normalize='index',
).mul(100)

print("\nSatisfaction by Flight Class (Percentage):", class_satisfaction, sep="\n")



Satisfaction by Flight Class (Percentage):
satisfaction  neutral or dissatisfied  satisfied
Class                                           
Business                    30.484194  69.515806
Eco                         80.612245  19.387755
Eco Plus                    75.221701  24.778299


In [11]:
# Percentage of each satisfaction answer within every type of travel
# (normalize='index' scales each row to 1 before the conversion to percent)
travel_type_satisfaction = pd.crosstab(
    index=data['Type of Travel'],
    columns=data['satisfaction'],
    normalize='index',
).mul(100)

print("\nSatisfaction by Type of Travel (Percentage):", travel_type_satisfaction, sep="\n")



Satisfaction by Type of Travel (Percentage):
satisfaction     neutral or dissatisfied  satisfied
Type of Travel                                     
Business travel                41.179732  58.820268
Personal Travel                90.010078   9.989922


In [12]:
# Bin ages into groups. Intervals are right-closed: [0, 18], (18, 35], (35, 50],
# (50, 65] and (65, inf). include_lowest=True keeps age 0 inside '0-18', and the
# open-ended top edge keeps ages above 100 inside '65+'; with the previous
# (0, 100] range those passengers became NaN and silently dropped out of the
# crosstab below. Results for ages in (0, 100] are unchanged.
data['Age Group'] = pd.cut(
    data['Age'],
    bins=[0, 18, 35, 50, 65, float('inf')],
    labels=['0-18', '19-35', '36-50', '51-65', '65+'],
    include_lowest=True,
)

# Satisfaction by age group, as row percentages
age_group_satisfaction = pd.crosstab(data['Age Group'], data['satisfaction'], normalize='index') * 100

print("\nSatisfaction by Age Group (Percentage):")
print(age_group_satisfaction)



Satisfaction by Age Group (Percentage):
satisfaction  neutral or dissatisfied  satisfied
Age Group                                       
0-18                        80.981027  19.018973
19-35                       62.575403  37.424597
36-50                       46.123373  53.876627
51-65                       48.539017  51.460983
65+                         80.943215  19.056785


In [13]:
# Split passengers into the two satisfaction groups
satisfied_customers = data.loc[data['satisfaction'].eq('satisfied')]
dissatisfied_customers = data.loc[data['satisfaction'].eq('neutral or dissatisfied')]

# Mean flight distance per group
print(
    "\nAverage Flight Distance for Satisfied Customers:",
    satisfied_customers['Flight Distance'].mean(),
)
print(
    "Average Flight Distance for Dissatisfied Customers:",
    dissatisfied_customers['Flight Distance'].mean(),
)

# Mean departure delay per group
print(
    "\nAverage Departure Delay for Satisfied Customers:",
    satisfied_customers['Departure Delay in Minutes'].mean(),
)
print(
    "Average Departure Delay for Dissatisfied Customers:",
    dissatisfied_customers['Departure Delay in Minutes'].mean(),
)

# Mean arrival delay per group (mean() skips missing values by default)
print(
    "Average Arrival Delay for Satisfied Customers:",
    satisfied_customers['Arrival Delay in Minutes'].mean(),
)
print(
    "Average Arrival Delay for Dissatisfied Customers:",
    dissatisfied_customers['Arrival Delay in Minutes'].mean(),
)



Average Flight Distance for Satisfied Customers: 1527.1657458563536
Average Flight Distance for Dissatisfied Customers: 932.929252727647

Average Departure Delay for Satisfied Customers: 12.12163465754626
Average Departure Delay for Dissatisfied Customers: 16.01537089137446
Average Arrival Delay for Satisfied Customers: 12.150901891772987
Average Arrival Delay for Dissatisfied Customers: 16.76693281938326


In [14]:
import pandas as pd

# Read the passenger survey from disk
data = pd.read_csv('Data/Airline_Passenger_Satisfaction.csv')

# One frame per satisfaction answer, selected with a boolean mask
satisfied = data.loc[data['satisfaction'].eq('satisfied')]
dissatisfied = data.loc[data['satisfaction'].eq('neutral or dissatisfied')]

# Function to calculate statistics
def calculate_statistics(data, column_name):
    """Return (mean, median, mode, range) for one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame (or any mapping of labels to pandas Series) holding the column.
    column_name : str
        Label of the column to summarise.

    Returns
    -------
    tuple
        ``(mean, median, mode, range)`` where range is ``max - min``. When
        several values tie for the mode, the first one returned by
        ``Series.mode()`` is used. If the column has no non-missing values,
        every element is NaN.
    """
    column = data[column_name]  # look the column up once instead of five times

    # mode() returns an empty Series when there are no non-missing values;
    # indexing it with [0] would raise, so fall back to NaN like the other stats.
    modes = column.mode()
    mode_val = modes.iloc[0] if not modes.empty else float('nan')

    mean_val = column.mean()
    median_val = column.median()
    range_val = column.max() - column.min()
    return mean_val, median_val, mode_val, range_val

# Summaries for 'Flight Distance'
stats_flight_distance_satisfied = calculate_statistics(satisfied, 'Flight Distance')
stats_flight_distance_dissatisfied = calculate_statistics(dissatisfied, 'Flight Distance')

# Summaries for 'Departure Delay in Minutes'
stats_departure_delay_satisfied = calculate_statistics(satisfied, 'Departure Delay in Minutes')
stats_departure_delay_dissatisfied = calculate_statistics(dissatisfied, 'Departure Delay in Minutes')

# Summaries for 'Arrival Delay in Minutes'
stats_arrival_delay_satisfied = calculate_statistics(satisfied, 'Arrival Delay in Minutes')
stats_arrival_delay_dissatisfied = calculate_statistics(dissatisfied, 'Arrival Delay in Minutes')

# Report each metric as a heading followed by one line per group
print(
    "Flight Distance Statistics (Satisfied vs Dissatisfied):",
    "Satisfied: Mean = {}, Median = {}, Mode = {}, Range = {}".format(*stats_flight_distance_satisfied),
    "Dissatisfied: Mean = {}, Median = {}, Mode = {}, Range = {}".format(*stats_flight_distance_dissatisfied),
    sep="\n",
)

print(
    "\nDeparture Delay Statistics (Satisfied vs Dissatisfied):",
    "Satisfied: Mean = {}, Median = {}, Mode = {}, Range = {}".format(*stats_departure_delay_satisfied),
    "Dissatisfied: Mean = {}, Median = {}, Mode = {}, Range = {}".format(*stats_departure_delay_dissatisfied),
    sep="\n",
)

print(
    "\nArrival Delay Statistics (Satisfied vs Dissatisfied):",
    "Satisfied: Mean = {}, Median = {}, Mode = {}, Range = {}".format(*stats_arrival_delay_satisfied),
    "Dissatisfied: Mean = {}, Median = {}, Mode = {}, Range = {}".format(*stats_arrival_delay_dissatisfied),
    sep="\n",
)


Flight Distance Statistics (Satisfied vs Dissatisfied):
Satisfied: Mean = 1527.1657458563536, Median = 1237.0, Mode = 337, Range = 4952
Dissatisfied: Mean = 932.929252727647, Median = 679.0, Mode = 337, Range = 4952

Departure Delay Statistics (Satisfied vs Dissatisfied):
Satisfied: Mean = 12.12163465754626, Median = 0.0, Mode = 0, Range = 624
Dissatisfied: Mean = 16.01537089137446, Median = 0.0, Mode = 0, Range = 1128

Arrival Delay Statistics (Satisfied vs Dissatisfied):
Satisfied: Mean = 12.150901891772987, Median = 0.0, Mode = 0.0, Range = 615.0
Dissatisfied: Mean = 16.76693281938326, Median = 0.0, Mode = 0.0, Range = 1115.0


In [15]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Read the passenger survey from disk
data = pd.read_csv('Data/Airline_Passenger_Satisfaction.csv')

# One frame per satisfaction answer, selected with a boolean mask
satisfied = data.loc[data['satisfaction'].eq('satisfied')]
dissatisfied = data.loc[data['satisfaction'].eq('neutral or dissatisfied')]

# Function to calculate statistics
def calculate_statistics(data, column_name):
    """Return (mean, median, mode, range) for one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame (or any mapping of labels to pandas Series) holding the column.
    column_name : str
        Label of the column to summarise.

    Returns
    -------
    tuple
        ``(mean, median, mode, range)`` where range is ``max - min``. When
        several values tie for the mode, the first one returned by
        ``Series.mode()`` is used. If the column has no non-missing values,
        every element is NaN.
    """
    column = data[column_name]  # look the column up once instead of five times

    # mode() returns an empty Series when there are no non-missing values;
    # indexing it with [0] would raise, so fall back to NaN like the other stats.
    modes = column.mode()
    mode_val = modes.iloc[0] if not modes.empty else float('nan')

    mean_val = column.mean()
    median_val = column.median()
    range_val = column.max() - column.min()
    return mean_val, median_val, mode_val, range_val

# Statistics and visualizations
def display_statistics_and_plots(column_name):
    """Print per-group summary statistics for a column and show its histograms.

    Reads the module-level ``satisfied`` and ``dissatisfied`` frames. Prints the
    mean, median, mode and range of ``column_name`` for each group (two
    decimals), then shows a Plotly figure with one histogram trace per group.

    Parameters
    ----------
    column_name : str
        Column to summarise and plot; also used in the printed heading, the
        chart title and the x-axis label.
    """
    # Compute both summaries up front so nothing is printed if either one fails
    stats_satisfied = calculate_statistics(satisfied, column_name)
    stats_dissatisfied = calculate_statistics(dissatisfied, column_name)

    # Heading followed by one formatted line per group
    print(
        f"{column_name} Statistics (Satisfied vs Dissatisfied):",
        "Satisfied: Mean = {:.2f}, Median = {:.2f}, Mode = {:.2f}, Range = {:.2f}".format(*stats_satisfied),
        "Dissatisfied: Mean = {:.2f}, Median = {:.2f}, Mode = {:.2f}, Range = {:.2f}".format(*stats_dissatisfied),
        sep="\n",
    )

    # One histogram trace per group, added in the same order as the printed lines
    fig = go.Figure()
    for group_name, group_frame in (('Satisfied', satisfied), ('Dissatisfied', dissatisfied)):
        fig.add_trace(go.Histogram(x=group_frame[column_name], name=group_name, opacity=0.75))

    # Titles, axis labels and bar spacing
    fig.update_layout(
        title_text=f'Distribution of {column_name} - Satisfied vs Dissatisfied',
        xaxis_title_text=column_name,
        yaxis_title_text='Count',
        bargap=0.2,  # gap between bars of adjacent location coordinates
        bargroupgap=0.1,  # gap between bars of the same location coordinate
    )

    # Render the figure
    fig.show()

# Display statistics and plots for each variable
# Same report for each metric, in the original order
for _metric_column in (
    'Flight Distance',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
):
    display_statistics_and_plots(_metric_column)


Flight Distance Statistics (Satisfied vs Dissatisfied):
Satisfied: Mean = 1527.17, Median = 1237.00, Mode = 337.00, Range = 4952.00
Dissatisfied: Mean = 932.93, Median = 679.00, Mode = 337.00, Range = 4952.00


Departure Delay in Minutes Statistics (Satisfied vs Dissatisfied):
Satisfied: Mean = 12.12, Median = 0.00, Mode = 0.00, Range = 624.00
Dissatisfied: Mean = 16.02, Median = 0.00, Mode = 0.00, Range = 1128.00


Arrival Delay in Minutes Statistics (Satisfied vs Dissatisfied):
Satisfied: Mean = 12.15, Median = 0.00, Mode = 0.00, Range = 615.00
Dissatisfied: Mean = 16.77, Median = 0.00, Mode = 0.00, Range = 1115.00


In [17]:
import pandas as pd
import plotly.express as px


## NaN values are filled with the column median, a robust imputation choice ##

# Load the data
data = pd.read_csv('Data/Airline_Passenger_Satisfaction.csv')

# Fill missing 'Arrival Delay in Minutes' values with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on data[...]: that chained in-place form emits a
# FutureWarning on recent pandas 2.x and does not modify `data` under
# Copy-on-Write, which would leave NaNs in the column used for bubble sizes.
data['Arrival Delay in Minutes'] = data['Arrival Delay in Minutes'].fillna(
    data['Arrival Delay in Minutes'].median()
)

# Sample the data to make the bubble chart more manageable and clear.
# Capping n at the row count avoids a ValueError on files with fewer than
# 1000 rows; the fixed random_state keeps the sample reproducible.
sampled_data = data.sample(n=min(1000, len(data)), random_state=1)

# Bubble chart: distance on x, departure delay on y, bubble area from arrival
# delay, coloured by satisfaction answer
fig = px.scatter(
    data_frame=sampled_data,
    x='Flight Distance',
    y='Departure Delay in Minutes',
    color='satisfaction',
    size='Arrival Delay in Minutes',
    size_max=60,
    hover_name='satisfaction',
    labels={
        'Flight Distance': 'Flight Distance (miles)',
        'Departure Delay in Minutes': 'Departure Delay (min)',
        'Arrival Delay in Minutes': 'Bubble Size: Arrival Delay (min)',
    },
    title='Interactive Bubble Chart: Flight Distance vs Departure Delay by Arrival Delay',
)

# Set the axis and legend titles explicitly
fig.update_layout(
    legend_title="Customer Satisfaction",
    xaxis_title="Flight Distance (miles)",
    yaxis_title="Departure Delay (minutes)",
)

# Render the figure
fig.show()
